From 80b497213981512e9ba1a629bcd5e2c519d2e566 Mon Sep 17 00:00:00 2001
From: Emmanuel Hansen <emmausssss@gmail.com>
Date: Mon, 27 Feb 2023 21:11:55 +0000
Subject: [PATCH] Add Support for Post Processing Effects (#3616)

* Add Post Processing Effects

* fix events and shader issues

* fix gtk upscale slider value

* fix bgra games

* don't swap swizzle if already swapped

* restore opengl texture state after effects run

* addressed review

* use single pipeline for smaa and fsr

* call finish on all pipelines

* addressed review

* attempt fix file case

* attempt fixing file case

* fix filter level tick frequency

* adjust filter slider margins

* replace fxaa shaders with original shader

* addressed review
---
 Ryujinx.Ava/AppHost.cs                        |   28 +
 Ryujinx.Ava/Assets/Locales/en_US.json         |   10 +
 .../UI/ViewModels/SettingsViewModel.cs        |   32 +
 .../Views/Settings/SettingsGraphicsView.axaml |   78 +
 Ryujinx.Common/Configuration/AntiAliasing.cs  |   12 +
 Ryujinx.Common/Configuration/ScalingFilter.cs |    9 +
 Ryujinx.Graphics.GAL/AntiAliasing.cs          |   12 +
 Ryujinx.Graphics.GAL/IWindow.cs               |    4 +
 .../Multithreading/ThreadedWindow.cs          |    6 +
 Ryujinx.Graphics.GAL/UpscaleType.cs           |    9 +
 .../Effects/FsrScalingFilter.cs               |  177 +
 .../Effects/FxaaPostProcessingEffect.cs       |   81 +
 .../Effects/IPostProcessingEffect.cs          |   11 +
 .../Effects/IScalingFilter.cs                 |   18 +
 .../Effects/ShaderHelper.cs                   |   40 +
 .../Effects/Shaders/ffx_a.h                   | 2656 +++++++++++
 .../Effects/Shaders/ffx_fsr1.h                | 1199 +++++
 .../Effects/Shaders/fsr_scaling.glsl          |   88 +
 .../Effects/Shaders/fsr_sharpening.glsl       |   37 +
 .../Effects/Shaders/fxaa.glsl                 | 1174 +++++
 .../Effects/Shaders/smaa.hlsl                 | 1361 ++++++
 .../Effects/Shaders/smaa_blend.glsl           |   26 +
 .../Effects/Shaders/smaa_edge.glsl            |   24 +
 .../Effects/Shaders/smaa_neighbour.glsl       |   26 +
 .../Effects/SmaaPostProcessingEffect.cs       |  261 ++
 .../Effects/Textures/SmaaAreaTexture.bin      |  Bin 0 -> 179200 bytes
 .../Effects/Textures/SmaaSearchTexture.bin    |  Bin 0 -> 1024 bytes
 .../Ryujinx.Graphics.OpenGL.csproj            |   14 +
 Ryujinx.Graphics.OpenGL/Window.cs             |  215 +-
 .../DescriptorSetUpdater.cs                   |    7 +
 .../Effects/FsrScalingFilter.cs               |  208 +
 .../Effects/FxaaPostProcessingEffect.cs       |  127 +
 .../Effects/IPostProcessingEffect.cs          |   10 +
 .../Effects/IScalingFilter.cs                 |   20 +
 .../Effects/Shaders/FsrScaling.glsl           | 3945 +++++++++++++++++
 .../Effects/Shaders/FsrScaling.spv            |  Bin 0 -> 44672 bytes
 .../Effects/Shaders/FsrSharpening.glsl        | 3904 ++++++++++++++++
 .../Effects/Shaders/FsrSharpening.spv         |  Bin 0 -> 20472 bytes
 .../Effects/Shaders/Fxaa.glsl                 | 1177 +++++
 .../Effects/Shaders/Fxaa.spv                  |  Bin 0 -> 25012 bytes
 .../Effects/Shaders/SmaaBlend.glsl            | 1404 ++++++
 .../Effects/Shaders/SmaaBlend.spv             |  Bin 0 -> 33728 bytes
 .../Effects/Shaders/SmaaEdge.glsl             | 1402 ++++++
 .../Effects/Shaders/SmaaEdge.spv              |  Bin 0 -> 8464 bytes
 .../Effects/Shaders/SmaaNeighbour.glsl        | 1403 ++++++
 .../Effects/Shaders/SmaaNeighbour.spv         |  Bin 0 -> 8328 bytes
 .../Effects/SmaaConstants.cs                  |   15 +
 .../Effects/SmaaPostProcessingEffect.cs       |  314 ++
 .../Effects/Textures/SmaaAreaTexture.bin      |  Bin 0 -> 179200 bytes
 .../Effects/Textures/SmaaSearchTexture.bin    |  Bin 0 -> 1024 bytes
 Ryujinx.Graphics.Vulkan/NativeArray.cs        |    7 +-
 Ryujinx.Graphics.Vulkan/PipelineBase.cs       |   27 +
 .../Ryujinx.Graphics.Vulkan.csproj            |   11 +
 Ryujinx.Graphics.Vulkan/Window.cs             |  167 +-
 Ryujinx.Graphics.Vulkan/WindowBase.cs         |    3 +
 .../Configuration/ConfigurationFileFormat.cs  |   17 +-
 .../Configuration/ConfigurationState.cs       |   41 +
 Ryujinx/Ui/RendererWidgetBase.cs              |   28 +
 Ryujinx/Ui/Windows/SettingsWindow.cs          |   12 +
 Ryujinx/Ui/Windows/SettingsWindow.glade       |  123 +-
 60 files changed, 21954 insertions(+), 26 deletions(-)
 create mode 100644 Ryujinx.Common/Configuration/AntiAliasing.cs
 create mode 100644 Ryujinx.Common/Configuration/ScalingFilter.cs
 create mode 100644 Ryujinx.Graphics.GAL/AntiAliasing.cs
 create mode 100644 Ryujinx.Graphics.GAL/UpscaleType.cs
 create mode 100644 Ryujinx.Graphics.OpenGL/Effects/FsrScalingFilter.cs
 create mode 100644 Ryujinx.Graphics.OpenGL/Effects/FxaaPostProcessingEffect.cs
 create mode 100644 Ryujinx.Graphics.OpenGL/Effects/IPostProcessingEffect.cs
 create mode 100644 Ryujinx.Graphics.OpenGL/Effects/IScalingFilter.cs
 create mode 100644 Ryujinx.Graphics.OpenGL/Effects/ShaderHelper.cs
 create mode 100644 Ryujinx.Graphics.OpenGL/Effects/Shaders/ffx_a.h
 create mode 100644 Ryujinx.Graphics.OpenGL/Effects/Shaders/ffx_fsr1.h
 create mode 100644 Ryujinx.Graphics.OpenGL/Effects/Shaders/fsr_scaling.glsl
 create mode 100644 Ryujinx.Graphics.OpenGL/Effects/Shaders/fsr_sharpening.glsl
 create mode 100644 Ryujinx.Graphics.OpenGL/Effects/Shaders/fxaa.glsl
 create mode 100644 Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa.hlsl
 create mode 100644 Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_blend.glsl
 create mode 100644 Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_edge.glsl
 create mode 100644 Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_neighbour.glsl
 create mode 100644 Ryujinx.Graphics.OpenGL/Effects/SmaaPostProcessingEffect.cs
 create mode 100644 Ryujinx.Graphics.OpenGL/Effects/Textures/SmaaAreaTexture.bin
 create mode 100644 Ryujinx.Graphics.OpenGL/Effects/Textures/SmaaSearchTexture.bin
 create mode 100644 Ryujinx.Graphics.Vulkan/Effects/FsrScalingFilter.cs
 create mode 100644 Ryujinx.Graphics.Vulkan/Effects/FxaaPostProcessingEffect.cs
 create mode 100644 Ryujinx.Graphics.Vulkan/Effects/IPostProcessingEffect.cs
 create mode 100644 Ryujinx.Graphics.Vulkan/Effects/IScalingFilter.cs
 create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrScaling.glsl
 create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrScaling.spv
 create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrSharpening.glsl
 create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrSharpening.spv
 create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/Fxaa.glsl
 create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/Fxaa.spv
 create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaBlend.glsl
 create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaBlend.spv
 create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaEdge.glsl
 create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaEdge.spv
 create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaNeighbour.glsl
 create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaNeighbour.spv
 create mode 100644 Ryujinx.Graphics.Vulkan/Effects/SmaaConstants.cs
 create mode 100644 Ryujinx.Graphics.Vulkan/Effects/SmaaPostProcessingEffect.cs
 create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Textures/SmaaAreaTexture.bin
 create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Textures/SmaaSearchTexture.bin

diff --git a/Ryujinx.Ava/AppHost.cs b/Ryujinx.Ava/AppHost.cs
index 242c84e7fb..eb22b39e9b 100644
--- a/Ryujinx.Ava/AppHost.cs
+++ b/Ryujinx.Ava/AppHost.cs
@@ -171,6 +171,10 @@ namespace Ryujinx.Ava
             ConfigurationState.Instance.Graphics.AspectRatio.Event         += UpdateAspectRatioState;
             ConfigurationState.Instance.System.EnableDockedMode.Event      += UpdateDockedModeState;
             ConfigurationState.Instance.System.AudioVolume.Event           += UpdateAudioVolumeState;
+            // Post-processing settings are forwarded to the renderer window by these handlers.
+            ConfigurationState.Instance.Graphics.AntiAliasing.Event        += UpdateAntiAliasing;
+            ConfigurationState.Instance.Graphics.ScalingFilter.Event       += UpdateScalingFilter;
+            ConfigurationState.Instance.Graphics.ScalingFilterLevel.Event  += UpdateScalingFilterLevel;
 
             _gpuCancellationTokenSource = new CancellationTokenSource();
         }
@@ -193,6 +198,20 @@ namespace Ryujinx.Ava
                 }
             }
         }
+
+        // Re-sends the configured scaling filter and level when the level changes; guards _renderer like UpdateAntiAliasing.
+        private void UpdateScalingFilterLevel(object sender, ReactiveEventArgs<int> e)
+        {
+            _renderer?.Window?.SetScalingFilter((Graphics.GAL.ScalingFilter)ConfigurationState.Instance.Graphics.ScalingFilter.Value);
+            _renderer?.Window?.SetScalingFilterLevel(ConfigurationState.Instance.Graphics.ScalingFilterLevel.Value);
+        }
+
+        // Re-sends the configured scaling filter and level when the filter changes; guards _renderer like UpdateAntiAliasing.
+        private void UpdateScalingFilter(object sender, ReactiveEventArgs<Ryujinx.Common.Configuration.ScalingFilter> e)
+        {
+            _renderer?.Window?.SetScalingFilter((Graphics.GAL.ScalingFilter)ConfigurationState.Instance.Graphics.ScalingFilter.Value);
+            _renderer?.Window?.SetScalingFilterLevel(ConfigurationState.Instance.Graphics.ScalingFilterLevel.Value);
+        }
 
         private void ShowCursor()
         {
@@ -345,6 +361,11 @@ namespace Ryujinx.Ava
             }
         }
 
+        private void UpdateAntiAliasing(object sender, ReactiveEventArgs<Ryujinx.Common.Configuration.AntiAliasing> e)
+        { // Forwards the new anti-aliasing value to the renderer window; nothing happens while _renderer or its Window is null.
+            _renderer?.Window?.SetAntiAliasing((Graphics.GAL.AntiAliasing)e.NewValue);
+        }
+
         private void UpdateDockedModeState(object sender, ReactiveEventArgs<bool> e)
         {
             Device?.System.ChangeDockedModeState(e.NewValue);
@@ -411,6 +432,9 @@ namespace Ryujinx.Ava
             ConfigurationState.Instance.Graphics.AspectRatio.Event         -= UpdateAspectRatioState;
             ConfigurationState.Instance.System.EnableDockedMode.Event      -= UpdateDockedModeState;
             ConfigurationState.Instance.System.AudioVolume.Event           -= UpdateAudioVolumeState;
+            ConfigurationState.Instance.Graphics.ScalingFilter.Event       -= UpdateScalingFilter;
+            ConfigurationState.Instance.Graphics.ScalingFilterLevel.Event  -= UpdateScalingFilterLevel;
+            ConfigurationState.Instance.Graphics.AntiAliasing.Event        -= UpdateAntiAliasing;
 
             _topLevel.PointerMoved -= TopLevel_PointerMoved;
 
@@ -788,6 +812,10 @@ namespace Ryujinx.Ava
 
             Device.Gpu.Renderer.Initialize(_glLogLevel);
 
+            _renderer?.Window?.SetAntiAliasing((Graphics.GAL.AntiAliasing)ConfigurationState.Instance.Graphics.AntiAliasing.Value);
+            _renderer?.Window?.SetScalingFilter((Graphics.GAL.ScalingFilter)ConfigurationState.Instance.Graphics.ScalingFilter.Value);
+            _renderer?.Window?.SetScalingFilterLevel(ConfigurationState.Instance.Graphics.ScalingFilterLevel.Value);
+
             Width = (int)_rendererHost.Bounds.Width;
             Height = (int)_rendererHost.Bounds.Height;
 
diff --git a/Ryujinx.Ava/Assets/Locales/en_US.json b/Ryujinx.Ava/Assets/Locales/en_US.json
index b7d1e02bf7..db8d242418 100644
--- a/Ryujinx.Ava/Assets/Locales/en_US.json
+++ b/Ryujinx.Ava/Assets/Locales/en_US.json
@@ -626,6 +626,16 @@
   "Recover": "Recover",
   "UserProfilesRecoverHeading" : "Saves were found for the following accounts",
   "UserProfilesRecoverEmptyList": "No profiles to recover",
+  "GraphicsAATooltip": "Applies anti-aliasing to the game render",
+  "GraphicsAALabel": "Anti-Aliasing:",
+  "GraphicsScalingFilterLabel": "Scaling Filter:",
+  "GraphicsScalingFilterTooltip": "Enables Framebuffer Scaling",
+  "GraphicsScalingFilterLevelLabel": "Level",
+  "GraphicsScalingFilterLevelTooltip": "Set Scaling Filter Level",
+  "SmaaLow": "SMAA Low",
+  "SmaaMedium": "SMAA Medium",
+  "SmaaHigh": "SMAA High",
+  "SmaaUltra": "SMAA Ultra",
   "UserEditorTitle" : "Edit User",
   "UserEditorTitleCreate" : "Create User"
 }
diff --git a/Ryujinx.Ava/UI/ViewModels/SettingsViewModel.cs b/Ryujinx.Ava/UI/ViewModels/SettingsViewModel.cs
index 36b37b0f52..7045c9ed3a 100644
--- a/Ryujinx.Ava/UI/ViewModels/SettingsViewModel.cs
+++ b/Ryujinx.Ava/UI/ViewModels/SettingsViewModel.cs
@@ -45,6 +45,8 @@ namespace Ryujinx.Ava.UI.ViewModels
         private KeyboardHotkeys _keyboardHotkeys;
         private int _graphicsBackendIndex;
         private string _customThemePath;
+        private int _scalingFilter;
+        private int _scalingFilterLevel;
 
         public event Action CloseWindow;
         public event Action SaveSettingsEvent;
@@ -153,6 +155,8 @@ namespace Ryujinx.Ava.UI.ViewModels
         public bool IsSDL2Enabled { get; set; }
         public bool EnableCustomTheme { get; set; }
         public bool IsCustomResolutionScaleActive => _resolutionScale == 4;
+        public bool IsScalingFilterActive => _scalingFilter == (int)Ryujinx.Common.Configuration.ScalingFilter.Fsr;
+
         public bool IsVulkanSelected => GraphicsBackendIndex == 0;
         public bool UseHypervisor { get; set; }
 
@@ -179,6 +183,18 @@ namespace Ryujinx.Ava.UI.ViewModels
         public int AudioBackend { get; set; }
         public int MaxAnisotropy { get; set; }
         public int AspectRatio { get; set; }
+        public int AntiAliasingEffect { get; set; } // Combo box index; cast to the AntiAliasing configuration enum when settings are saved.
+        public string ScalingFilterLevelText => ScalingFilterLevel.ToString("0"); // Text shown next to the level slider.
+        public int ScalingFilterLevel // Backing value for the level slider; also refreshes ScalingFilterLevelText.
+        {
+            get => _scalingFilterLevel;
+            set
+            {
+                _scalingFilterLevel = value;
+                OnPropertyChanged();
+                OnPropertyChanged(nameof(ScalingFilterLevelText)); // The derived text depends on this value.
+            }
+        }
         public int OpenglDebugLevel { get; set; }
         public int MemoryMode { get; set; }
         public int BaseStyleIndex { get; set; }
@@ -192,6 +208,16 @@ namespace Ryujinx.Ava.UI.ViewModels
                 OnPropertyChanged(nameof(IsVulkanSelected));
             }
         }
+        public int ScalingFilter // Combo box index; cast to the ScalingFilter configuration enum when settings are saved.
+        {
+            get => _scalingFilter;
+            set
+            {
+                _scalingFilter = value;
+                OnPropertyChanged();
+                OnPropertyChanged(nameof(IsScalingFilterActive)); // The level slider's visibility is bound to this flag.
+            }
+        }
 
         public int PreferredGpuIndex { get; set; }
 
@@ -365,6 +391,9 @@ namespace Ryujinx.Ava.UI.ViewModels
             AspectRatio = (int)config.Graphics.AspectRatio.Value;
             GraphicsBackendMultithreadingIndex = (int)config.Graphics.BackendThreading.Value;
             ShaderDumpPath = config.Graphics.ShadersDumpPath;
+            AntiAliasingEffect = (int)config.Graphics.AntiAliasing.Value;
+            ScalingFilter = (int)config.Graphics.ScalingFilter.Value;
+            ScalingFilterLevel = config.Graphics.ScalingFilterLevel.Value;
 
             // Audio
             AudioBackend = (int)config.System.AudioBackend.Value;
@@ -447,6 +476,9 @@ namespace Ryujinx.Ava.UI.ViewModels
             config.Graphics.ResScaleCustom.Value = CustomResolutionScale;
             config.Graphics.MaxAnisotropy.Value = MaxAnisotropy == 0 ? -1 : MathF.Pow(2, MaxAnisotropy);
             config.Graphics.AspectRatio.Value = (AspectRatio)AspectRatio;
+            config.Graphics.AntiAliasing.Value = (AntiAliasing)AntiAliasingEffect;
+            config.Graphics.ScalingFilter.Value = (ScalingFilter)ScalingFilter;
+            config.Graphics.ScalingFilterLevel.Value = ScalingFilterLevel;
 
             if (ConfigurationState.Instance.Graphics.BackendThreading != (BackendThreading)GraphicsBackendMultithreadingIndex)
             {
diff --git a/Ryujinx.Ava/UI/Views/Settings/SettingsGraphicsView.axaml b/Ryujinx.Ava/UI/Views/Settings/SettingsGraphicsView.axaml
index fb30fb7f4b..8e4122f38d 100644
--- a/Ryujinx.Ava/UI/Views/Settings/SettingsGraphicsView.axaml
+++ b/Ryujinx.Ava/UI/Views/Settings/SettingsGraphicsView.axaml
@@ -7,6 +7,7 @@
     xmlns:ui="clr-namespace:FluentAvalonia.UI.Controls;assembly=FluentAvalonia"
     xmlns:locale="clr-namespace:Ryujinx.Ava.Common.Locale"
     xmlns:viewModels="clr-namespace:Ryujinx.Ava.UI.ViewModels"
+    Design.Width="1000"
     mc:Ignorable="d"
     x:CompileBindings="True"
     x:DataType="viewModels:SettingsViewModel">
@@ -111,6 +112,83 @@
                             Minimum="0.1"
                             Value="{Binding CustomResolutionScale}" />
                     </StackPanel>
+                    <StackPanel
+                        HorizontalAlignment="Stretch"
+                        Orientation="Vertical"
+                        Spacing="10">
+                        <StackPanel Orientation="Horizontal">
+                            <TextBlock VerticalAlignment="Center"
+                                       ToolTip.Tip="{locale:Locale GraphicsAATooltip}"
+                                       Text="{locale:Locale GraphicsAALabel}"
+                                       Width="250" />
+                            <ComboBox Width="350"
+                                      HorizontalContentAlignment="Left"
+                                      ToolTip.Tip="{locale:Locale GraphicsAATooltip}"
+                                      SelectedIndex="{Binding AntiAliasingEffect}">
+                                <ComboBoxItem>
+                                    <TextBlock Text="{locale:Locale SettingsTabLoggingGraphicsBackendLogLevelNone}" />
+                                </ComboBoxItem>
+                                <ComboBoxItem>
+                                    <TextBlock Text="FXAA" />
+                                </ComboBoxItem>
+                                <ComboBoxItem>
+                                    <TextBlock Text="{locale:Locale SmaaLow}" />
+                                </ComboBoxItem>
+                                <ComboBoxItem>
+                                    <TextBlock Text="{locale:Locale SmaaMedium}" />
+                                </ComboBoxItem>
+                                <ComboBoxItem>
+                                    <TextBlock Text="{locale:Locale SmaaHigh}" />
+                                </ComboBoxItem>
+                                <ComboBoxItem>
+                                    <TextBlock Text="{locale:Locale SmaaUltra}" />
+                                </ComboBoxItem>
+                            </ComboBox>
+                        </StackPanel>
+                    </StackPanel>
+                    <StackPanel
+                        HorizontalAlignment="Stretch"
+                        Orientation="Vertical"
+                        Spacing="10">
+                        <StackPanel Orientation="Horizontal">
+                            <TextBlock VerticalAlignment="Center"
+                                       ToolTip.Tip="{locale:Locale GraphicsScalingFilterTooltip}"
+                                       Text="{locale:Locale GraphicsScalingFilterLabel}"
+                                       Width="250" />
+                            <ComboBox Width="350"
+                                      HorizontalContentAlignment="Left"
+                                      ToolTip.Tip="{locale:Locale GraphicsScalingFilterTooltip}"
+                                      SelectedIndex="{Binding ScalingFilter}">
+                                <ComboBoxItem>
+                                    <TextBlock Text="Bilinear" />
+                                </ComboBoxItem>
+                                <ComboBoxItem>
+                                    <TextBlock Text="Nearest" />
+                                </ComboBoxItem>
+                                <ComboBoxItem>
+                                    <TextBlock Text="FSR" />
+                                </ComboBoxItem>
+                            </ComboBox>
+                            <Slider Value="{Binding ScalingFilterLevel}"
+                                    ToolTip.Tip="{locale:Locale GraphicsScalingFilterLevelTooltip}"
+                                    MinWidth="150"
+                                    Margin="10,-3,0,0"
+                                    Height="32"
+                                    Padding="0,-5"
+                                    IsVisible="{Binding IsScalingFilterActive}"
+                                    TickFrequency="1"
+                                    IsSnapToTickEnabled="True"
+                                    LargeChange="10"
+                                    SmallChange="1"
+                                    VerticalAlignment="Center"
+                                    Minimum="0"
+                                    Maximum="100" />
+                            <TextBlock Margin="5,0"
+                                       Width="40"
+                                       IsVisible="{Binding IsScalingFilterActive}"
+                                       Text="{Binding ScalingFilterLevelText}"/>
+                        </StackPanel>
+                    </StackPanel>
                     <StackPanel Orientation="Horizontal">
                         <TextBlock VerticalAlignment="Center"
                                    ToolTip.Tip="{locale:Locale AnisotropyTooltip}"
diff --git a/Ryujinx.Common/Configuration/AntiAliasing.cs b/Ryujinx.Common/Configuration/AntiAliasing.cs
new file mode 100644
index 0000000000..6543598c75
--- /dev/null
+++ b/Ryujinx.Common/Configuration/AntiAliasing.cs
@@ -0,0 +1,12 @@
+namespace Ryujinx.Common.Configuration
+{
+    public enum AntiAliasing // Cast by value to Ryujinx.Graphics.GAL.AntiAliasing and to/from the settings combo box index.
+    {
+        None = 0,
+        Fxaa = 1,
+        SmaaLow = 2,
+        SmaaMedium = 3,
+        SmaaHigh = 4,
+        SmaaUltra = 5
+    }
+}
diff --git a/Ryujinx.Common/Configuration/ScalingFilter.cs b/Ryujinx.Common/Configuration/ScalingFilter.cs
new file mode 100644
index 0000000000..2095b89b1a
--- /dev/null
+++ b/Ryujinx.Common/Configuration/ScalingFilter.cs
@@ -0,0 +1,9 @@
+namespace Ryujinx.Common.Configuration
+{
+    public enum ScalingFilter // Cast by value to Ryujinx.Graphics.GAL.ScalingFilter and to/from the settings combo box index.
+    {
+        Bilinear = 0,
+        Nearest = 1,
+        Fsr = 2
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.GAL/AntiAliasing.cs b/Ryujinx.Graphics.GAL/AntiAliasing.cs
new file mode 100644
index 0000000000..d4e5754d8e
--- /dev/null
+++ b/Ryujinx.Graphics.GAL/AntiAliasing.cs
@@ -0,0 +1,12 @@
+namespace Ryujinx.Graphics.GAL
+{
+    public enum AntiAliasing // Same members and values as Ryujinx.Common.Configuration.AntiAliasing, which is cast to this type.
+    {
+        None = 0,
+        Fxaa = 1,
+        SmaaLow = 2,
+        SmaaMedium = 3,
+        SmaaHigh = 4,
+        SmaaUltra = 5
+    }
+}
diff --git a/Ryujinx.Graphics.GAL/IWindow.cs b/Ryujinx.Graphics.GAL/IWindow.cs
index a9bbbc5e02..1221d685a8 100644
--- a/Ryujinx.Graphics.GAL/IWindow.cs
+++ b/Ryujinx.Graphics.GAL/IWindow.cs
@@ -9,5 +9,9 @@ namespace Ryujinx.Graphics.GAL
         void SetSize(int width, int height);
 
         void ChangeVSyncMode(bool vsyncEnabled);
+
+        void SetAntiAliasing(AntiAliasing antialiasing); // Sets the anti-aliasing mode requested by the frontend.
+        void SetScalingFilter(ScalingFilter type); // Sets the scaling filter: Bilinear, Nearest or Fsr.
+        void SetScalingFilterLevel(float level); // Sets the filter level; AppHost passes the configured ScalingFilterLevel value.
     }
 }
diff --git a/Ryujinx.Graphics.GAL/Multithreading/ThreadedWindow.cs b/Ryujinx.Graphics.GAL/Multithreading/ThreadedWindow.cs
index c4b62a25d3..a647d37eba 100644
--- a/Ryujinx.Graphics.GAL/Multithreading/ThreadedWindow.cs
+++ b/Ryujinx.Graphics.GAL/Multithreading/ThreadedWindow.cs
@@ -32,5 +32,11 @@ namespace Ryujinx.Graphics.GAL.Multithreading
         }
 
         public void ChangeVSyncMode(bool vsyncEnabled) { }
+
+        public void SetAntiAliasing(AntiAliasing effect) { } // No-op in this implementation, like ChangeVSyncMode above.
+
+        public void SetScalingFilter(ScalingFilter type) { } // No-op in this implementation.
+
+        public void SetScalingFilterLevel(float level) { } // No-op in this implementation.
     }
 }
diff --git a/Ryujinx.Graphics.GAL/UpscaleType.cs b/Ryujinx.Graphics.GAL/UpscaleType.cs
new file mode 100644
index 0000000000..442b65f242
--- /dev/null
+++ b/Ryujinx.Graphics.GAL/UpscaleType.cs
@@ -0,0 +1,9 @@
+namespace Ryujinx.Graphics.GAL
+{
+    public enum ScalingFilter // Mirrors Ryujinx.Common.Configuration.ScalingFilter. NOTE(review): this file is named UpscaleType.cs.
+    {
+        Bilinear = 0,
+        Nearest = 1,
+        Fsr = 2
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.OpenGL/Effects/FsrScalingFilter.cs b/Ryujinx.Graphics.OpenGL/Effects/FsrScalingFilter.cs
new file mode 100644
index 0000000000..16678bb7b9
--- /dev/null
+++ b/Ryujinx.Graphics.OpenGL/Effects/FsrScalingFilter.cs
@@ -0,0 +1,177 @@
+using OpenTK.Graphics.OpenGL;
+using Ryujinx.Common;
+using Ryujinx.Graphics.GAL;
+using Ryujinx.Graphics.OpenGL.Image;
+using System;
+using static Ryujinx.Graphics.OpenGL.Effects.ShaderHelper;
+
+namespace Ryujinx.Graphics.OpenGL.Effects
+{
+    internal class FsrScalingFilter : IScalingFilter // OpenGL FSR filter: a scaling compute pass followed by a sharpening compute pass.
+    {
+        private readonly OpenGLRenderer _renderer;
+        private int _inputUniform;
+        private int _outputUniform;
+        private int _sharpeningUniform;
+        private int _srcX0Uniform;
+        private int _srcX1Uniform;
+        private int _srcY0Uniform;
+        private int _scalingShaderProgram;
+        private int _sharpeningShaderProgram;
+        private float _scale = 1;
+        private int _srcY1Uniform;
+        private int _dstX0Uniform;
+        private int _dstX1Uniform;
+        private int _dstY0Uniform;
+        private int _dstY1Uniform;
+        private int _scaleXUniform;
+        private int _scaleYUniform;
+        private TextureStorage _intermediaryTexture;
+
+        public float Level // Filter level on the settings slider's 0-100 scale; Run converts it to the "sharpening" uniform.
+        {
+            get => _scale;
+            set
+            {
+                _scale = Math.Clamp(value, 0.01f, 100f); // Upper bound keeps 1.5f - Level * 0.015f (see Run) from going negative.
+            }
+        }
+
+        public FsrScalingFilter(OpenGLRenderer renderer, IPostProcessingEffect filter) // 'filter' is not used by this constructor.
+        {
+            Initialize();
+
+            _renderer = renderer;
+        }
+
+        public void Dispose() // Deletes both shader programs and the intermediate texture, if they were created.
+        {
+            if (_scalingShaderProgram != 0)
+            {
+                GL.DeleteProgram(_scalingShaderProgram);
+                GL.DeleteProgram(_sharpeningShaderProgram);
+            }
+
+            _intermediaryTexture?.Dispose();
+        }
+
+        private void Initialize() // Builds both compute programs from embedded sources and caches their uniform locations.
+        {
+            var scalingShader = EmbeddedResources.ReadAllText("Ryujinx.Graphics.OpenGL/Effects/Shaders/fsr_scaling.glsl");
+            var sharpeningShader = EmbeddedResources.ReadAllText("Ryujinx.Graphics.OpenGL/Effects/Shaders/fsr_sharpening.glsl");
+            var fsrA = EmbeddedResources.ReadAllText("Ryujinx.Graphics.OpenGL/Effects/Shaders/ffx_a.h");
+            var fsr1 = EmbeddedResources.ReadAllText("Ryujinx.Graphics.OpenGL/Effects/Shaders/ffx_fsr1.h");
+
+            scalingShader = scalingShader.Replace("#include \"ffx_a.h\"", fsrA); // Includes are expanded by text substitution.
+            scalingShader = scalingShader.Replace("#include \"ffx_fsr1.h\"", fsr1);
+            sharpeningShader = sharpeningShader.Replace("#include \"ffx_a.h\"", fsrA);
+            sharpeningShader = sharpeningShader.Replace("#include \"ffx_fsr1.h\"", fsr1);
+
+            _scalingShaderProgram = CompileProgram(scalingShader, ShaderType.ComputeShader);
+            _sharpeningShaderProgram = CompileProgram(sharpeningShader, ShaderType.ComputeShader);
+
+            _inputUniform = GL.GetUniformLocation(_scalingShaderProgram, "Source"); // Queried from the scaling program only.
+            _outputUniform = GL.GetUniformLocation(_scalingShaderProgram, "imgOutput"); // Queried from the scaling program only.
+            _sharpeningUniform = GL.GetUniformLocation(_sharpeningShaderProgram, "sharpening");
+
+            _srcX0Uniform = GL.GetUniformLocation(_scalingShaderProgram, "srcX0");
+            _srcX1Uniform = GL.GetUniformLocation(_scalingShaderProgram, "srcX1");
+            _srcY0Uniform = GL.GetUniformLocation(_scalingShaderProgram, "srcY0");
+            _srcY1Uniform = GL.GetUniformLocation(_scalingShaderProgram, "srcY1");
+            _dstX0Uniform = GL.GetUniformLocation(_scalingShaderProgram, "dstX0");
+            _dstX1Uniform = GL.GetUniformLocation(_scalingShaderProgram, "dstX1");
+            _dstY0Uniform = GL.GetUniformLocation(_scalingShaderProgram, "dstY0");
+            _dstY1Uniform = GL.GetUniformLocation(_scalingShaderProgram, "dstY1");
+            _scaleXUniform = GL.GetUniformLocation(_scalingShaderProgram, "scaleX");
+            _scaleYUniform = GL.GetUniformLocation(_scalingShaderProgram, "scaleY");
+        }
+
+        public void Run( // Scales 'view' into an intermediate texture, then sharpens that into 'destinationTexture'.
+            TextureView view,
+            TextureView destinationTexture,
+            int width,
+            int height,
+            Extents2D source,
+            Extents2D destination)
+        {
+            if (_intermediaryTexture == null || _intermediaryTexture.Info.Width != width || _intermediaryTexture.Info.Height != height)
+            { // (Re)create the intermediate texture whenever the requested output size changes.
+                _intermediaryTexture?.Dispose();
+                var originalInfo = view.Info;
+                var info = new TextureCreateInfo(width,
+                    height,
+                    originalInfo.Depth,
+                    originalInfo.Levels,
+                    originalInfo.Samples,
+                    originalInfo.BlockWidth,
+                    originalInfo.BlockHeight,
+                    originalInfo.BytesPerPixel,
+                    originalInfo.Format,
+                    originalInfo.DepthStencilMode,
+                    originalInfo.Target,
+                    originalInfo.SwizzleR,
+                    originalInfo.SwizzleG,
+                    originalInfo.SwizzleB,
+                    originalInfo.SwizzleA);
+
+                _intermediaryTexture = new TextureStorage(_renderer, info, view.ScaleFactor);
+                _intermediaryTexture.CreateDefaultView();
+            }
+
+            var textureView = _intermediaryTexture.CreateView(_intermediaryTexture.Info, 0, 0) as TextureView; // NOTE(review): created on every call and not disposed here - confirm view lifetime.
+
+            int previousProgram = GL.GetInteger(GetPName.CurrentProgram); // Saved so the caller's GL state can be restored below.
+            int previousUnit = GL.GetInteger(GetPName.ActiveTexture);
+            GL.ActiveTexture(TextureUnit.Texture0);
+            int previousTextureBinding = GL.GetInteger(GetPName.TextureBinding2D);
+
+            GL.BindImageTexture(0, textureView.Handle, 0, false, 0, TextureAccess.ReadWrite, SizedInternalFormat.Rgba8); // Scaling pass output.
+
+            int threadGroupWorkRegionDim = 16;
+            int dispatchX = (width + (threadGroupWorkRegionDim - 1)) / threadGroupWorkRegionDim; // ceil(width / 16)
+            int dispatchY = (height + (threadGroupWorkRegionDim - 1)) / threadGroupWorkRegionDim; // ceil(height / 16)
+
+            // Scaling pass
+            float srcWidth = Math.Abs(source.X2 - source.X1);
+            float srcHeight = Math.Abs(source.Y2 - source.Y1);
+            float scaleX = srcWidth / view.Width;
+            float scaleY = srcHeight / view.Height;
+            GL.UseProgram(_scalingShaderProgram);
+            view.Bind(0);
+            GL.Uniform1(_inputUniform, 0);
+            GL.Uniform1(_outputUniform, 0);
+            GL.Uniform1(_srcX0Uniform, (float)source.X1);
+            GL.Uniform1(_srcX1Uniform, (float)source.X2);
+            GL.Uniform1(_srcY0Uniform, (float)source.Y1);
+            GL.Uniform1(_srcY1Uniform, (float)source.Y2);
+            GL.Uniform1(_dstX0Uniform, (float)destination.X1);
+            GL.Uniform1(_dstX1Uniform, (float)destination.X2);
+            GL.Uniform1(_dstY0Uniform, (float)destination.Y1);
+            GL.Uniform1(_dstY1Uniform, (float)destination.Y2);
+            GL.Uniform1(_scaleXUniform, scaleX);
+            GL.Uniform1(_scaleYUniform, scaleY);
+            GL.DispatchCompute(dispatchX, dispatchY, 1);
+
+            GL.MemoryBarrier(MemoryBarrierFlags.ShaderImageAccessBarrierBit);
+
+            // Sharpening Pass
+            GL.UseProgram(_sharpeningShaderProgram);
+            GL.BindImageTexture(0, destinationTexture.Handle, 0, false, 0, TextureAccess.ReadWrite, SizedInternalFormat.Rgba8);
+            textureView.Bind(0);
+            GL.Uniform1(_inputUniform, 0); // NOTE(review): location was queried from the scaling program - confirm both shaders share it.
+            GL.Uniform1(_outputUniform, 0); // NOTE(review): same assumption as the line above.
+            GL.Uniform1(_sharpeningUniform, 1.5f - (Level * 0.01f * 1.5f)); // Level 100 -> 0.0, Level near 0 -> about 1.5.
+            GL.DispatchCompute(dispatchX, dispatchY, 1);
+
+            GL.UseProgram(previousProgram);
+            GL.MemoryBarrier(MemoryBarrierFlags.ShaderImageAccessBarrierBit);
+
+            (_renderer.Pipeline as Pipeline).RestoreImages1And2();
+
+            GL.ActiveTexture(TextureUnit.Texture0);
+            GL.BindTexture(TextureTarget.Texture2D, previousTextureBinding);
+
+            GL.ActiveTexture((TextureUnit)previousUnit); // Restore the texture unit that was active on entry.
+        }
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.OpenGL/Effects/FxaaPostProcessingEffect.cs b/Ryujinx.Graphics.OpenGL/Effects/FxaaPostProcessingEffect.cs
new file mode 100644
index 0000000000..3a2d685b73
--- /dev/null
+++ b/Ryujinx.Graphics.OpenGL/Effects/FxaaPostProcessingEffect.cs
@@ -0,0 +1,115 @@
+using OpenTK.Graphics.OpenGL;
+using Ryujinx.Common;
+using Ryujinx.Graphics.OpenGL.Image;
+
+namespace Ryujinx.Graphics.OpenGL.Effects
+{
+    /// <summary>
+    /// FXAA anti-aliasing pass, executed as an OpenGL compute shader that writes into a texture owned by this effect.
+    /// </summary>
+    internal class FxaaPostProcessingEffect : IPostProcessingEffect
+    {
+        private readonly OpenGLRenderer _renderer;
+        private int _resolutionUniform;
+        private int _inputUniform;
+        private int _outputUniform;
+        private int _shaderProgram;
+        private TextureStorage _textureStorage;
+
+        /// <summary>
+        /// Creates the effect and compiles its compute program.
+        /// </summary>
+        /// <param name="renderer">Renderer used to allocate the output texture and to restore image bindings after a run.</param>
+        public FxaaPostProcessingEffect(OpenGLRenderer renderer)
+        {
+            // Assigned first so initialization can never observe a null renderer.
+            _renderer = renderer;
+
+            Initialize();
+        }
+
+        /// <summary>
+        /// Deletes the compute program and releases the output texture storage. Safe to call more than once.
+        /// </summary>
+        public void Dispose()
+        {
+            if (_shaderProgram != 0)
+            {
+                GL.DeleteProgram(_shaderProgram);
+                _shaderProgram = 0;
+            }
+
+            // The storage is created lazily by Run, independently of the program handle, so release it unconditionally.
+            _textureStorage?.Dispose();
+            _textureStorage = null;
+        }
+
+        /// <summary>
+        /// Compiles the FXAA compute shader from the embedded resource and caches its uniform locations.
+        /// </summary>
+        private void Initialize()
+        {
+            _shaderProgram = ShaderHelper.CompileProgram(EmbeddedResources.ReadAllText("Ryujinx.Graphics.OpenGL/Effects/Shaders/fxaa.glsl"), ShaderType.ComputeShader);
+
+            _resolutionUniform = GL.GetUniformLocation(_shaderProgram, "invResolution");
+            _inputUniform = GL.GetUniformLocation(_shaderProgram, "inputTexture");
+            _outputUniform = GL.GetUniformLocation(_shaderProgram, "imgOutput");
+        }
+
+        /// <summary>
+        /// Runs FXAA over <paramref name="view"/> and returns a view of the internal texture holding the result.
+        /// </summary>
+        /// <param name="view">Texture to anti-alias; it is bound to texture unit 0 for the dispatch.</param>
+        /// <param name="width">Not used by this effect.</param>
+        /// <param name="height">Not used by this effect.</param>
+        /// <returns>A newly created view over the effect's output texture.</returns>
+        public TextureView Run(TextureView view, int width, int height)
+        {
+            // (Re)allocate the output storage whenever the input dimensions change.
+            if (_textureStorage == null || _textureStorage.Info.Width != view.Width || _textureStorage.Info.Height != view.Height)
+            {
+                _textureStorage?.Dispose();
+                _textureStorage = new TextureStorage(_renderer, view.Info, view.ScaleFactor);
+                _textureStorage.CreateDefaultView();
+            }
+
+            var textureView = _textureStorage.CreateView(view.Info, 0, 0) as TextureView;
+
+            // Capture the state this pass overwrites so it can be restored before returning.
+            int previousProgram = GL.GetInteger(GetPName.CurrentProgram);
+            int previousUnit = GL.GetInteger(GetPName.ActiveTexture);
+            GL.ActiveTexture(TextureUnit.Texture0);
+            int previousTextureBinding = GL.GetInteger(GetPName.TextureBinding2D);
+
+            GL.BindImageTexture(0, textureView.Handle, 0, false, 0, TextureAccess.ReadWrite, SizedInternalFormat.Rgba8);
+            GL.UseProgram(_shaderProgram);
+
+            var dispatchX = BitUtils.DivRoundUp(view.Width, IPostProcessingEffect.LocalGroupSize);
+            var dispatchY = BitUtils.DivRoundUp(view.Height, IPostProcessingEffect.LocalGroupSize);
+
+            view.Bind(0);
+            GL.Uniform1(_inputUniform, 0);
+            GL.Uniform1(_outputUniform, 0);
+            GL.Uniform2(_resolutionUniform, (float)view.Width, (float)view.Height);
+            GL.DispatchCompute(dispatchX, dispatchY, 1);
+            GL.UseProgram(previousProgram);
+
+            // The result is produced with image stores but is later sampled as a texture (and possibly blitted),
+            // so the barrier has to cover texture fetches and framebuffer reads, not only image accesses.
+            GL.MemoryBarrier(
+                MemoryBarrierFlags.ShaderImageAccessBarrierBit |
+                MemoryBarrierFlags.TextureFetchBarrierBit |
+                MemoryBarrierFlags.FramebufferBarrierBit);
+
+            // Direct cast: a pipeline of the wrong type should fail loudly rather than as a null dereference.
+            ((Pipeline)_renderer.Pipeline).RestoreImages1And2();
+
+            GL.ActiveTexture(TextureUnit.Texture0);
+            GL.BindTexture(TextureTarget.Texture2D, previousTextureBinding);
+
+            GL.ActiveTexture((TextureUnit)previousUnit);
+
+            return textureView;
+        }
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.OpenGL/Effects/IPostProcessingEffect.cs b/Ryujinx.Graphics.OpenGL/Effects/IPostProcessingEffect.cs
new file mode 100644
index 0000000000..7a045a0212
--- /dev/null
+++ b/Ryujinx.Graphics.OpenGL/Effects/IPostProcessingEffect.cs
@@ -0,0 +1,25 @@
+using Ryujinx.Graphics.OpenGL.Image;
+using System;
+
+namespace Ryujinx.Graphics.OpenGL.Effects
+{
+    /// <summary>
+    /// A disposable post-processing pass that turns one texture view into another.
+    /// </summary>
+    internal interface IPostProcessingEffect : IDisposable
+    {
+        /// <summary>
+        /// Divisor implementations apply to the view dimensions when sizing their compute dispatch.
+        /// </summary>
+        public const int LocalGroupSize = 64;
+
+        /// <summary>
+        /// Applies the effect to <paramref name="view"/>.
+        /// </summary>
+        /// <param name="view">Texture view to process.</param>
+        /// <param name="width">Width supplied by the caller. NOTE(review): presumably the output width - confirm.</param>
+        /// <param name="height">Height supplied by the caller. NOTE(review): presumably the output height - confirm.</param>
+        /// <returns>The texture view holding the processed image.</returns>
+        public TextureView Run(TextureView view, int width, int height);
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.OpenGL/Effects/IScalingFilter.cs b/Ryujinx.Graphics.OpenGL/Effects/IScalingFilter.cs
new file mode 100644
index 0000000000..e1e1b2c1d0
--- /dev/null
+++ b/Ryujinx.Graphics.OpenGL/Effects/IScalingFilter.cs
@@ -0,0 +1,28 @@
+using Ryujinx.Graphics.GAL;
+using Ryujinx.Graphics.OpenGL.Image;
+using System;
+
+namespace Ryujinx.Graphics.OpenGL.Effects
+{
+    /// <summary>
+    /// A disposable filter that scales a region of one texture view into a region of another.
+    /// </summary>
+    internal interface IScalingFilter : IDisposable
+    {
+        /// <summary>
+        /// Filter strength. NOTE(review): presumably a 0-100 percentage - confirm against the callers.
+        /// </summary>
+        public float Level { get; set; }
+
+        /// <summary>
+        /// Scales the <paramref name="source"/> region of <paramref name="view"/> into the <paramref name="destination"/> region of <paramref name="destinationTexture"/>.
+        /// </summary>
+        /// <param name="view">Texture view read by the filter.</param>
+        /// <param name="destinationTexture">Texture view written by the filter.</param>
+        /// <param name="width">Width supplied by the caller. NOTE(review): presumably the output width - confirm.</param>
+        /// <param name="height">Height supplied by the caller. NOTE(review): presumably the output height - confirm.</param>
+        /// <param name="source">Extents of the region to read.</param>
+        /// <param name="destination">Extents of the region to write.</param>
+        public void Run(TextureView view, TextureView destinationTexture, int width, int height, Extents2D source, Extents2D destination);
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.OpenGL/Effects/ShaderHelper.cs b/Ryujinx.Graphics.OpenGL/Effects/ShaderHelper.cs
new file mode 100644
index 0000000000..72c5a98f54
--- /dev/null
+++ b/Ryujinx.Graphics.OpenGL/Effects/ShaderHelper.cs
@@ -0,0 +1,64 @@
+using OpenTK.Graphics.OpenGL;
+using System;
+
+namespace Ryujinx.Graphics.OpenGL.Effects
+{
+    /// <summary>
+    /// Builds single-stage OpenGL programs for the post-processing effects.
+    /// </summary>
+    internal static class ShaderHelper
+    {
+        /// <summary>
+        /// Compiles and links a program from one source string.
+        /// </summary>
+        /// <param name="shaderCode">Complete shader source.</param>
+        /// <param name="shaderType">Shader stage the source is compiled as.</param>
+        /// <returns>The linked program handle, or 0 if compilation or linking failed.</returns>
+        public static int CompileProgram(string shaderCode, ShaderType shaderType)
+        {
+            return CompileProgram(new[] { shaderCode }, shaderType);
+        }
+
+        /// <summary>
+        /// Compiles and links a program from several source strings, which GL concatenates in array order.
+        /// </summary>
+        /// <param name="shaders">Source fragments making up one shader.</param>
+        /// <param name="shaderType">Shader stage the source is compiled as.</param>
+        /// <returns>The linked program handle, or 0 if compilation or linking failed.</returns>
+        public static int CompileProgram(string[] shaders, ShaderType shaderType)
+        {
+            int shader = GL.CreateShader(shaderType);
+            GL.ShaderSource(shader, shaders.Length, shaders, (int[])null);
+            GL.CompileShader(shader);
+
+            GL.GetShader(shader, ShaderParameter.CompileStatus, out int compileStatus);
+            if (compileStatus == 0)
+            {
+                // NOTE(review): stderr keeps this file dependency-free; route through the project logger if preferred.
+                Console.Error.WriteLine($"Failed to compile effect shader:{Environment.NewLine}{GL.GetShaderInfoLog(shader)}");
+                GL.DeleteShader(shader);
+
+                return 0;
+            }
+
+            int program = GL.CreateProgram();
+            GL.AttachShader(program, shader);
+            GL.LinkProgram(program);
+
+            // The shader object is no longer needed once linking has been attempted.
+            GL.DetachShader(program, shader);
+            GL.DeleteShader(shader);
+
+            GL.GetProgram(program, GetProgramParameterName.LinkStatus, out int linkStatus);
+            if (linkStatus == 0)
+            {
+                Console.Error.WriteLine($"Failed to link effect program:{Environment.NewLine}{GL.GetProgramInfoLog(program)}");
+                GL.DeleteProgram(program);
+
+                return 0;
+            }
+
+            return program;
+        }
+    }
+}
diff --git a/Ryujinx.Graphics.OpenGL/Effects/Shaders/ffx_a.h b/Ryujinx.Graphics.OpenGL/Effects/Shaders/ffx_a.h
new file mode 100644
index 0000000000..d04bff55cb
--- /dev/null
+++ b/Ryujinx.Graphics.OpenGL/Effects/Shaders/ffx_a.h
@@ -0,0 +1,2656 @@
+//==============================================================================================================================
+//
+//                                               [A] SHADER PORTABILITY 1.20210629
+//
+//==============================================================================================================================
+// FidelityFX Super Resolution Sample
+//
+// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files(the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions :
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//------------------------------------------------------------------------------------------------------------------------------
+// MIT LICENSE
+// ===========
+// Copyright (c) 2014 Michal Drobot (for concepts used in "FLOAT APPROXIMATIONS").
+// -----------
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// -----------
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
+// Software.
+// -----------
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//------------------------------------------------------------------------------------------------------------------------------
+// ABOUT
+// =====
+// Common central point for high-level shading language and C portability for various shader headers.
+//------------------------------------------------------------------------------------------------------------------------------
+// DEFINES
+// =======
+// A_CPU ..... Include the CPU related code.
+// A_GPU ..... Include the GPU related code.
+// A_GLSL .... Using GLSL.
+// A_HLSL .... Using HLSL.
+// A_HLSL_6_2  Using HLSL 6.2 with new 'uint16_t' and related types (requires '-enable-16bit-types').
+// A_NO_16_BIT_CAST Don't use instructions that are not available in SPIR-V (needed for running A_HLSL_6_2 on Vulkan)
+// A_GCC ..... Using a GCC compatible compiler (else assume MSVC compatible compiler by default).
+// =======
+// A_BYTE .... Support 8-bit integer.
+// A_HALF .... Support 16-bit integer and floating point.
+// A_LONG .... Support 64-bit integer.
+// A_DUBL .... Support 64-bit floating point.
+// =======
+// A_WAVE .... Support wave-wide operations.
+//------------------------------------------------------------------------------------------------------------------------------
+// To get #include "ffx_a.h" working in GLSL use '#extension GL_GOOGLE_include_directive:require'.
+//------------------------------------------------------------------------------------------------------------------------------
+// SIMPLIFIED TYPE SYSTEM
+// ======================
+//  - All ints will be unsigned with exception of when signed is required.
+//  - Type naming simplified and shortened "A<type><#components>",
+//     - H = 16-bit float (half)
+//     - F = 32-bit float (float)
+//     - D = 64-bit float (double)
+//     - P = 1-bit integer (predicate, not using bool because 'B' is used for byte)
+//     - B = 8-bit integer (byte)
+//     - W = 16-bit integer (word)
+//     - U = 32-bit integer (unsigned)
+//     - L = 64-bit integer (long)
+//  - Using "AS<type><#components>" for signed when required.
+//------------------------------------------------------------------------------------------------------------------------------
+// TODO
+// ====
+//  - Make sure 'ALerp*(a,b,m)' does 'b*m+(-a*m+a)' (2 ops).
+//------------------------------------------------------------------------------------------------------------------------------
+// CHANGE LOG
+// ==========
+// 20200914 - Expanded wave ops and prx code.
+// 20200713 - Added [ZOL] section, fixed serious bugs in sRGB and Rec.709 color conversion code, etc.
+//==============================================================================================================================
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                           COMMON
+//==============================================================================================================================
+#define A_2PI 6.28318530718
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+//                                                             CPU
+//
+//
+//==============================================================================================================================
+#ifdef A_CPU
+ // Supporting user defined overrides.
+ #ifndef A_RESTRICT
+  #define A_RESTRICT __restrict
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifndef A_STATIC
+  #define A_STATIC static
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ // Same types across CPU and GPU.
+ // Predicate uses 32-bit integer (C friendly bool).
+ typedef uint32_t AP1;
+ typedef float AF1;
+ typedef double AD1;
+ typedef uint8_t AB1;
+ typedef uint16_t AW1;
+ typedef uint32_t AU1;
+ typedef uint64_t AL1;
+ typedef int8_t ASB1;
+ typedef int16_t ASW1;
+ typedef int32_t ASU1;
+ typedef int64_t ASL1;
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AD1_(a) ((AD1)(a))
+ #define AF1_(a) ((AF1)(a))
+ #define AL1_(a) ((AL1)(a))
+ #define AU1_(a) ((AU1)(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ASL1_(a) ((ASL1)(a))
+ #define ASU1_(a) ((ASU1)(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AU1 AU1_AF1(AF1 a){union{AF1 f;AU1 u;}bits;bits.f=a;return bits.u;}
+//------------------------------------------------------------------------------------------------------------------------------
+ #define A_TRUE 1
+ #define A_FALSE 0
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                                       CPU/GPU PORTING
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// Get CPU and GPU to share all setup code, without duplicate code paths.
+// This uses a lower-case prefix for special vector constructs.
+//  - In C restrict pointers are used.
+//  - In the shading language, in/inout/out arguments are used.
+// This depends on the ability to access a vector value in both languages via array syntax (aka color[2]).
+//==============================================================================================================================
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                     VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
+//==============================================================================================================================
+ #define retAD2 AD1 *A_RESTRICT
+ #define retAD3 AD1 *A_RESTRICT
+ #define retAD4 AD1 *A_RESTRICT
+ #define retAF2 AF1 *A_RESTRICT
+ #define retAF3 AF1 *A_RESTRICT
+ #define retAF4 AF1 *A_RESTRICT
+ #define retAL2 AL1 *A_RESTRICT
+ #define retAL3 AL1 *A_RESTRICT
+ #define retAL4 AL1 *A_RESTRICT
+ #define retAU2 AU1 *A_RESTRICT
+ #define retAU3 AU1 *A_RESTRICT
+ #define retAU4 AU1 *A_RESTRICT
+//------------------------------------------------------------------------------------------------------------------------------
+ #define inAD2 AD1 *A_RESTRICT
+ #define inAD3 AD1 *A_RESTRICT
+ #define inAD4 AD1 *A_RESTRICT
+ #define inAF2 AF1 *A_RESTRICT
+ #define inAF3 AF1 *A_RESTRICT
+ #define inAF4 AF1 *A_RESTRICT
+ #define inAL2 AL1 *A_RESTRICT
+ #define inAL3 AL1 *A_RESTRICT
+ #define inAL4 AL1 *A_RESTRICT
+ #define inAU2 AU1 *A_RESTRICT
+ #define inAU3 AU1 *A_RESTRICT
+ #define inAU4 AU1 *A_RESTRICT
+//------------------------------------------------------------------------------------------------------------------------------
+ #define inoutAD2 AD1 *A_RESTRICT
+ #define inoutAD3 AD1 *A_RESTRICT
+ #define inoutAD4 AD1 *A_RESTRICT
+ #define inoutAF2 AF1 *A_RESTRICT
+ #define inoutAF3 AF1 *A_RESTRICT
+ #define inoutAF4 AF1 *A_RESTRICT
+ #define inoutAL2 AL1 *A_RESTRICT
+ #define inoutAL3 AL1 *A_RESTRICT
+ #define inoutAL4 AL1 *A_RESTRICT
+ #define inoutAU2 AU1 *A_RESTRICT
+ #define inoutAU3 AU1 *A_RESTRICT
+ #define inoutAU4 AU1 *A_RESTRICT
+//------------------------------------------------------------------------------------------------------------------------------
+ #define outAD2 AD1 *A_RESTRICT
+ #define outAD3 AD1 *A_RESTRICT
+ #define outAD4 AD1 *A_RESTRICT
+ #define outAF2 AF1 *A_RESTRICT
+ #define outAF3 AF1 *A_RESTRICT
+ #define outAF4 AF1 *A_RESTRICT
+ #define outAL2 AL1 *A_RESTRICT
+ #define outAL3 AL1 *A_RESTRICT
+ #define outAL4 AL1 *A_RESTRICT
+ #define outAU2 AU1 *A_RESTRICT
+ #define outAU3 AU1 *A_RESTRICT
+ #define outAU4 AU1 *A_RESTRICT
+//------------------------------------------------------------------------------------------------------------------------------
+ #define varAD2(x) AD1 x[2]
+ #define varAD3(x) AD1 x[3]
+ #define varAD4(x) AD1 x[4]
+ #define varAF2(x) AF1 x[2]
+ #define varAF3(x) AF1 x[3]
+ #define varAF4(x) AF1 x[4]
+ #define varAL2(x) AL1 x[2]
+ #define varAL3(x) AL1 x[3]
+ #define varAL4(x) AL1 x[4]
+ #define varAU2(x) AU1 x[2]
+ #define varAU3(x) AU1 x[3]
+ #define varAU4(x) AU1 x[4]
+//------------------------------------------------------------------------------------------------------------------------------
+ #define initAD2(x,y) {x,y}
+ #define initAD3(x,y,z) {x,y,z}
+ #define initAD4(x,y,z,w) {x,y,z,w}
+ #define initAF2(x,y) {x,y}
+ #define initAF3(x,y,z) {x,y,z}
+ #define initAF4(x,y,z,w) {x,y,z,w}
+ #define initAL2(x,y) {x,y}
+ #define initAL3(x,y,z) {x,y,z}
+ #define initAL4(x,y,z,w) {x,y,z,w}
+ #define initAU2(x,y) {x,y}
+ #define initAU3(x,y,z) {x,y,z}
+ #define initAU4(x,y,z,w) {x,y,z,w}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     SCALAR RETURN OPS
+//------------------------------------------------------------------------------------------------------------------------------
+// TODO
+// ====
+//  - Replace transcendentals with manual versions. 
+//==============================================================================================================================
+ #ifdef A_GCC // Absolute value: compiler builtins under A_GCC, C library calls otherwise.
+  A_STATIC AD1 AAbsD1(AD1 a){return __builtin_fabs(a);}
+  A_STATIC AF1 AAbsF1(AF1 a){return __builtin_fabsf(a);}
+  A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(__builtin_abs(ASU1_(a)));} // Operand is reinterpreted as signed 32-bit.
+  A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(__builtin_llabs(ASL1_(a)));} // Operand is reinterpreted as signed 64-bit.
+ #else
+  A_STATIC AD1 AAbsD1(AD1 a){return fabs(a);}
+  A_STATIC AF1 AAbsF1(AF1 a){return fabsf(a);}
+  A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(abs(ASU1_(a)));} // Operand is reinterpreted as signed 32-bit.
+  A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(llabs((long long)ASL1_(a)));} // llabs, not labs: 'long' can be 32-bit (e.g. MSVC) and would truncate.
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_GCC
+  A_STATIC AD1 ACosD1(AD1 a){return __builtin_cos(a);}
+  A_STATIC AF1 ACosF1(AF1 a){return __builtin_cosf(a);}
+ #else
+  A_STATIC AD1 ACosD1(AD1 a){return cos(a);}
+  A_STATIC AF1 ACosF1(AF1 a){return cosf(a);}
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 ADotD2(inAD2 a,inAD2 b){return a[0]*b[0]+a[1]*b[1];}
+ A_STATIC AD1 ADotD3(inAD3 a,inAD3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];}
+ A_STATIC AD1 ADotD4(inAD4 a,inAD4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];}
+ A_STATIC AF1 ADotF2(inAF2 a,inAF2 b){return a[0]*b[0]+a[1]*b[1];}
+ A_STATIC AF1 ADotF3(inAF3 a,inAF3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];}
+ A_STATIC AF1 ADotF4(inAF4 a,inAF4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];}
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_GCC
+  A_STATIC AD1 AExp2D1(AD1 a){return __builtin_exp2(a);}
+  A_STATIC AF1 AExp2F1(AF1 a){return __builtin_exp2f(a);}
+ #else
+  A_STATIC AD1 AExp2D1(AD1 a){return exp2(a);}
+  A_STATIC AF1 AExp2F1(AF1 a){return exp2f(a);}
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_GCC
+  A_STATIC AD1 AFloorD1(AD1 a){return __builtin_floor(a);}
+  A_STATIC AF1 AFloorF1(AF1 a){return __builtin_floorf(a);}
+ #else
+  A_STATIC AD1 AFloorD1(AD1 a){return floor(a);}
+  A_STATIC AF1 AFloorF1(AF1 a){return floorf(a);}
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 ALerpD1(AD1 a,AD1 b,AD1 c){return b*c+(-a*c+a);}
+ A_STATIC AF1 ALerpF1(AF1 a,AF1 b,AF1 c){return b*c+(-a*c+a);}
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_GCC
+  A_STATIC AD1 ALog2D1(AD1 a){return __builtin_log2(a);}
+  A_STATIC AF1 ALog2F1(AF1 a){return __builtin_log2f(a);}
+ #else
+  A_STATIC AD1 ALog2D1(AD1 a){return log2(a);}
+  A_STATIC AF1 ALog2F1(AF1 a){return log2f(a);}
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 AMaxD1(AD1 a,AD1 b){return a>b?a:b;}
+ A_STATIC AF1 AMaxF1(AF1 a,AF1 b){return a>b?a:b;}
+ A_STATIC AL1 AMaxL1(AL1 a,AL1 b){return a>b?a:b;}
+ A_STATIC AU1 AMaxU1(AU1 a,AU1 b){return a>b?a:b;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // These follow the convention that A integer types don't have signage, until they are operated on. 
+ A_STATIC AL1 AMaxSL1(AL1 a,AL1 b){return (ASL1_(a)>ASL1_(b))?a:b;}
+ A_STATIC AU1 AMaxSU1(AU1 a,AU1 b){return (ASU1_(a)>ASU1_(b))?a:b;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 AMinD1(AD1 a,AD1 b){return a<b?a:b;}
+ A_STATIC AF1 AMinF1(AF1 a,AF1 b){return a<b?a:b;}
+ A_STATIC AL1 AMinL1(AL1 a,AL1 b){return a<b?a:b;}
+ A_STATIC AU1 AMinU1(AU1 a,AU1 b){return a<b?a:b;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AL1 AMinSL1(AL1 a,AL1 b){return (ASL1_(a)<ASL1_(b))?a:b;}
+ A_STATIC AU1 AMinSU1(AU1 a,AU1 b){return (ASU1_(a)<ASU1_(b))?a:b;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 ARcpD1(AD1 a){return 1.0/a;}
+ A_STATIC AF1 ARcpF1(AF1 a){return 1.0f/a;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AL1 AShrSL1(AL1 a,AL1 b){return AL1_(ASL1_(a)>>ASL1_(b));}
+ A_STATIC AU1 AShrSU1(AU1 a,AU1 b){return AU1_(ASU1_(a)>>ASU1_(b));}
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_GCC
+  A_STATIC AD1 ASinD1(AD1 a){return __builtin_sin(a);}
+  A_STATIC AF1 ASinF1(AF1 a){return __builtin_sinf(a);}
+ #else
+  A_STATIC AD1 ASinD1(AD1 a){return sin(a);}
+  A_STATIC AF1 ASinF1(AF1 a){return sinf(a);}
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_GCC
+  A_STATIC AD1 ASqrtD1(AD1 a){return __builtin_sqrt(a);}
+  A_STATIC AF1 ASqrtF1(AF1 a){return __builtin_sqrtf(a);}
+ #else
+  A_STATIC AD1 ASqrtD1(AD1 a){return sqrt(a);}
+  A_STATIC AF1 ASqrtF1(AF1 a){return sqrtf(a);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                               SCALAR RETURN OPS - DEPENDENT
+//==============================================================================================================================
+ A_STATIC AD1 AClampD1(AD1 x,AD1 n,AD1 m){return AMaxD1(n,AMinD1(x,m));} // Clamp x to [n,m]: max(n,min(x,m)).
+ A_STATIC AF1 AClampF1(AF1 x,AF1 n,AF1 m){return AMaxF1(n,AMinF1(x,m));} // Float variant of AClampD1.
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 AFractD1(AD1 a){return a-AFloorD1(a);} // Fractional part: a minus floor(a).
+ A_STATIC AF1 AFractF1(AF1 a){return a-AFloorF1(a);} // Float variant of AFractD1.
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 APowD1(AD1 a,AD1 b){return AExp2D1(b*ALog2D1(a));} // Power computed as exp2(b*log2(a)).
+ A_STATIC AF1 APowF1(AF1 a,AF1 b){return AExp2F1(b*ALog2F1(a));} // Float variant of APowD1.
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 ARsqD1(AD1 a){return ARcpD1(ASqrtD1(a));} // Reciprocal square root: 1/sqrt(a).
+ A_STATIC AF1 ARsqF1(AF1 a){return ARcpF1(ASqrtF1(a));} // Float variant of ARsqD1.
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 ASatD1(AD1 a){return AMinD1(1.0,AMaxD1(0.0,a));} // Saturate: clamp a to [0,1].
+ A_STATIC AF1 ASatF1(AF1 a){return AMinF1(1.0f,AMaxF1(0.0f,a));} // Float variant of ASatD1.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                         VECTOR OPS
+//------------------------------------------------------------------------------------------------------------------------------
+// These are added as needed for production or prototyping, so not necessarily a complete set.
+// They follow a convention of taking in a destination and also returning the destination value to increase utility.
+//==============================================================================================================================
+ A_STATIC retAD2 opAAbsD2(outAD2 d,inAD2 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);return d;}
+ A_STATIC retAD3 opAAbsD3(outAD3 d,inAD3 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);return d;}
+ A_STATIC retAD4 opAAbsD4(outAD4 d,inAD4 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);d[3]=AAbsD1(a[3]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAAbsF2(outAF2 d,inAF2 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);return d;}
+ A_STATIC retAF3 opAAbsF3(outAF3 d,inAF3 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);return d;}
+ A_STATIC retAF4 opAAbsF4(outAF4 d,inAF4 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);d[3]=AAbsF1(a[3]);return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;}
+ A_STATIC retAD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;}
+ A_STATIC retAD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;}
+ A_STATIC retAF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;}
+ A_STATIC retAF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){for(int i=0;i<2;i++)d[i]=a[i]+b;return d;} // AD* variants: adds the single scalar b to every component of a, returns d.
+ A_STATIC retAD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){for(int i=0;i<3;i++)d[i]=a[i]+b;return d;}
+ A_STATIC retAD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){for(int i=0;i<4;i++)d[i]=a[i]+b;return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){for(int i=0;i<2;i++)d[i]=a[i]+b;return d;} // AF* variants: same scalar-plus-vector sum.
+ A_STATIC retAF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){for(int i=0;i<3;i++)d[i]=a[i]+b;return d;}
+ A_STATIC retAF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){for(int i=0;i<4;i++)d[i]=a[i]+b;return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opACpyD2(outAD2 d,inAD2 a){for(int i=0;i<2;i++)d[i]=a[i];return d;} // AD* variants: copies every component of a into d, returns d.
+ A_STATIC retAD3 opACpyD3(outAD3 d,inAD3 a){for(int i=0;i<3;i++)d[i]=a[i];return d;}
+ A_STATIC retAD4 opACpyD4(outAD4 d,inAD4 a){for(int i=0;i<4;i++)d[i]=a[i];return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opACpyF2(outAF2 d,inAF2 a){for(int i=0;i<2;i++)d[i]=a[i];return d;} // AF* variants: same component-wise copy.
+ A_STATIC retAF3 opACpyF3(outAF3 d,inAF3 a){for(int i=0;i<3;i++)d[i]=a[i];return d;}
+ A_STATIC retAF4 opACpyF4(outAF4 d,inAF4 a){for(int i=0;i<4;i++)d[i]=a[i];return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){for(int i=0;i<2;i++)d[i]=ALerpD1(a[i],b[i],c[i]);return d;} // AD* variants: d[i]=ALerpD1(a[i],b[i],c[i]), one weight per component; returns d.
+ A_STATIC retAD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){for(int i=0;i<3;i++)d[i]=ALerpD1(a[i],b[i],c[i]);return d;}
+ A_STATIC retAD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){for(int i=0;i<4;i++)d[i]=ALerpD1(a[i],b[i],c[i]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){for(int i=0;i<2;i++)d[i]=ALerpF1(a[i],b[i],c[i]);return d;} // AF* variants: same shape, built on ALerpF1.
+ A_STATIC retAF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){for(int i=0;i<3;i++)d[i]=ALerpF1(a[i],b[i],c[i]);return d;}
+ A_STATIC retAF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){for(int i=0;i<4;i++)d[i]=ALerpF1(a[i],b[i],c[i]);return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){for(int i=0;i<2;i++)d[i]=ALerpD1(a[i],b[i],c);return d;} // AD* variants: d[i]=ALerpD1(a[i],b[i],c) with one scalar weight c shared by all components; returns d.
+ A_STATIC retAD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){for(int i=0;i<3;i++)d[i]=ALerpD1(a[i],b[i],c);return d;}
+ A_STATIC retAD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){for(int i=0;i<4;i++)d[i]=ALerpD1(a[i],b[i],c);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){for(int i=0;i<2;i++)d[i]=ALerpF1(a[i],b[i],c);return d;} // AF* variants: same shape, built on ALerpF1.
+ A_STATIC retAF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){for(int i=0;i<3;i++)d[i]=ALerpF1(a[i],b[i],c);return d;}
+ A_STATIC retAF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){for(int i=0;i<4;i++)d[i]=ALerpF1(a[i],b[i],c);return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){for(int i=0;i<2;i++)d[i]=AMaxD1(a[i],b[i]);return d;} // AD* variants: d[i]=AMaxD1(a[i],b[i]) for each component, returns d.
+ A_STATIC retAD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){for(int i=0;i<3;i++)d[i]=AMaxD1(a[i],b[i]);return d;}
+ A_STATIC retAD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){for(int i=0;i<4;i++)d[i]=AMaxD1(a[i],b[i]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){for(int i=0;i<2;i++)d[i]=AMaxF1(a[i],b[i]);return d;} // AF* variants: same shape, built on AMaxF1.
+ A_STATIC retAF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){for(int i=0;i<3;i++)d[i]=AMaxF1(a[i],b[i]);return d;}
+ A_STATIC retAF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){for(int i=0;i<4;i++)d[i]=AMaxF1(a[i],b[i]);return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){for(int i=0;i<2;i++)d[i]=AMinD1(a[i],b[i]);return d;} // AD* variants: d[i]=AMinD1(a[i],b[i]) for each component, returns d.
+ A_STATIC retAD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){for(int i=0;i<3;i++)d[i]=AMinD1(a[i],b[i]);return d;}
+ A_STATIC retAD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){for(int i=0;i<4;i++)d[i]=AMinD1(a[i],b[i]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){for(int i=0;i<2;i++)d[i]=AMinF1(a[i],b[i]);return d;} // AF* variants: same shape, built on AMinF1.
+ A_STATIC retAF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){for(int i=0;i<3;i++)d[i]=AMinF1(a[i],b[i]);return d;}
+ A_STATIC retAF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){for(int i=0;i<4;i++)d[i]=AMinF1(a[i],b[i]);return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){for(int i=0;i<2;i++)d[i]=a[i]*b[i];return d;} // AD* variants: d[i]=a[i]*b[i] for each component (not a dot product), returns d.
+ A_STATIC retAD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){for(int i=0;i<3;i++)d[i]=a[i]*b[i];return d;}
+ A_STATIC retAD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){for(int i=0;i<4;i++)d[i]=a[i]*b[i];return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){for(int i=0;i<2;i++)d[i]=a[i]*b[i];return d;} // AF* variants: same component-wise product.
+ A_STATIC retAF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){for(int i=0;i<3;i++)d[i]=a[i]*b[i];return d;}
+ A_STATIC retAF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){for(int i=0;i<4;i++)d[i]=a[i]*b[i];return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){for(int i=0;i<2;i++)d[i]=a[i]*b;return d;} // AD* variants: scales every component of a by the single scalar b, returns d.
+ A_STATIC retAD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){for(int i=0;i<3;i++)d[i]=a[i]*b;return d;}
+ A_STATIC retAD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){for(int i=0;i<4;i++)d[i]=a[i]*b;return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){for(int i=0;i<2;i++)d[i]=a[i]*b;return d;} // AF* variants: same scalar scaling.
+ A_STATIC retAF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){for(int i=0;i<3;i++)d[i]=a[i]*b;return d;}
+ A_STATIC retAF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){for(int i=0;i<4;i++)d[i]=a[i]*b;return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opANegD2(outAD2 d,inAD2 a){for(int i=0;i<2;i++)d[i]=-a[i];return d;} // AD* variants: d[i]=-a[i] for each component, returns d.
+ A_STATIC retAD3 opANegD3(outAD3 d,inAD3 a){for(int i=0;i<3;i++)d[i]=-a[i];return d;}
+ A_STATIC retAD4 opANegD4(outAD4 d,inAD4 a){for(int i=0;i<4;i++)d[i]=-a[i];return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opANegF2(outAF2 d,inAF2 a){for(int i=0;i<2;i++)d[i]=-a[i];return d;} // AF* variants: same component-wise negation.
+ A_STATIC retAF3 opANegF3(outAF3 d,inAF3 a){for(int i=0;i<3;i++)d[i]=-a[i];return d;}
+ A_STATIC retAF4 opANegF4(outAF4 d,inAF4 a){for(int i=0;i<4;i++)d[i]=-a[i];return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opARcpD2(outAD2 d,inAD2 a){for(int i=0;i<2;i++)d[i]=ARcpD1(a[i]);return d;} // AD* variants: d[i]=ARcpD1(a[i]) for each component, returns d.
+ A_STATIC retAD3 opARcpD3(outAD3 d,inAD3 a){for(int i=0;i<3;i++)d[i]=ARcpD1(a[i]);return d;}
+ A_STATIC retAD4 opARcpD4(outAD4 d,inAD4 a){for(int i=0;i<4;i++)d[i]=ARcpD1(a[i]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opARcpF2(outAF2 d,inAF2 a){for(int i=0;i<2;i++)d[i]=ARcpF1(a[i]);return d;} // AF* variants: same shape, built on ARcpF1.
+ A_STATIC retAF3 opARcpF3(outAF3 d,inAF3 a){for(int i=0;i<3;i++)d[i]=ARcpF1(a[i]);return d;}
+ A_STATIC retAF4 opARcpF4(outAF4 d,inAF4 a){for(int i=0;i<4;i++)d[i]=ARcpF1(a[i]);return d;}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     HALF FLOAT PACKING
+//==============================================================================================================================
+ // Convert float to half (result in the lower 16 bits of the output); assumes AF1 is an IEEE-754 32-bit float and AU1 is 32 bits wide.
+ // Same fast table-lookup technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
+ // Supports denormals. The mantissa is shifted right (truncated), not rounded; magnitudes below the denormal range become signed zero.
+ // Conversion rules are to make computations possibly "safer" on the GPU: every exponent above the half range saturates, so
+ //  -INF & -NaN (and any finite value too negative for half) -> -65504 (0xfbff)
+ //  +INF & +NaN (and any finite value too large for half)    -> +65504 (0x7bff)
+ A_STATIC AU1 AU1_AH1_AF1(AF1 f){
+  static AW1 base[512]={ // Indexed by the float's top 9 bits (sign + 8-bit exponent): half sign/exponent bits, leading denormal bit, or the saturated value.
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100,
+   0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00,
+   0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100,
+   0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00,
+   0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff};
+  static AB1 shift[512]={ // Indexed the same way: right shift applied to the 23-bit mantissa (0x0d keeps the top 10 bits, 0x18 discards it entirely).
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
+   0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
+   0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
+   0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
+   0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18};
+  union{AF1 f;AU1 u;}bits;bits.f=f;AU1 u=bits.u;AU1 i=u>>23;return (AU1)(base[i])+((u&0x7fffff)>>shift[i]);} // Read the float's bits through the union, index both tables by sign+exponent (u>>23), then add the shifted mantissa to the base.
+//------------------------------------------------------------------------------------------------------------------------------
+ // Used to output packed constants: a[0] converted to half lands in the low 16 bits of the result, a[1] in the bits above.
+ A_STATIC AU1 AU1_AH2_AF2(inAF2 a){AU1 lo=AU1_AH1_AF1(a[0]);AU1 hi=AU1_AH1_AF1(a[1]);return lo+(hi<<16);}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+//                                                            GLSL
+//
+//
+//==============================================================================================================================
+#if defined(A_GLSL) && defined(A_GPU)
+ #ifndef A_SKIP_EXT // Define A_SKIP_EXT before including to suppress every #extension directive below.
+  #ifdef A_HALF // Extensions required when A_HALF is defined.
+   #extension GL_EXT_shader_16bit_storage:require
+   #extension GL_EXT_shader_explicit_arithmetic_types:require 
+  #endif
+//------------------------------------------------------------------------------------------------------------------------------
+  #ifdef A_LONG // Extensions required when A_LONG is defined (note the second one is an NV vendor extension).
+   #extension GL_ARB_gpu_shader_int64:require
+   #extension GL_NV_shader_atomic_int64:require
+  #endif
+//------------------------------------------------------------------------------------------------------------------------------
+  #ifdef A_WAVE // Subgroup extensions required when A_WAVE is defined.
+   #extension GL_KHR_shader_subgroup_arithmetic:require
+   #extension GL_KHR_shader_subgroup_ballot:require
+   #extension GL_KHR_shader_subgroup_quad:require
+   #extension GL_KHR_shader_subgroup_shuffle:require
+  #endif
+ #endif
+//==============================================================================================================================
+ #define AP1 bool // AP1..AP4: boolean scalar and 2/3/4-component boolean vectors.
+ #define AP2 bvec2
+ #define AP3 bvec3
+ #define AP4 bvec4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AF1 float // AF1..AF4: float scalar and 2/3/4-component float vectors.
+ #define AF2 vec2
+ #define AF3 vec3
+ #define AF4 vec4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AU1 uint // AU1..AU4: unsigned integer scalar and vectors.
+ #define AU2 uvec2
+ #define AU3 uvec3
+ #define AU4 uvec4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ASU1 int // ASU1..ASU4: signed integer scalar and vectors.
+ #define ASU2 ivec2
+ #define ASU3 ivec3
+ #define ASU4 ivec4
+//==============================================================================================================================
+ #define AF1_AU1(x) uintBitsToFloat(AU1(x)) // AFn_AUn: reinterpret unsigned-integer bits as float (bit pattern kept, no numeric conversion).
+ #define AF2_AU2(x) uintBitsToFloat(AU2(x))
+ #define AF3_AU3(x) uintBitsToFloat(AU3(x))
+ #define AF4_AU4(x) uintBitsToFloat(AU4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AU1_AF1(x) floatBitsToUint(AF1(x)) // AUn_AFn: reinterpret float bits as unsigned integer (the opposite direction).
+ #define AU2_AF2(x) floatBitsToUint(AF2(x))
+ #define AU3_AF3(x) floatBitsToUint(AF3(x))
+ #define AU4_AF4(x) floatBitsToUint(AF4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AU1_AH1_AF1_x(AF1 a){return packHalf2x16(AF2(a,0.0));} // Half encoding of 'a' in the low 16 bits; the high 16 bits hold the half encoding of 0.0.
+ #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a)) // Wrapper that casts its argument to AF1 before the call.
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AU1_AH2_AF2 packHalf2x16 // Packers into one uint: two halves / two 16-bit unorms / four 8-bit unorms.
+ #define AU1_AW2Unorm_AF2 packUnorm2x16
+ #define AU1_AB4Unorm_AF4 packUnorm4x8
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AF2_AH2_AU1 unpackHalf2x16 // Unpackers from one uint: the reverse of the three packers in the group before this one.
+ #define AF2_AW2Unorm_AU1 unpackUnorm2x16
+ #define AF4_AB4Unorm_AU1 unpackUnorm4x8
+//==============================================================================================================================
+ AF1 AF1_x(AF1 a){return a;} // Float broadcast helpers: replicate one scalar into every component of the result.
+ AF2 AF2_x(AF1 a){return AF2(a);}
+ AF3 AF3_x(AF1 a){return AF3(a);}
+ AF4 AF4_x(AF1 a){return AF4(a);}
+ #define AF1_(a) AF1_x(AF1(a))
+ #define AF2_(a) AF2_x(AF1(a))
+ #define AF3_(a) AF3_x(AF1(a))
+ #define AF4_(a) AF4_x(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AU1_x(AU1 a){return a;} // Unsigned-integer broadcast helpers, same shape as the float ones.
+ AU2 AU2_x(AU1 a){return AU2(a);}
+ AU3 AU3_x(AU1 a){return AU3(a);}
+ AU4 AU4_x(AU1 a){return AU4(a);}
+ #define AU1_(a) AU1_x(AU1(a))
+ #define AU2_(a) AU2_x(AU1(a))
+ #define AU3_(a) AU3_x(AU1(a))
+ #define AU4_(a) AU4_x(AU1(a))
+//==============================================================================================================================
+ AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} // Absolute value with the operand converted to signed (ASU*) first; the result is converted back to unsigned.
+ AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));}
+ AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));}
+ AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 ABfe(AU1 src,AU1 off,AU1 bits){return bitfieldExtract(src,ASU1(off),ASU1(bits));} // Bit-field extract: 'bits' bits of src starting at bit 'off' (unsigned form of the built-in, since src is uint).
+ AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} // Bit-field insert by mask: takes 'ins' where mask bits are 1 and 'src' where they are 0.
+ // Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<<bits)-1', and 'bits' needs to be an immediate.
+ AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){return bitfieldInsert(src,ins,0,ASU1(bits));} // Replaces the low 'bits' bits of src (offset 0) with the low bits of ins.
+//------------------------------------------------------------------------------------------------------------------------------
+ // V_MED3_F32.
+ AF1 AClampF1(AF1 x,AF1 n,AF1 m){return clamp(x,n,m);} // Per-component clamp of x to the range [n,m] (n is the lower bound, m the upper).
+ AF2 AClampF2(AF2 x,AF2 n,AF2 m){return clamp(x,n,m);}
+ AF3 AClampF3(AF3 x,AF3 n,AF3 m){return clamp(x,n,m);}
+ AF4 AClampF4(AF4 x,AF4 n,AF4 m){return clamp(x,n,m);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // V_FRACT_F32 (note DX frac() is different).
+ AF1 AFractF1(AF1 x){return fract(x);} // Per-component fractional part via the GLSL fract() built-in.
+ AF2 AFractF2(AF2 x){return fract(x);}
+ AF3 AFractF3(AF3 x){return fract(x);}
+ AF4 AFractF4(AF4 x){return fract(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ALerpF1(AF1 x,AF1 y,AF1 a){return mix(x,y,a);} // Per-component linear blend from x (a=0) to y (a=1) via the GLSL mix() built-in.
+ AF2 ALerpF2(AF2 x,AF2 y,AF2 a){return mix(x,y,a);}
+ AF3 ALerpF3(AF3 x,AF3 y,AF3 a){return mix(x,y,a);}
+ AF4 ALerpF4(AF4 x,AF4 y,AF4 a){return mix(x,y,a);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // V_MAX3_F32.
+ AF1 AMax3F1(AF1 x,AF1 y,AF1 z){return max(x,max(y,z));} // Per-component maximum of three floats.
+ AF2 AMax3F2(AF2 x,AF2 y,AF2 z){return max(x,max(y,z));}
+ AF3 AMax3F3(AF3 x,AF3 y,AF3 z){return max(x,max(y,z));}
+ AF4 AMax3F4(AF4 x,AF4 y,AF4 z){return max(x,max(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){return AU1(max(ASU1(x),max(ASU1(y),ASU1(z))));} // Maximum of three with the operands compared as signed integers; result converted back to unsigned.
+ AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){return AU2(max(ASU2(x),max(ASU2(y),ASU2(z))));}
+ AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){return AU3(max(ASU3(x),max(ASU3(y),ASU3(z))));}
+ AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){return AU4(max(ASU4(x),max(ASU4(y),ASU4(z))));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMax3U1(AU1 x,AU1 y,AU1 z){return max(x,max(y,z));} // Maximum of three with the operands compared as unsigned integers.
+ AU2 AMax3U2(AU2 x,AU2 y,AU2 z){return max(x,max(y,z));}
+ AU3 AMax3U3(AU3 x,AU3 y,AU3 z){return max(x,max(y,z));}
+ AU4 AMax3U4(AU4 x,AU4 y,AU4 z){return max(x,max(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMaxSU1(AU1 a,AU1 b){return AU1(max(ASU1(a),ASU1(b)));} // Maximum of two with the operands compared as signed integers; result converted back to unsigned.
+ AU2 AMaxSU2(AU2 a,AU2 b){return AU2(max(ASU2(a),ASU2(b)));}
+ AU3 AMaxSU3(AU3 a,AU3 b){return AU3(max(ASU3(a),ASU3(b)));}
+ AU4 AMaxSU4(AU4 a,AU4 b){return AU4(max(ASU4(a),ASU4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Clamp has an easier pattern match for med3 when some ordering is known.
+ // V_MED3_F32.
+ AF1 AMed3F1(AF1 x,AF1 y,AF1 z){return max(min(x,y),min(max(x,y),z));} // Per-component median (middle value) of x, y and z, with no ordering assumed between them.
+ AF2 AMed3F2(AF2 x,AF2 y,AF2 z){return max(min(x,y),min(max(x,y),z));}
+ AF3 AMed3F3(AF3 x,AF3 y,AF3 z){return max(min(x,y),min(max(x,y),z));}
+ AF4 AMed3F4(AF4 x,AF4 y,AF4 z){return max(min(x,y),min(max(x,y),z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // V_MIN3_F32.
+ AF1 AMin3F1(AF1 x,AF1 y,AF1 z){return min(x,min(y,z));} // Per-component minimum of three floats.
+ AF2 AMin3F2(AF2 x,AF2 y,AF2 z){return min(x,min(y,z));}
+ AF3 AMin3F3(AF3 x,AF3 y,AF3 z){return min(x,min(y,z));}
+ AF4 AMin3F4(AF4 x,AF4 y,AF4 z){return min(x,min(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMin3SU1(AU1 x,AU1 y,AU1 z){return AU1(min(ASU1(x),min(ASU1(y),ASU1(z))));} // Minimum of three with the operands compared as signed integers; result converted back to unsigned.
+ AU2 AMin3SU2(AU2 x,AU2 y,AU2 z){return AU2(min(ASU2(x),min(ASU2(y),ASU2(z))));}
+ AU3 AMin3SU3(AU3 x,AU3 y,AU3 z){return AU3(min(ASU3(x),min(ASU3(y),ASU3(z))));}
+ AU4 AMin3SU4(AU4 x,AU4 y,AU4 z){return AU4(min(ASU4(x),min(ASU4(y),ASU4(z))));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMin3U1(AU1 x,AU1 y,AU1 z){return min(x,min(y,z));} // Minimum of three with the operands compared as unsigned integers.
+ AU2 AMin3U2(AU2 x,AU2 y,AU2 z){return min(x,min(y,z));}
+ AU3 AMin3U3(AU3 x,AU3 y,AU3 z){return min(x,min(y,z));}
+ AU4 AMin3U4(AU4 x,AU4 y,AU4 z){return min(x,min(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMinSU1(AU1 a,AU1 b){return AU1(min(ASU1(a),ASU1(b)));} // Minimum of two with the operands compared as signed integers; result converted back to unsigned.
+ AU2 AMinSU2(AU2 a,AU2 b){return AU2(min(ASU2(a),ASU2(b)));}
+ AU3 AMinSU3(AU3 a,AU3 b){return AU3(min(ASU3(a),ASU3(b)));}
+ AU4 AMinSU4(AU4 a,AU4 b){return AU4(min(ASU4(a),ASU4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
+ // V_COS_F32.
+ AF1 ANCosF1(AF1 x){return cos(x*AF1_(A_2PI));} // x is multiplied by A_2PI before cos(); A_2PI is defined outside this section (presumably 2*pi, so x counts whole turns - confirm).
+ AF2 ANCosF2(AF2 x){return cos(x*AF2_(A_2PI));}
+ AF3 ANCosF3(AF3 x){return cos(x*AF3_(A_2PI));}
+ AF4 ANCosF4(AF4 x){return cos(x*AF4_(A_2PI));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
+ // V_SIN_F32.
+ AF1 ANSinF1(AF1 x){return sin(x*AF1_(A_2PI));} // Same A_2PI scaling of the input as the normalized cosine, applied before sin().
+ AF2 ANSinF2(AF2 x){return sin(x*AF2_(A_2PI));}
+ AF3 ANSinF3(AF3 x){return sin(x*AF3_(A_2PI));}
+ AF4 ANSinF4(AF4 x){return sin(x*AF4_(A_2PI));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ARcpF1(AF1 x){return AF1_(1.0)/x;} // Per-component reciprocal, written as a plain 1.0/x division.
+ AF2 ARcpF2(AF2 x){return AF2_(1.0)/x;}
+ AF3 ARcpF3(AF3 x){return AF3_(1.0)/x;}
+ AF4 ARcpF4(AF4 x){return AF4_(1.0)/x;}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ARsqF1(AF1 x){return AF1_(1.0)/sqrt(x);} // Per-component reciprocal square root, written as 1.0/sqrt(x) rather than inversesqrt().
+ AF2 ARsqF2(AF2 x){return AF2_(1.0)/sqrt(x);}
+ AF3 ARsqF3(AF3 x){return AF3_(1.0)/sqrt(x);}
+ AF4 ARsqF4(AF4 x){return AF4_(1.0)/sqrt(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ASatF1(AF1 x){return clamp(x,AF1_(0.0),AF1_(1.0));} // Saturate: per-component clamp of x to the range [0.0,1.0].
+ AF2 ASatF2(AF2 x){return clamp(x,AF2_(0.0),AF2_(1.0));}
+ AF3 ASatF3(AF3 x){return clamp(x,AF3_(0.0),AF3_(1.0));}
+ AF4 ASatF4(AF4 x){return clamp(x,AF4_(0.0),AF4_(1.0));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AShrSU1(AU1 a,AU1 b){return AU1(ASU1(a)>>ASU1(b));} // Right shift performed on the signed (ASU*) conversion of both operands, i.e. sign-extending; result converted back to unsigned.
+ AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));}
+ AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));}
+ AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                          GLSL BYTE
+//==============================================================================================================================
+ #ifdef A_BYTE // 8-bit integer types and broadcast helpers, compiled only when A_BYTE is defined.
+  #define AB1 uint8_t // AB1..AB4: 8-bit unsigned integer scalar and vectors.
+  #define AB2 u8vec2
+  #define AB3 u8vec3
+  #define AB4 u8vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define ASB1 int8_t // ASB1..ASB4: 8-bit signed integer scalar and vectors.
+  #define ASB2 i8vec2
+  #define ASB3 i8vec3
+  #define ASB4 i8vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  AB1 AB1_x(AB1 a){return AB1(a);} // Broadcast helpers: replicate one AB1 scalar into every component of the result.
+  AB2 AB2_x(AB1 a){return AB2(a,a);}
+  AB3 AB3_x(AB1 a){return AB3(a,a,a);}
+  AB4 AB4_x(AB1 a){return AB4(a,a,a,a);}
+  #define AB1_(a) AB1_x(AB1(a)) // Wrappers that cast the argument to AB1 before broadcasting.
+  #define AB2_(a) AB2_x(AB1(a))
+  #define AB3_(a) AB3_x(AB1(a))
+  #define AB4_(a) AB4_x(AB1(a))
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                          GLSL HALF
+//==============================================================================================================================
+ #ifdef A_HALF
+  // 16-bit floating-point ("half") scalar and vector type aliases.
+  #define AH1 float16_t
+  #define AH2 f16vec2
+  #define AH3 f16vec3
+  #define AH4 f16vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  // Unsigned 16-bit integer ("word") scalar and vector type aliases.
+  #define AW1 uint16_t
+  #define AW2 u16vec2
+  #define AW3 u16vec3
+  #define AW4 u16vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  // Signed 16-bit integer counterparts of AW*.
+  #define ASW1 int16_t
+  #define ASW2 i16vec2
+  #define ASW3 i16vec3
+  #define ASW4 i16vec4
+//==============================================================================================================================
+  // Unpack: one AU1 yields two 16-bit lanes, one AU2 yields four (x.x supplies the first pair, x.y the second).
+  #define AH2_AU1(x) unpackFloat2x16(AU1(x))
+  AH4 AH4_AU2_x(AU2 x){return AH4(unpackFloat2x16(x.x),unpackFloat2x16(x.y));}
+  #define AH4_AU2(x) AH4_AU2_x(AU2(x))
+  #define AW2_AU1(x) unpackUint2x16(AU1(x))
+  // The four-word case goes through a 64-bit intermediate built with pack64().
+  #define AW4_AU2(x) unpackUint4x16(pack64(AU2(x)))
+//------------------------------------------------------------------------------------------------------------------------------
+  // Pack: the inverse direction, .xy goes into the first AU1 and .zw into the second.
+  #define AU1_AH2(x) packFloat2x16(AH2(x))
+  AU2 AU2_AH4_x(AH4 x){return AU2(packFloat2x16(x.xy),packFloat2x16(x.zw));}
+  #define AU2_AH4(x) AU2_AH4_x(AH4(x))
+  #define AU1_AW2(x) packUint2x16(AW2(x))
+  // The four-word case packs to 64 bits first and then splits with unpack32().
+  #define AU2_AW4(x) unpack32(packUint4x16(AW4(x)))
+//==============================================================================================================================
+  // Half -> word conversions via halfBitsToUint16 (argument is converted to AH* first).
+  #define AW1_AH1(x) halfBitsToUint16(AH1(x))
+  #define AW2_AH2(x) halfBitsToUint16(AH2(x))
+  #define AW3_AH3(x) halfBitsToUint16(AH3(x))
+  #define AW4_AH4(x) halfBitsToUint16(AH4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+  // Word -> half conversions via uint16BitsToHalf (argument is converted to AW* first).
+  #define AH1_AW1(x) uint16BitsToHalf(AW1(x))
+  #define AH2_AW2(x) uint16BitsToHalf(AW2(x))
+  #define AH3_AW3(x) uint16BitsToHalf(AW3(x))
+  #define AH4_AW4(x) uint16BitsToHalf(AW4(x))
+//==============================================================================================================================
+  // Broadcast helpers for halves: replicate one AH1 value into every component of the result.
+  AH1 AH1_x(AH1 a){return a;}
+  AH2 AH2_x(AH1 a){return AH2(a);}
+  AH3 AH3_x(AH1 a){return AH3(a);}
+  AH4 AH4_x(AH1 a){return AH4(a);}
+  // Trailing-underscore macros convert the argument to AH1 first, then broadcast it.
+  #define AH1_(a) AH1_x(AH1(a))
+  #define AH2_(a) AH2_x(AH1(a))
+  #define AH3_(a) AH3_x(AH1(a))
+  #define AH4_(a) AH4_x(AH1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+  // Same pattern for 16-bit words.
+  AW1 AW1_x(AW1 a){return a;}
+  AW2 AW2_x(AW1 a){return AW2(a);}
+  AW3 AW3_x(AW1 a){return AW3(a);}
+  AW4 AW4_x(AW1 a){return AW4(a);}
+  #define AW1_(a) AW1_x(AW1(a))
+  #define AW2_(a) AW2_x(AW1(a))
+  #define AW3_(a) AW3_x(AW1(a))
+  #define AW4_(a) AW4_x(AW1(a))
+//==============================================================================================================================
+  // abs() taken on the ASW* (signed 16-bit) conversion of the word, result converted back to AW*.
+  AW1 AAbsSW1(AW1 a){ASW1 s=ASW1(a);return AW1(abs(s));}
+  AW2 AAbsSW2(AW2 a){ASW2 s=ASW2(a);return AW2(abs(s));}
+  AW3 AAbsSW3(AW3 a){ASW3 s=ASW3(a);return AW3(abs(s));}
+  AW4 AAbsSW4(AW4 a){ASW4 s=ASW4(a);return AW4(abs(s));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Clamp x between n and m using the clamp() builtin.
+  AH1 AClampH1(AH1 x,AH1 n,AH1 m){AH1 r=clamp(x,n,m);return r;}
+  AH2 AClampH2(AH2 x,AH2 n,AH2 m){AH2 r=clamp(x,n,m);return r;}
+  AH3 AClampH3(AH3 x,AH3 n,AH3 m){AH3 r=clamp(x,n,m);return r;}
+  AH4 AClampH4(AH4 x,AH4 n,AH4 m){AH4 r=clamp(x,n,m);return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Fractional part using the fract() builtin.
+  AH1 AFractH1(AH1 x){AH1 r=fract(x);return r;}
+  AH2 AFractH2(AH2 x){AH2 r=fract(x);return r;}
+  AH3 AFractH3(AH3 x){AH3 r=fract(x);return r;}
+  AH4 AFractH4(AH4 x){AH4 r=fract(x);return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Blend from x to y by a using the mix() builtin.
+  AH1 ALerpH1(AH1 x,AH1 y,AH1 a){AH1 r=mix(x,y,a);return r;}
+  AH2 ALerpH2(AH2 x,AH2 y,AH2 a){AH2 r=mix(x,y,a);return r;}
+  AH3 ALerpH3(AH3 x,AH3 y,AH3 a){AH3 r=mix(x,y,a);return r;}
+  AH4 ALerpH4(AH4 x,AH4 y,AH4 a){AH4 r=mix(x,y,a);return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // No packed version of max3.
+  // Three-way maximum built from two max() calls (y and z first, then x).
+  AH1 AMax3H1(AH1 x,AH1 y,AH1 z){AH1 t=max(y,z);return max(x,t);}
+  AH2 AMax3H2(AH2 x,AH2 y,AH2 z){AH2 t=max(y,z);return max(x,t);}
+  AH3 AMax3H3(AH3 x,AH3 y,AH3 z){AH3 t=max(y,z);return max(x,t);}
+  AH4 AMax3H4(AH4 x,AH4 y,AH4 z){AH4 t=max(y,z);return max(x,t);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Signed maximum of 16-bit words: both operands are converted to the signed 16-bit ASW* types,
+  // compared with max(), and the winner is converted back to AW*.
+  // The comparison must use ASW* rather than the 32-bit ASU* types: converting a 16-bit unsigned
+  // word to a 32-bit int widens its value, so words with the top bit set would compare as large
+  // positive numbers and the result would be an unsigned maximum.
+  AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASW1(a),ASW1(b)));}
+  AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASW2(a),ASW2(b)));}
+  AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASW3(a),ASW3(b)));}
+  AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASW4(a),ASW4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // No packed version of min3.
+  // Three-way minimum built from two min() calls (y and z first, then x).
+  AH1 AMin3H1(AH1 x,AH1 y,AH1 z){AH1 t=min(y,z);return min(x,t);}
+  AH2 AMin3H2(AH2 x,AH2 y,AH2 z){AH2 t=min(y,z);return min(x,t);}
+  AH3 AMin3H3(AH3 x,AH3 y,AH3 z){AH3 t=min(y,z);return min(x,t);}
+  AH4 AMin3H4(AH4 x,AH4 y,AH4 z){AH4 t=min(y,z);return min(x,t);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Signed minimum of 16-bit words: both operands are converted to the signed 16-bit ASW* types,
+  // compared with min(), and the winner is converted back to AW*.
+  // The comparison must use ASW* rather than the 32-bit ASU* types: converting a 16-bit unsigned
+  // word to a 32-bit int widens its value, so words with the top bit set would compare as large
+  // positive numbers and the result would be an unsigned minimum.
+  AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASW1(a),ASW1(b)));}
+  AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASW2(a),ASW2(b)));}
+  AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASW3(a),ASW3(b)));}
+  AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASW4(a),ASW4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Reciprocal: a broadcast 1.0 divided by x.
+  AH1 ARcpH1(AH1 x){AH1 one=AH1_(1.0);return one/x;}
+  AH2 ARcpH2(AH2 x){AH2 one=AH2_(1.0);return one/x;}
+  AH3 ARcpH3(AH3 x){AH3 one=AH3_(1.0);return one/x;}
+  AH4 ARcpH4(AH4 x){AH4 one=AH4_(1.0);return one/x;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Reciprocal square root: a broadcast 1.0 divided by sqrt(x).
+  AH1 ARsqH1(AH1 x){AH1 one=AH1_(1.0);return one/sqrt(x);}
+  AH2 ARsqH2(AH2 x){AH2 one=AH2_(1.0);return one/sqrt(x);}
+  AH3 ARsqH3(AH3 x){AH3 one=AH3_(1.0);return one/sqrt(x);}
+  AH4 ARsqH4(AH4 x){AH4 one=AH4_(1.0);return one/sqrt(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Saturate: clamp x between 0.0 and 1.0.
+  AH1 ASatH1(AH1 x){AH1 lo=AH1_(0.0);AH1 hi=AH1_(1.0);return clamp(x,lo,hi);}
+  AH2 ASatH2(AH2 x){AH2 lo=AH2_(0.0);AH2 hi=AH2_(1.0);return clamp(x,lo,hi);}
+  AH3 ASatH3(AH3 x){AH3 lo=AH3_(0.0);AH3 hi=AH3_(1.0);return clamp(x,lo,hi);}
+  AH4 ASatH4(AH4 x){AH4 lo=AH4_(0.0);AH4 hi=AH4_(1.0);return clamp(x,lo,hi);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Right shift evaluated on the ASW* (signed 16-bit) conversions of both operands, result converted back to AW*.
+  AW1 AShrSW1(AW1 a,AW1 b){ASW1 v=ASW1(a);ASW1 n=ASW1(b);return AW1(v>>n);}
+  AW2 AShrSW2(AW2 a,AW2 b){ASW2 v=ASW2(a);ASW2 n=ASW2(b);return AW2(v>>n);}
+  AW3 AShrSW3(AW3 a,AW3 b){ASW3 v=ASW3(a);ASW3 n=ASW3(b);return AW3(v>>n);}
+  AW4 AShrSW4(AW4 a,AW4 b){ASW4 v=ASW4(a);ASW4 n=ASW4(b);return AW4(v>>n);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                         GLSL DOUBLE
+//==============================================================================================================================
+ #ifdef A_DUBL
+  // Double-precision scalar and vector type aliases.
+  #define AD1 double
+  #define AD2 dvec2
+  #define AD3 dvec3
+  #define AD4 dvec4
+//------------------------------------------------------------------------------------------------------------------------------
+  // Broadcast helpers: replicate one AD1 value into every component of the result.
+  AD1 AD1_x(AD1 a){return a;}
+  AD2 AD2_x(AD1 a){return AD2(a);}
+  AD3 AD3_x(AD1 a){return AD3(a);}
+  AD4 AD4_x(AD1 a){return AD4(a);}
+  // Trailing-underscore macros convert the argument to AD1 first, then broadcast it.
+  #define AD1_(a) AD1_x(AD1(a))
+  #define AD2_(a) AD2_x(AD1(a))
+  #define AD3_(a) AD3_x(AD1(a))
+  #define AD4_(a) AD4_x(AD1(a))
+//==============================================================================================================================
+  // Fractional part using the fract() builtin.
+  AD1 AFractD1(AD1 x){AD1 r=fract(x);return r;}
+  AD2 AFractD2(AD2 x){AD2 r=fract(x);return r;}
+  AD3 AFractD3(AD3 x){AD3 r=fract(x);return r;}
+  AD4 AFractD4(AD4 x){AD4 r=fract(x);return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Blend from x to y by a using the mix() builtin.
+  AD1 ALerpD1(AD1 x,AD1 y,AD1 a){AD1 r=mix(x,y,a);return r;}
+  AD2 ALerpD2(AD2 x,AD2 y,AD2 a){AD2 r=mix(x,y,a);return r;}
+  AD3 ALerpD3(AD3 x,AD3 y,AD3 a){AD3 r=mix(x,y,a);return r;}
+  AD4 ALerpD4(AD4 x,AD4 y,AD4 a){AD4 r=mix(x,y,a);return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Reciprocal: a broadcast 1.0 divided by x.
+  AD1 ARcpD1(AD1 x){AD1 one=AD1_(1.0);return one/x;}
+  AD2 ARcpD2(AD2 x){AD2 one=AD2_(1.0);return one/x;}
+  AD3 ARcpD3(AD3 x){AD3 one=AD3_(1.0);return one/x;}
+  AD4 ARcpD4(AD4 x){AD4 one=AD4_(1.0);return one/x;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Reciprocal square root: a broadcast 1.0 divided by sqrt(x).
+  AD1 ARsqD1(AD1 x){AD1 one=AD1_(1.0);return one/sqrt(x);}
+  AD2 ARsqD2(AD2 x){AD2 one=AD2_(1.0);return one/sqrt(x);}
+  AD3 ARsqD3(AD3 x){AD3 one=AD3_(1.0);return one/sqrt(x);}
+  AD4 ARsqD4(AD4 x){AD4 one=AD4_(1.0);return one/sqrt(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Saturate: clamp x between 0.0 and 1.0.
+  AD1 ASatD1(AD1 x){AD1 lo=AD1_(0.0);AD1 hi=AD1_(1.0);return clamp(x,lo,hi);}
+  AD2 ASatD2(AD2 x){AD2 lo=AD2_(0.0);AD2 hi=AD2_(1.0);return clamp(x,lo,hi);}
+  AD3 ASatD3(AD3 x){AD3 lo=AD3_(0.0);AD3 hi=AD3_(1.0);return clamp(x,lo,hi);}
+  AD4 ASatD4(AD4 x){AD4 lo=AD4_(0.0);AD4 hi=AD4_(1.0);return clamp(x,lo,hi);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                         GLSL LONG
+//==============================================================================================================================
+ #ifdef A_LONG
+  // Unsigned 64-bit scalar and vector type aliases.
+  #define AL1 uint64_t
+  #define AL2 u64vec2
+  #define AL3 u64vec3
+  #define AL4 u64vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  // Signed 64-bit counterparts.
+  #define ASL1 int64_t
+  #define ASL2 i64vec2
+  #define ASL3 i64vec3
+  #define ASL4 i64vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  // Conversions between one 64-bit value and a pair of 32-bit values.
+  #define AL1_AU2(x) packUint2x32(AU2(x))
+  #define AU2_AL1(x) unpackUint2x32(AL1(x))
+//------------------------------------------------------------------------------------------------------------------------------
+  // Broadcast helpers: replicate one AL1 value into every component of the result.
+  AL1 AL1_x(AL1 a){return a;}
+  AL2 AL2_x(AL1 a){return AL2(a);}
+  AL3 AL3_x(AL1 a){return AL3(a);}
+  AL4 AL4_x(AL1 a){return AL4(a);}
+  // Trailing-underscore macros convert the argument to AL1 first, then broadcast it.
+  #define AL1_(a) AL1_x(AL1(a))
+  #define AL2_(a) AL2_x(AL1(a))
+  #define AL3_(a) AL3_x(AL1(a))
+  #define AL4_(a) AL4_x(AL1(a))
+//==============================================================================================================================
+  // abs() taken on the ASL* (signed 64-bit) conversion of the value, result converted back to AL*.
+  AL1 AAbsSL1(AL1 a){ASL1 s=ASL1(a);return AL1(abs(s));}
+  AL2 AAbsSL2(AL2 a){ASL2 s=ASL2(a);return AL2(abs(s));}
+  AL3 AAbsSL3(AL3 a){ASL3 s=ASL3(a);return AL3(abs(s));}
+  AL4 AAbsSL4(AL4 a){ASL4 s=ASL4(a);return AL4(abs(s));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Signed maximum of 64-bit values: both operands are converted to the signed 64-bit ASL* types,
+  // compared with max(), and the winner is converted back to AL*.
+  // The comparison must use ASL* rather than the 32-bit ASU* types: converting through a 32-bit
+  // int would drop the upper 32 bits of each operand before the comparison and of the result.
+  AL1 AMaxSL1(AL1 a,AL1 b){return AL1(max(ASL1(a),ASL1(b)));}
+  AL2 AMaxSL2(AL2 a,AL2 b){return AL2(max(ASL2(a),ASL2(b)));}
+  AL3 AMaxSL3(AL3 a,AL3 b){return AL3(max(ASL3(a),ASL3(b)));}
+  AL4 AMaxSL4(AL4 a,AL4 b){return AL4(max(ASL4(a),ASL4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Signed minimum of 64-bit values: both operands are converted to the signed 64-bit ASL* types,
+  // compared with min(), and the winner is converted back to AL*.
+  // The comparison must use ASL* rather than the 32-bit ASU* types: converting through a 32-bit
+  // int would drop the upper 32 bits of each operand before the comparison and of the result.
+  AL1 AMinSL1(AL1 a,AL1 b){return AL1(min(ASL1(a),ASL1(b)));}
+  AL2 AMinSL2(AL2 a,AL2 b){return AL2(min(ASL2(a),ASL2(b)));}
+  AL3 AMinSL3(AL3 a,AL3 b){return AL3(min(ASL3(a),ASL3(b)));}
+  AL4 AMinSL4(AL4 a,AL4 b){return AL4(min(ASL4(a),ASL4(b)));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                      WAVE OPERATIONS
+//==============================================================================================================================
+ #ifdef A_WAVE
+  // Thin wrappers over subgroupShuffleXor() for each float and uint width.
+  // Where 'x' must be a compile time literal.
+  AF1 AWaveXorF1(AF1 v,AU1 x){AF1 r=subgroupShuffleXor(v,x);return r;}
+  AF2 AWaveXorF2(AF2 v,AU1 x){AF2 r=subgroupShuffleXor(v,x);return r;}
+  AF3 AWaveXorF3(AF3 v,AU1 x){AF3 r=subgroupShuffleXor(v,x);return r;}
+  AF4 AWaveXorF4(AF4 v,AU1 x){AF4 r=subgroupShuffleXor(v,x);return r;}
+  AU1 AWaveXorU1(AU1 v,AU1 x){AU1 r=subgroupShuffleXor(v,x);return r;}
+  AU2 AWaveXorU2(AU2 v,AU1 x){AU2 r=subgroupShuffleXor(v,x);return r;}
+  AU3 AWaveXorU3(AU3 v,AU1 x){AU3 r=subgroupShuffleXor(v,x);return r;}
+  AU4 AWaveXorU4(AU4 v,AU1 x){AU4 r=subgroupShuffleXor(v,x);return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  #ifdef A_HALF
+   // 16-bit variants: pack to 32-bit words, shuffle the packed value, then unpack again.
+   AH2 AWaveXorH2(AH2 v,AU1 x){AU1 p=AU1_AH2(v);return AH2_AU1(subgroupShuffleXor(p,x));}
+   AH4 AWaveXorH4(AH4 v,AU1 x){AU2 p=AU2_AH4(v);return AH4_AU2(subgroupShuffleXor(p,x));}
+   AW2 AWaveXorW2(AW2 v,AU1 x){AU1 p=AU1_AW2(v);return AW2_AU1(subgroupShuffleXor(p,x));}
+   AW4 AWaveXorW4(AW4 v,AU1 x){AU2 p=AU2_AW4(v);return AW4_AU2(subgroupShuffleXor(p,x));}
+  #endif
+ #endif
+//==============================================================================================================================
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+//                                                            HLSL
+//
+//
+//==============================================================================================================================
+#if defined(A_HLSL) && defined(A_GPU)
+ // Base type aliases for HLSL. With A_HLSL_6_2 defined the explicitly sized 32-bit type names are used;
+ // otherwise the classic float/uint/int names are used. The boolean aliases are the same in both branches.
+ #ifdef A_HLSL_6_2
+  // Boolean scalar and vectors.
+  #define AP1 bool
+  #define AP2 bool2
+  #define AP3 bool3
+  #define AP4 bool4
+//------------------------------------------------------------------------------------------------------------------------------
+  // 32-bit float scalar and vectors.
+  #define AF1 float32_t
+  #define AF2 float32_t2
+  #define AF3 float32_t3
+  #define AF4 float32_t4
+//------------------------------------------------------------------------------------------------------------------------------
+  // 32-bit unsigned integer scalar and vectors.
+  #define AU1 uint32_t
+  #define AU2 uint32_t2
+  #define AU3 uint32_t3
+  #define AU4 uint32_t4
+//------------------------------------------------------------------------------------------------------------------------------
+  // 32-bit signed integer scalar and vectors.
+  #define ASU1 int32_t
+  #define ASU2 int32_t2
+  #define ASU3 int32_t3
+  #define ASU4 int32_t4
+ #else
+  // Boolean scalar and vectors.
+  #define AP1 bool
+  #define AP2 bool2
+  #define AP3 bool3
+  #define AP4 bool4
+//------------------------------------------------------------------------------------------------------------------------------
+  // Float scalar and vectors.
+  #define AF1 float
+  #define AF2 float2
+  #define AF3 float3
+  #define AF4 float4
+//------------------------------------------------------------------------------------------------------------------------------
+  // Unsigned integer scalar and vectors.
+  #define AU1 uint
+  #define AU2 uint2
+  #define AU3 uint3
+  #define AU4 uint4
+//------------------------------------------------------------------------------------------------------------------------------
+  // Signed integer scalar and vectors.
+  #define ASU1 int
+  #define ASU2 int2
+  #define ASU3 int3
+  #define ASU4 int4
+ #endif
+//==============================================================================================================================
+ // uint -> float conversions through asfloat() (argument is converted to AU* first).
+ #define AF1_AU1(x) asfloat(AU1(x))
+ #define AF2_AU2(x) asfloat(AU2(x))
+ #define AF3_AU3(x) asfloat(AU3(x))
+ #define AF4_AU4(x) asfloat(AU4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+ // float -> uint conversions through asuint() (argument is converted to AF* first).
+ #define AU1_AF1(x) asuint(AF1(x))
+ #define AU2_AF2(x) asuint(AF2(x))
+ #define AU3_AF3(x) asuint(AF3(x))
+ #define AU4_AF4(x) asuint(AF4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+ // One float converted with f32tof16() and returned as an AU1.
+ AU1 AU1_AH1_AF1_x(AF1 a){AU1 h=f32tof16(a);return h;}
+ #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ // Two floats converted with f32tof16(); the a.y result is shifted left by 16 and OR-ed with the a.x result.
+ AU1 AU1_AH2_AF2_x(AF2 a){AU1 lo=f32tof16(a.x);AU1 hi=f32tof16(a.y)<<16;return lo|hi;}
+ #define AU1_AH2_AF2(a) AU1_AH2_AF2_x(AF2(a))
+ #define AU1_AB4Unorm_AF4(x) D3DCOLORtoUBYTE4(AF4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+ // Inverse of the packing above: low 16 bits feed .x, bits shifted down from the top feed .y, both via f16tof32().
+ AF2 AF2_AH2_AU1_x(AU1 x){AF1 lo=f16tof32(x&0xFFFF);AF1 hi=f16tof32(x>>16);return AF2(lo,hi);}
+ #define AF2_AH2_AU1(x) AF2_AH2_AU1_x(AU1(x))
+//==============================================================================================================================
+ // Broadcast helpers for floats: replicate one AF1 value into every component (scalar swizzle).
+ AF1 AF1_x(AF1 a){return a;}
+ AF2 AF2_x(AF1 a){return a.xx;}
+ AF3 AF3_x(AF1 a){return a.xxx;}
+ AF4 AF4_x(AF1 a){return a.xxxx;}
+ // Trailing-underscore macros convert the argument to AF1 first, then broadcast it.
+ #define AF1_(a) AF1_x(AF1(a))
+ #define AF2_(a) AF2_x(AF1(a))
+ #define AF3_(a) AF3_x(AF1(a))
+ #define AF4_(a) AF4_x(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ // Same pattern for unsigned integers.
+ AU1 AU1_x(AU1 a){return a;}
+ AU2 AU2_x(AU1 a){return a.xx;}
+ AU3 AU3_x(AU1 a){return a.xxx;}
+ AU4 AU4_x(AU1 a){return a.xxxx;}
+ #define AU1_(a) AU1_x(AU1(a))
+ #define AU2_(a) AU2_x(AU1(a))
+ #define AU3_(a) AU3_x(AU1(a))
+ #define AU4_(a) AU4_x(AU1(a))
+//==============================================================================================================================
+ // abs() taken on the ASU* (signed) conversion of the value, result converted back to AU*.
+ AU1 AAbsSU1(AU1 a){ASU1 s=ASU1(a);return AU1(abs(s));}
+ AU2 AAbsSU2(AU2 a){ASU2 s=ASU2(a);return AU2(abs(s));}
+ AU3 AAbsSU3(AU3 a){ASU3 s=ASU3(a);return AU3(abs(s));}
+ AU4 AAbsSU4(AU4 a){ASU4 s=ASU4(a);return AU4(abs(s));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Bit-field helpers. ABfe shifts src down by off and keeps the low 'bits' bits.
+ // ABfi takes the bits of ins selected by mask and the bits of src outside mask.
+ // ABfiM is ABfi with the mask built as the low 'bits' bits.
+ // NOTE(review): 'bits' is used directly as a shift count; confirm the intended result for bits==32.
+ AU1 ABfe(AU1 src,AU1 off,AU1 bits){AU1 shifted=src>>off;return shifted&((1u<<bits)-1);}
+ AU1 ABfi(AU1 src,AU1 ins,AU1 mask){AU1 taken=ins&mask;AU1 kept=src&(~mask);return taken|kept;}
+ AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){AU1 m=(1u<<bits)-1;AU1 taken=ins&m;AU1 kept=src&(~m);return taken|kept;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Clamp built from min() against m followed by max() against n.
+ AF1 AClampF1(AF1 x,AF1 n,AF1 m){AF1 t=min(x,m);return max(n,t);}
+ AF2 AClampF2(AF2 x,AF2 n,AF2 m){AF2 t=min(x,m);return max(n,t);}
+ AF3 AClampF3(AF3 x,AF3 n,AF3 m){AF3 t=min(x,m);return max(n,t);}
+ AF4 AClampF4(AF4 x,AF4 n,AF4 m){AF4 t=min(x,m);return max(n,t);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Fractional part computed as x minus floor(x).
+ AF1 AFractF1(AF1 x){AF1 f=floor(x);return x-f;}
+ AF2 AFractF2(AF2 x){AF2 f=floor(x);return x-f;}
+ AF3 AFractF3(AF3 x){AF3 f=floor(x);return x-f;}
+ AF4 AFractF4(AF4 x){AF4 f=floor(x);return x-f;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Blend from x to y by a using the lerp() intrinsic.
+ AF1 ALerpF1(AF1 x,AF1 y,AF1 a){AF1 r=lerp(x,y,a);return r;}
+ AF2 ALerpF2(AF2 x,AF2 y,AF2 a){AF2 r=lerp(x,y,a);return r;}
+ AF3 ALerpF3(AF3 x,AF3 y,AF3 a){AF3 r=lerp(x,y,a);return r;}
+ AF4 ALerpF4(AF4 x,AF4 y,AF4 a){AF4 r=lerp(x,y,a);return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Three-way maximum built from two max() calls (y and z first, then x).
+ AF1 AMax3F1(AF1 x,AF1 y,AF1 z){AF1 t=max(y,z);return max(x,t);}
+ AF2 AMax3F2(AF2 x,AF2 y,AF2 z){AF2 t=max(y,z);return max(x,t);}
+ AF3 AMax3F3(AF3 x,AF3 y,AF3 z){AF3 t=max(y,z);return max(x,t);}
+ AF4 AMax3F4(AF4 x,AF4 y,AF4 z){AF4 t=max(y,z);return max(x,t);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Three-way maximum evaluated on the ASU* (signed) conversions, result converted back to AU*.
+ AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){ASU1 t=max(ASU1(y),ASU1(z));return AU1(max(ASU1(x),t));}
+ AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){ASU2 t=max(ASU2(y),ASU2(z));return AU2(max(ASU2(x),t));}
+ AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){ASU3 t=max(ASU3(y),ASU3(z));return AU3(max(ASU3(x),t));}
+ AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){ASU4 t=max(ASU4(y),ASU4(z));return AU4(max(ASU4(x),t));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Three-way maximum on the unsigned values directly.
+ AU1 AMax3U1(AU1 x,AU1 y,AU1 z){AU1 t=max(y,z);return max(x,t);}
+ AU2 AMax3U2(AU2 x,AU2 y,AU2 z){AU2 t=max(y,z);return max(x,t);}
+ AU3 AMax3U3(AU3 x,AU3 y,AU3 z){AU3 t=max(y,z);return max(x,t);}
+ AU4 AMax3U4(AU4 x,AU4 y,AU4 z){AU4 t=max(y,z);return max(x,t);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Two-way maximum evaluated on the ASU* (signed) conversions, result converted back to AU*.
+ AU1 AMaxSU1(AU1 a,AU1 b){ASU1 p=ASU1(a);ASU1 q=ASU1(b);return AU1(max(p,q));}
+ AU2 AMaxSU2(AU2 a,AU2 b){ASU2 p=ASU2(a);ASU2 q=ASU2(b);return AU2(max(p,q));}
+ AU3 AMaxSU3(AU3 a,AU3 b){ASU3 p=ASU3(a);ASU3 q=ASU3(b);return AU3(max(p,q));}
+ AU4 AMaxSU4(AU4 a,AU4 b){ASU4 p=ASU4(a);ASU4 q=ASU4(b);return AU4(max(p,q));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Median of three: max of min(x,y) and min(max(x,y),z).
+ AF1 AMed3F1(AF1 x,AF1 y,AF1 z){AF1 lo=min(x,y);AF1 hi=max(x,y);return max(lo,min(hi,z));}
+ AF2 AMed3F2(AF2 x,AF2 y,AF2 z){AF2 lo=min(x,y);AF2 hi=max(x,y);return max(lo,min(hi,z));}
+ AF3 AMed3F3(AF3 x,AF3 y,AF3 z){AF3 lo=min(x,y);AF3 hi=max(x,y);return max(lo,min(hi,z));}
+ AF4 AMed3F4(AF4 x,AF4 y,AF4 z){AF4 lo=min(x,y);AF4 hi=max(x,y);return max(lo,min(hi,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Three-way minimum built from two min() calls (y and z first, then x).
+ AF1 AMin3F1(AF1 x,AF1 y,AF1 z){AF1 t=min(y,z);return min(x,t);}
+ AF2 AMin3F2(AF2 x,AF2 y,AF2 z){AF2 t=min(y,z);return min(x,t);}
+ AF3 AMin3F3(AF3 x,AF3 y,AF3 z){AF3 t=min(y,z);return min(x,t);}
+ AF4 AMin3F4(AF4 x,AF4 y,AF4 z){AF4 t=min(y,z);return min(x,t);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Three-way minimum evaluated on the ASU* (signed) conversions, result converted back to AU*.
+ AU1 AMin3SU1(AU1 x,AU1 y,AU1 z){ASU1 t=min(ASU1(y),ASU1(z));return AU1(min(ASU1(x),t));}
+ AU2 AMin3SU2(AU2 x,AU2 y,AU2 z){ASU2 t=min(ASU2(y),ASU2(z));return AU2(min(ASU2(x),t));}
+ AU3 AMin3SU3(AU3 x,AU3 y,AU3 z){ASU3 t=min(ASU3(y),ASU3(z));return AU3(min(ASU3(x),t));}
+ AU4 AMin3SU4(AU4 x,AU4 y,AU4 z){ASU4 t=min(ASU4(y),ASU4(z));return AU4(min(ASU4(x),t));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Three-way minimum on the unsigned values directly.
+ AU1 AMin3U1(AU1 x,AU1 y,AU1 z){AU1 t=min(y,z);return min(x,t);}
+ AU2 AMin3U2(AU2 x,AU2 y,AU2 z){AU2 t=min(y,z);return min(x,t);}
+ AU3 AMin3U3(AU3 x,AU3 y,AU3 z){AU3 t=min(y,z);return min(x,t);}
+ AU4 AMin3U4(AU4 x,AU4 y,AU4 z){AU4 t=min(y,z);return min(x,t);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Two-way minimum evaluated on the ASU* (signed) conversions, result converted back to AU*.
+ AU1 AMinSU1(AU1 a,AU1 b){ASU1 p=ASU1(a);ASU1 q=ASU1(b);return AU1(min(p,q));}
+ AU2 AMinSU2(AU2 a,AU2 b){ASU2 p=ASU2(a);ASU2 q=ASU2(b);return AU2(min(p,q));}
+ AU3 AMinSU3(AU3 a,AU3 b){ASU3 p=ASU3(a);ASU3 q=ASU3(b);return AU3(min(p,q));}
+ AU4 AMinSU4(AU4 a,AU4 b){ASU4 p=ASU4(a);ASU4 q=ASU4(b);return AU4(min(p,q));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Cosine of x scaled by A_2PI.
+ AF1 ANCosF1(AF1 x){AF1 k=AF1_(A_2PI);return cos(x*k);}
+ AF2 ANCosF2(AF2 x){AF2 k=AF2_(A_2PI);return cos(x*k);}
+ AF3 ANCosF3(AF3 x){AF3 k=AF3_(A_2PI);return cos(x*k);}
+ AF4 ANCosF4(AF4 x){AF4 k=AF4_(A_2PI);return cos(x*k);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Sine of x scaled by A_2PI.
+ AF1 ANSinF1(AF1 x){AF1 k=AF1_(A_2PI);return sin(x*k);}
+ AF2 ANSinF2(AF2 x){AF2 k=AF2_(A_2PI);return sin(x*k);}
+ AF3 ANSinF3(AF3 x){AF3 k=AF3_(A_2PI);return sin(x*k);}
+ AF4 ANSinF4(AF4 x){AF4 k=AF4_(A_2PI);return sin(x*k);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Reciprocal using the rcp() intrinsic.
+ AF1 ARcpF1(AF1 x){AF1 r=rcp(x);return r;}
+ AF2 ARcpF2(AF2 x){AF2 r=rcp(x);return r;}
+ AF3 ARcpF3(AF3 x){AF3 r=rcp(x);return r;}
+ AF4 ARcpF4(AF4 x){AF4 r=rcp(x);return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Reciprocal square root using the rsqrt() intrinsic.
+ AF1 ARsqF1(AF1 x){AF1 r=rsqrt(x);return r;}
+ AF2 ARsqF2(AF2 x){AF2 r=rsqrt(x);return r;}
+ AF3 ARsqF3(AF3 x){AF3 r=rsqrt(x);return r;}
+ AF4 ARsqF4(AF4 x){AF4 r=rsqrt(x);return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Saturate using the saturate() intrinsic.
+ AF1 ASatF1(AF1 x){AF1 r=saturate(x);return r;}
+ AF2 ASatF2(AF2 x){AF2 r=saturate(x);return r;}
+ AF3 ASatF3(AF3 x){AF3 r=saturate(x);return r;}
+ AF4 ASatF4(AF4 x){AF4 r=saturate(x);return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Right shift evaluated on the ASU* (signed) conversions of both operands, result converted back to AU*.
+ AU1 AShrSU1(AU1 a,AU1 b){ASU1 v=ASU1(a);ASU1 n=ASU1(b);return AU1(v>>n);}
+ AU2 AShrSU2(AU2 a,AU2 b){ASU2 v=ASU2(a);ASU2 n=ASU2(b);return AU2(v>>n);}
+ AU3 AShrSU3(AU3 a,AU3 b){ASU3 v=ASU3(a);ASU3 n=ASU3(b);return AU3(v>>n);}
+ AU4 AShrSU4(AU4 a,AU4 b){ASU4 v=ASU4(a);ASU4 n=ASU4(b);return AU4(v>>n);}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                          HLSL BYTE
+//==============================================================================================================================
+ #ifdef A_BYTE
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                          HLSL HALF
+//==============================================================================================================================
+ #ifdef A_HALF
+  // 16-bit type aliases for HLSL. With A_HLSL_6_2 defined the explicitly sized 16-bit type names are used;
+  // otherwise the min16float/min16uint/min16int type names are used.
+  #ifdef A_HLSL_6_2
+   // Half scalar and vectors.
+   #define AH1 float16_t
+   #define AH2 float16_t2
+   #define AH3 float16_t3
+   #define AH4 float16_t4
+//------------------------------------------------------------------------------------------------------------------------------
+   // Unsigned 16-bit word scalar and vectors.
+   #define AW1 uint16_t
+   #define AW2 uint16_t2
+   #define AW3 uint16_t3
+   #define AW4 uint16_t4
+//------------------------------------------------------------------------------------------------------------------------------
+   // Signed 16-bit word scalar and vectors.
+   #define ASW1 int16_t
+   #define ASW2 int16_t2
+   #define ASW3 int16_t3
+   #define ASW4 int16_t4
+  #else
+   // Half scalar and vectors.
+   #define AH1 min16float
+   #define AH2 min16float2
+   #define AH3 min16float3
+   #define AH4 min16float4
+//------------------------------------------------------------------------------------------------------------------------------
+   // Unsigned word scalar and vectors.
+   #define AW1 min16uint
+   #define AW2 min16uint2
+   #define AW3 min16uint3
+   #define AW4 min16uint4
+//------------------------------------------------------------------------------------------------------------------------------
+   // Signed word scalar and vectors.
+   #define ASW1 min16int
+   #define ASW2 min16int2
+   #define ASW3 min16int3
+   #define ASW4 min16int4
+  #endif
+//==============================================================================================================================
+  // Need to use manual unpack to get optimal execution (don't use packed types in buffers directly).
+  // Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/
+  // AH2_AU1_x: split x into its low 16 bits and the bits above them, convert both with f16tof32(), then convert to AH2.
+  AH2 AH2_AU1_x(AU1 x){AF2 t=f16tof32(AU2(x&0xFFFF,x>>16));return AH2(t);}
+  // AH4_AU2_x: x.x supplies the first two lanes, x.y the last two.
+  AH4 AH4_AU2_x(AU2 x){return AH4(AH2_AU1_x(x.x),AH2_AU1_x(x.y));}
+  // AW2_AU1_x / AW4_AU2_x: same split for integer words, without the float conversion.
+  AW2 AW2_AU1_x(AU1 x){AU2 t=AU2(x&0xFFFF,x>>16);return AW2(t);}
+  AW4 AW4_AU2_x(AU2 x){return AW4(AW2_AU1_x(x.x),AW2_AU1_x(x.y));}
+  // Macro front ends convert the argument to AU1/AU2 before calling the helper.
+  #define AH2_AU1(x) AH2_AU1_x(AU1(x))
+  #define AH4_AU2(x) AH4_AU2_x(AU2(x))
+  #define AW2_AU1(x) AW2_AU1_x(AU1(x))
+  #define AW4_AU2(x) AW4_AU2_x(AU2(x))
+//------------------------------------------------------------------------------------------------------------------------------
+  // Pack direction: the second component is shifted left by 16 and added to the first.
+  AU1 AU1_AH2_x(AH2 x){return f32tof16(x.x)+(f32tof16(x.y)<<16);}
+  AU2 AU2_AH4_x(AH4 x){return AU2(AU1_AH2_x(x.xy),AU1_AH2_x(x.zw));}
+  AU1 AU1_AW2_x(AW2 x){return AU1(x.x)+(AU1(x.y)<<16);}
+  AU2 AU2_AW4_x(AW4 x){return AU2(AU1_AW2_x(x.xy),AU1_AW2_x(x.zw));}
+  // Macro front ends convert the argument to the 16-bit vector type before calling the helper.
+  #define AU1_AH2(x) AU1_AH2_x(AH2(x))
+  #define AU2_AH4(x) AU2_AH4_x(AH4(x))
+  #define AU1_AW2(x) AU1_AW2_x(AW2(x))
+  #define AU2_AW4(x) AU2_AW4_x(AW4(x))
+//==============================================================================================================================
+  // Half -> word conversions. With A_HLSL_6_2 defined and A_NO_16_BIT_CAST not defined, asuint16() is used
+  // directly; otherwise each component is converted to AF1, passed through f32tof16(), and converted to AW1.
+  #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST)
+   #define AW1_AH1(x) asuint16(x)
+   #define AW2_AH2(x) asuint16(x)
+   #define AW3_AH3(x) asuint16(x)
+   #define AW4_AH4(x) asuint16(x)
+  #else
+   #define AW1_AH1(a) AW1(f32tof16(AF1(a)))
+   #define AW2_AH2(a) AW2(AW1_AH1((a).x),AW1_AH1((a).y))
+   #define AW3_AH3(a) AW3(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z))
+   #define AW4_AH4(a) AW4(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z),AW1_AH1((a).w))
+  #endif
+//------------------------------------------------------------------------------------------------------------------------------
+  // Word -> half conversions. Same selection as above: asfloat16() when available, otherwise each component
+  // is converted to AU1, passed through f16tof32(), and converted to AH1.
+  #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST)
+   #define AH1_AW1(x) asfloat16(x)
+   #define AH2_AW2(x) asfloat16(x)
+   #define AH3_AW3(x) asfloat16(x)
+   #define AH4_AW4(x) asfloat16(x)
+  #else
+   #define AH1_AW1(a) AH1(f16tof32(AU1(a)))
+   #define AH2_AW2(a) AH2(AH1_AW1((a).x),AH1_AW1((a).y))
+   #define AH3_AW3(a) AH3(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z))
+   #define AH4_AW4(a) AH4(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z),AH1_AW1((a).w))
+  #endif
+//==============================================================================================================================
+  // Splat helpers: replicate one AH1 scalar into every component of the returned half vector.
+  AH1 AH1_x(AH1 a){return AH1(a);}
+  AH2 AH2_x(AH1 a){return AH2(a,a);}
+  AH3 AH3_x(AH1 a){return AH3(a,a,a);}
+  AH4 AH4_x(AH1 a){return AH4(a,a,a,a);}
+  // Macro front ends: cast the argument to AH1 before calling the helper.
+  #define AH1_(a) AH1_x(AH1(a))
+  #define AH2_(a) AH2_x(AH1(a))
+  #define AH3_(a) AH3_x(AH1(a))
+  #define AH4_(a) AH4_x(AH1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+  // Splat helpers: replicate one AW1 scalar into every component of the returned 16-bit word vector.
+  AW1 AW1_x(AW1 a){return AW1(a);}
+  AW2 AW2_x(AW1 a){return AW2(a,a);}
+  AW3 AW3_x(AW1 a){return AW3(a,a,a);}
+  AW4 AW4_x(AW1 a){return AW4(a,a,a,a);}
+  // Macro front ends: cast the argument to AW1 before calling the helper.
+  #define AW1_(a) AW1_x(AW1(a))
+  #define AW2_(a) AW2_x(AW1(a))
+  #define AW3_(a) AW3_x(AW1(a))
+  #define AW4_(a) AW4_x(AW1(a))
+//==============================================================================================================================
+  // Absolute value of each 16-bit word taken through the signed ASW* type; the result is cast back to AW*.
+  AW1 AAbsSW1(AW1 a){ASW1 s=ASW1(a);return AW1(abs(s));}
+  AW2 AAbsSW2(AW2 a){ASW2 s=ASW2(a);return AW2(abs(s));}
+  AW3 AAbsSW3(AW3 a){ASW3 s=ASW3(a);return AW3(abs(s));}
+  AW4 AAbsSW4(AW4 a){ASW4 s=ASW4(a);return AW4(abs(s));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Clamp 'x' per component: first cap it at 'm', then raise it to at least 'n'.
+  AH1 AClampH1(AH1 x,AH1 n,AH1 m){AH1 capped=min(x,m);return max(n,capped);}
+  AH2 AClampH2(AH2 x,AH2 n,AH2 m){AH2 capped=min(x,m);return max(n,capped);}
+  AH3 AClampH3(AH3 x,AH3 n,AH3 m){AH3 capped=min(x,m);return max(n,capped);}
+  AH4 AClampH4(AH4 x,AH4 n,AH4 m){AH4 capped=min(x,m);return max(n,capped);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // V_FRACT_F16 (note DX frac() is different).
+ // Computed as x minus floor(x), per component.
+  AH1 AFractH1(AH1 x){AH1 whole=floor(x);return x-whole;}
+  AH2 AFractH2(AH2 x){AH2 whole=floor(x);return x-whole;}
+  AH3 AFractH3(AH3 x){AH3 whole=floor(x);return x-whole;}
+  AH4 AFractH4(AH4 x){AH4 whole=floor(x);return x-whole;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Thin wrappers over lerp(x,y,a) for the half types.
+  AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return lerp(x,y,a);}
+  AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return lerp(x,y,a);}
+  AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return lerp(x,y,a);}
+  AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return lerp(x,y,a);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Per-component maximum of three half values.
+  AH1 AMax3H1(AH1 x,AH1 y,AH1 z){AH1 yz=max(y,z);return max(x,yz);}
+  AH2 AMax3H2(AH2 x,AH2 y,AH2 z){AH2 yz=max(y,z);return max(x,yz);}
+  AH3 AMax3H3(AH3 x,AH3 y,AH3 z){AH3 yz=max(y,z);return max(x,yz);}
+  AH4 AMax3H4(AH4 x,AH4 y,AH4 z){AH4 yz=max(y,z);return max(x,yz);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Signed maximum of 16-bit words, per component.
+  // The operands go through the 16-bit signed ASW* type, the same one AAbsSW*/AShrSW* use. Casting through the ASU*
+  // family instead (presumably the 32-bit signed type, going by the AShrSU1/AU1 naming) would widen the unsigned
+  // word without sign extension and turn this into an unsigned compare.
+  AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASW1(a),ASW1(b)));}
+  AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASW2(a),ASW2(b)));}
+  AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASW3(a),ASW3(b)));}
+  AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASW4(a),ASW4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Per-component minimum of three half values.
+  AH1 AMin3H1(AH1 x,AH1 y,AH1 z){AH1 yz=min(y,z);return min(x,yz);}
+  AH2 AMin3H2(AH2 x,AH2 y,AH2 z){AH2 yz=min(y,z);return min(x,yz);}
+  AH3 AMin3H3(AH3 x,AH3 y,AH3 z){AH3 yz=min(y,z);return min(x,yz);}
+  AH4 AMin3H4(AH4 x,AH4 y,AH4 z){AH4 yz=min(y,z);return min(x,yz);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Signed minimum of 16-bit words, per component.
+  // The operands go through the 16-bit signed ASW* type, the same one AAbsSW*/AShrSW* use. Casting through the ASU*
+  // family instead (presumably the 32-bit signed type, going by the AShrSU1/AU1 naming) would widen the unsigned
+  // word without sign extension and turn this into an unsigned compare.
+  AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASW1(a),ASW1(b)));}
+  AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASW2(a),ASW2(b)));}
+  AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASW3(a),ASW3(b)));}
+  AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASW4(a),ASW4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Thin wrappers over rcp() for the half types.
+  AH1 ARcpH1(AH1 x){return rcp(x);}
+  AH2 ARcpH2(AH2 x){return rcp(x);}
+  AH3 ARcpH3(AH3 x){return rcp(x);}
+  AH4 ARcpH4(AH4 x){return rcp(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Thin wrappers over rsqrt() for the half types.
+  AH1 ARsqH1(AH1 x){return rsqrt(x);}
+  AH2 ARsqH2(AH2 x){return rsqrt(x);}
+  AH3 ARsqH3(AH3 x){return rsqrt(x);}
+  AH4 ARsqH4(AH4 x){return rsqrt(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Thin wrappers over saturate() for the half types.
+  AH1 ASatH1(AH1 x){return saturate(x);}
+  AH2 ASatH2(AH2 x){return saturate(x);}
+  AH3 ASatH3(AH3 x){return saturate(x);}
+  AH4 ASatH4(AH4 x){return saturate(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Right shift performed on the signed ASW* view of both operands; the result is cast back to AW*.
+  AW1 AShrSW1(AW1 a,AW1 b){ASW1 v=ASW1(a);ASW1 n=ASW1(b);return AW1(v>>n);}
+  AW2 AShrSW2(AW2 a,AW2 b){ASW2 v=ASW2(a);ASW2 n=ASW2(b);return AW2(v>>n);}
+  AW3 AShrSW3(AW3 a,AW3 b){ASW3 v=ASW3(a);ASW3 n=ASW3(b);return AW3(v>>n);}
+  AW4 AShrSW4(AW4 a,AW4 b){ASW4 v=ASW4(a);ASW4 n=ASW4(b);return AW4(v>>n);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                         HLSL DOUBLE
+//==============================================================================================================================
+ #ifdef A_DUBL
+  // Double-precision type aliases: the float64_t* names under A_HLSL_6_2, the double* names otherwise.
+  #ifdef A_HLSL_6_2
+   #define AD1 float64_t
+   #define AD2 float64_t2
+   #define AD3 float64_t3
+   #define AD4 float64_t4
+  #else
+   #define AD1 double
+   #define AD2 double2
+   #define AD3 double3
+   #define AD4 double4
+  #endif
+//------------------------------------------------------------------------------------------------------------------------------
+  // Splat helpers: replicate one AD1 scalar into every component of the returned double vector.
+  AD1 AD1_x(AD1 a){return AD1(a);}
+  AD2 AD2_x(AD1 a){return AD2(a,a);}
+  AD3 AD3_x(AD1 a){return AD3(a,a,a);}
+  AD4 AD4_x(AD1 a){return AD4(a,a,a,a);}
+  // Macro front ends: cast the argument to AD1 before calling the helper.
+  #define AD1_(a) AD1_x(AD1(a))
+  #define AD2_(a) AD2_x(AD1(a))
+  #define AD3_(a) AD3_x(AD1(a))
+  #define AD4_(a) AD4_x(AD1(a))
+//==============================================================================================================================
+  // Fractional part computed as a minus floor(a), per component.
+  AD1 AFractD1(AD1 a){AD1 whole=floor(a);return a-whole;}
+  AD2 AFractD2(AD2 a){AD2 whole=floor(a);return a-whole;}
+  AD3 AFractD3(AD3 a){AD3 whole=floor(a);return a-whole;}
+  AD4 AFractD4(AD4 a){AD4 whole=floor(a);return a-whole;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Thin wrappers over lerp(x,y,a) for the double types.
+  AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return lerp(x,y,a);}
+  AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return lerp(x,y,a);}
+  AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return lerp(x,y,a);}
+  AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return lerp(x,y,a);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Thin wrappers over rcp() for the double types.
+  AD1 ARcpD1(AD1 x){return rcp(x);}
+  AD2 ARcpD2(AD2 x){return rcp(x);}
+  AD3 ARcpD3(AD3 x){return rcp(x);}
+  AD4 ARcpD4(AD4 x){return rcp(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Thin wrappers over rsqrt() for the double types.
+  AD1 ARsqD1(AD1 x){return rsqrt(x);}
+  AD2 ARsqD2(AD2 x){return rsqrt(x);}
+  AD3 ARsqD3(AD3 x){return rsqrt(x);}
+  AD4 ARsqD4(AD4 x){return rsqrt(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Thin wrappers over saturate() for the double types.
+  AD1 ASatD1(AD1 x){return saturate(x);}
+  AD2 ASatD2(AD2 x){return saturate(x);}
+  AD3 ASatD3(AD3 x){return saturate(x);}
+  AD4 ASatD4(AD4 x){return saturate(x);}
+ #endif
+//==============================================================================================================================
+//                                                         HLSL WAVE
+//==============================================================================================================================
+ #ifdef A_WAVE
+  // Where 'x' must be a compile time literal.
+  // Each helper returns the value of 'v' read from lane (WaveGetLaneIndex()^x).
+  AF1 AWaveXorF1(AF1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AF2 AWaveXorF2(AF2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AF3 AWaveXorF3(AF3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AF4 AWaveXorF4(AF4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AU1 AWaveXorU1(AU1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  // Vector forms with the component count in the name, matching the AWaveXorF* family.
+  AU2 AWaveXorU2(AU2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AU3 AWaveXorU3(AU3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AU4 AWaveXorU4(AU4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  // Legacy overloads: the vector forms were originally all named AWaveXorU1; kept so existing callers still compile.
+  AU2 AWaveXorU1(AU2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AU3 AWaveXorU1(AU3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AU4 AWaveXorU1(AU4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  #ifdef A_HALF
+   // 16-bit variants: pack into 32-bit words, read the packed value from lane (WaveGetLaneIndex()^x), then unpack.
+   AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(WaveReadLaneAt(AU1_AH2(v),WaveGetLaneIndex()^x));}
+   AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(WaveReadLaneAt(AU2_AH4(v),WaveGetLaneIndex()^x));}
+   AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(WaveReadLaneAt(AU1_AW2(v),WaveGetLaneIndex()^x));}
+   // Four 16-bit words occupy two 32-bit words, so this uses the AU2_AW4/AW4_AU2 pair like AWaveXorH4 does
+   // (the pack helpers defined with AU1_AW2/AU2_AW4 include no AU1_AW4 or AW4_AU1).
+   AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU2(WaveReadLaneAt(AU2_AW4(v),WaveGetLaneIndex()^x));}
+  #endif
+ #endif
+//==============================================================================================================================
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+//                                                          GPU COMMON
+//
+//
+//==============================================================================================================================
+#ifdef A_GPU
+ // Negative and positive infinity.
+ // Produced by reinterpreting the 32-bit patterns 0x7f800000 (+INF) and 0xff800000 (-INF) as floats via AF1_AU1().
+ #define A_INFP_F AF1_AU1(0x7f800000u)
+ #define A_INFN_F AF1_AU1(0xff800000u)
+//------------------------------------------------------------------------------------------------------------------------------
+ // Copy sign from 's' to positive 'd'.
+ // The sign bit of 's' is OR-ed into the bits of 'd', which is why 'd' is expected to be positive.
+ AF1 ACpySgnF1(AF1 d,AF1 s){AU1 sgn=AU1_AF1(s)&AU1_(0x80000000u);return AF1_AU1(AU1_AF1(d)|sgn);}
+ AF2 ACpySgnF2(AF2 d,AF2 s){AU2 sgn=AU2_AF2(s)&AU2_(0x80000000u);return AF2_AU2(AU2_AF2(d)|sgn);}
+ AF3 ACpySgnF3(AF3 d,AF3 s){AU3 sgn=AU3_AF3(s)&AU3_(0x80000000u);return AF3_AU3(AU3_AF3(d)|sgn);}
+ AF4 ACpySgnF4(AF4 d,AF4 s){AU4 sgn=AU4_AF4(s)&AU4_(0x80000000u);return AF4_AU4(AU4_AF4(d)|sgn);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Branch-free "is negative" mask (useful as the selector of a lerp), returned as 0 or 1:
+ //  m<0   := 1
+ //  m>=0  := 0
+ //  m=NaN := 0
+ // Works by multiplying with -INF and saturating the product:
+ //  saturate(+a*(-INF)==-INF) := 0
+ //  saturate( 0*(-INF)== NaN) := 0
+ //  saturate(-a*(-INF)==+INF) := 1
+ AF1 ASignedF1(AF1 m){AF1 negInf=AF1_(A_INFN_F);return ASatF1(m*negInf);}
+ AF2 ASignedF2(AF2 m){AF2 negInf=AF2_(A_INFN_F);return ASatF2(m*negInf);}
+ AF3 ASignedF3(AF3 m){AF3 negInf=AF3_(A_INFN_F);return ASatF3(m*negInf);}
+ AF4 ASignedF4(AF4 m){AF4 negInf=AF4_(A_INFN_F);return ASatF4(m*negInf);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Same multiply-and-saturate construction as ASignedF*, but with +INF as the factor.
+ AF1 AGtZeroF1(AF1 m){AF1 posInf=AF1_(A_INFP_F);return ASatF1(m*posInf);}
+ AF2 AGtZeroF2(AF2 m){AF2 posInf=AF2_(A_INFP_F);return ASatF2(m*posInf);}
+ AF3 AGtZeroF3(AF3 m){AF3 posInf=AF3_(A_INFP_F);return ASatF3(m*posInf);}
+ AF4 AGtZeroF4(AF4 m){AF4 posInf=AF4_(A_INFP_F);return ASatF4(m*posInf);}
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Half-precision +INF/-INF, built from the 16-bit patterns 0x7c00 and 0xfc00 via AH1_AW1().
+  #ifdef A_HLSL_6_2
+   #define A_INFP_H AH1_AW1((uint16_t)0x7c00u)
+   #define A_INFN_H AH1_AW1((uint16_t)0xfc00u)
+  #else
+   #define A_INFP_H AH1_AW1(0x7c00u)
+   #define A_INFN_H AH1_AW1(0xfc00u)
+  #endif
+
+//------------------------------------------------------------------------------------------------------------------------------
+  // Half version of ACpySgnF*: OR the 0x8000 sign bit of 's' into the bits of 'd'.
+  AH1 ACpySgnH1(AH1 d,AH1 s){AW1 sgn=AW1_AH1(s)&AW1_(0x8000u);return AH1_AW1(AW1_AH1(d)|sgn);}
+  AH2 ACpySgnH2(AH2 d,AH2 s){AW2 sgn=AW2_AH2(s)&AW2_(0x8000u);return AH2_AW2(AW2_AH2(d)|sgn);}
+  AH3 ACpySgnH3(AH3 d,AH3 s){AW3 sgn=AW3_AH3(s)&AW3_(0x8000u);return AH3_AW3(AW3_AH3(d)|sgn);}
+  AH4 ACpySgnH4(AH4 d,AH4 s){AW4 sgn=AW4_AH4(s)&AW4_(0x8000u);return AH4_AW4(AW4_AH4(d)|sgn);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Half version of ASignedF*: saturate(m * -INF).
+  AH1 ASignedH1(AH1 m){AH1 negInf=AH1_(A_INFN_H);return ASatH1(m*negInf);}
+  AH2 ASignedH2(AH2 m){AH2 negInf=AH2_(A_INFN_H);return ASatH2(m*negInf);}
+  AH3 ASignedH3(AH3 m){AH3 negInf=AH3_(A_INFN_H);return ASatH3(m*negInf);}
+  AH4 ASignedH4(AH4 m){AH4 negInf=AH4_(A_INFN_H);return ASatH4(m*negInf);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Half version of AGtZeroF*: saturate(m * +INF).
+  AH1 AGtZeroH1(AH1 m){AH1 posInf=AH1_(A_INFP_H);return ASatH1(m*posInf);}
+  AH2 AGtZeroH2(AH2 m){AH2 posInf=AH2_(A_INFP_H);return ASatH2(m*posInf);}
+  AH3 AGtZeroH3(AH3 m){AH3 posInf=AH3_(A_INFP_H);return ASatH3(m*posInf);}
+  AH4 AGtZeroH4(AH4 m){AH4 posInf=AH4_(A_INFP_H);return ASatH4(m*posInf);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                [FIS] FLOAT INTEGER SORTABLE
+//------------------------------------------------------------------------------------------------------------------------------
+// Float to integer sortable.
+//  - If sign bit=0, flip the sign bit (positives).
+//  - If sign bit=1, flip all bits     (negatives).
+// Integer sortable to float.
+//  - If sign bit=1, flip the sign bit (positives).
+//  - If sign bit=0, flip all bits     (negatives).
+// Has nice side effects.
+//  - Larger integers are more positive values.
+//  - Float zero is mapped to center of integers (so clear to integer zero is a nice default for atomic max usage).
+// Burns 3 ops for conversion {shift,or,xor}.
+//==============================================================================================================================
+ // To sortable: the XOR mask is AShrSU1(x,31) with bit 31 forced on.
+ AU1 AFisToU1(AU1 x){AU1 mask=AShrSU1(x,AU1_(31))|AU1_(0x80000000);return x^mask;}
+ // From sortable: same construction, but the shifted value is inverted before bit 31 is forced on.
+ AU1 AFisFromU1(AU1 x){AU1 mask=(~AShrSU1(x,AU1_(31)))|AU1_(0x80000000);return x^mask;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Just adjust high 16-bit value (useful when upper part of 32-bit word is a 16-bit float value).
+ // NOTE(review): assuming AShrSU1 is a plain right shift of the signed view, a shift by 15 also places bits 30..15
+ // of 'x' in the low 16 bits of the mask, so the low half of the result changes too - confirm callers ignore it.
+ AU1 AFisToHiU1(AU1 x){AU1 mask=AShrSU1(x,AU1_(15))|AU1_(0x80000000);return x^mask;}
+ AU1 AFisFromHiU1(AU1 x){AU1 mask=(~AShrSU1(x,AU1_(15)))|AU1_(0x80000000);return x^mask;}
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_HALF
+  // 16-bit counterparts of AFisToU1()/AFisFromU1(): shift by 15 and force bit 15 (0x8000) on in the mask.
+  AW1 AFisToW1(AW1 x){AW1 mask=AShrSW1(x,AW1_(15))|AW1_(0x8000);return x^mask;}
+  AW1 AFisFromW1(AW1 x){AW1 mask=(~AShrSW1(x,AW1_(15)))|AW1_(0x8000);return x^mask;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Two-component versions of the same conversions.
+  AW2 AFisToW2(AW2 x){AW2 mask=AShrSW2(x,AW2_(15))|AW2_(0x8000);return x^mask;}
+  AW2 AFisFromW2(AW2 x){AW2 mask=(~AShrSW2(x,AW2_(15)))|AW2_(0x8000);return x^mask;}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                      [PERM] V_PERM_B32
+//------------------------------------------------------------------------------------------------------------------------------
+// Support for V_PERM_B32 started in the 3rd generation of GCN.
+//------------------------------------------------------------------------------------------------------------------------------
+// yyyyxxxx - The 'i' input.
+// 76543210
+// ========
+// HGFEDCBA - Naming on permutation.
+//------------------------------------------------------------------------------------------------------------------------------
+// TODO
+// ====
+//  - Make sure compiler optimizes this.
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Byte letters: A..D are bytes 0..3 of i.x and E..H are bytes 0..3 of i.y.
+  // The function name spells the result from its highest byte down to its lowest; '0' means a zero byte.
+  // Two-byte gathers: one byte of i.x goes to result byte 0, the same-numbered byte of i.y to result byte 2.
+  AU1 APerm0E0A(AU2 i){return((i.x    )&0xffu)|((i.y<<16)&0xff0000u);}
+  AU1 APerm0F0B(AU2 i){return((i.x>> 8)&0xffu)|((i.y<< 8)&0xff0000u);}
+  AU1 APerm0G0C(AU2 i){return((i.x>>16)&0xffu)|((i.y    )&0xff0000u);}
+  AU1 APerm0H0D(AU2 i){return((i.x>>24)&0xffu)|((i.y>> 8)&0xff0000u);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Single-byte inserts: byte A or C of i.x replaces one byte position, the other three bytes come from i.y.
+  AU1 APermHGFA(AU2 i){return((i.x    )&0x000000ffu)|(i.y&0xffffff00u);}
+  AU1 APermHGFC(AU2 i){return((i.x>>16)&0x000000ffu)|(i.y&0xffffff00u);}
+  AU1 APermHGAE(AU2 i){return((i.x<< 8)&0x0000ff00u)|(i.y&0xffff00ffu);}
+  AU1 APermHGCE(AU2 i){return((i.x>> 8)&0x0000ff00u)|(i.y&0xffff00ffu);}
+  AU1 APermHAFE(AU2 i){return((i.x<<16)&0x00ff0000u)|(i.y&0xff00ffffu);}
+  AU1 APermHCFE(AU2 i){return((i.x    )&0x00ff0000u)|(i.y&0xff00ffffu);}
+  AU1 APermAGFE(AU2 i){return((i.x<<24)&0xff000000u)|(i.y&0x00ffffffu);}
+  AU1 APermCGFE(AU2 i){return((i.x<< 8)&0xff000000u)|(i.y&0x00ffffffu);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Four-byte interleaves of bytes A,C (from i.x) with bytes E,G (from i.y).
+  AU1 APermGCEA(AU2 i){return((i.x)&0x00ff00ffu)|((i.y<<8)&0xff00ff00u);}
+  AU1 APermGECA(AU2 i){return(((i.x)&0xffu)|((i.x>>8)&0xff00u)|((i.y<<16)&0xff0000u)|((i.y<<8)&0xff000000u));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                               [BUC] BYTE UNSIGNED CONVERSION
+//------------------------------------------------------------------------------------------------------------------------------
+// Designed to use the optimal conversion, enables the scaling to possibly be factored into other computation.
+// Works on a range of {0 to A_BUC_<32,16>}, for <32-bit, and 16-bit> respectively.
+//------------------------------------------------------------------------------------------------------------------------------
+// OPCODE NOTES
+// ============
+// GCN does not do UNORM or SNORM for bytes in opcodes.
+//  - V_CVT_F32_UBYTE{0,1,2,3} - Unsigned byte to float.
+//  - V_CVT_PKACCUM_U8_F32 - Float to unsigned byte (does bit-field insert into 32-bit integer).
+// V_PERM_B32 does byte packing with ability to zero fill bytes as well.
+//  - Can pull out byte values from two sources, and zero fill upper 8-bits of packed hi and lo. 
+//------------------------------------------------------------------------------------------------------------------------------
+// BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U1() - Designed for V_CVT_F32_UBYTE* and V_CVT_PKACCUM_U8_F32 ops.
+// ====   =====
+//    0 : 0
+//    1 : 1
+//     ...
+//  255 : 255
+//      : 256 (just outside the encoding range)
+//------------------------------------------------------------------------------------------------------------------------------
+// BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32.
+// ====   =====
+//    0 : 0
+//    1 : 1/512
+//    2 : 1/256
+//     ...
+//   64 : 1/8
+//  128 : 1/4
+//  255 : 255/512
+//      : 1/2 (just outside the encoding range)
+//------------------------------------------------------------------------------------------------------------------------------
+// OPTIMAL IMPLEMENTATIONS ON AMD ARCHITECTURES
+// ============================================
+// r=ABuc0FromU1(i)
+//   V_CVT_F32_UBYTE0 r,i
+// --------------------------------------------
+// r=ABuc0ToU1(d,i)
+//   V_CVT_PKACCUM_U8_F32 r,i,0,d
+// --------------------------------------------
+// d=ABuc0FromU2(i)
+//   Where 'k0' is an SGPR with 0x0E0A
+//   Where 'k1' is an SGPR with {32768.0} packed into the lower 16-bits
+//   V_PERM_B32 d,i.x,i.y,k0
+//   V_PK_FMA_F16 d,d,k1.x,0
+// --------------------------------------------
+// r=ABuc0ToU2(d,i)
+//   Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits
+//   Where 'k1' is an SGPR with 0x????
+//   Where 'k2' is an SGPR with 0x????
+//   V_PK_FMA_F16 i,i,k0.x,0
+//   V_PERM_B32 r.x,i,i,k1
+//   V_PERM_B32 r.y,i,i,k2
+//==============================================================================================================================
+ // Peak range for 32-bit and 16-bit operations.
+ // These are the float values that byte 255 maps to in the two BYTE : FLOAT tables above (255 and 255/512).
+ #define A_BUC_32 (255.0)
+ #define A_BUC_16 (255.0/512.0)
+//==============================================================================================================================
+ #if 1
+  // Designed to be one V_CVT_PKACCUM_U8_F32.
+  // The extra min is required to pattern match to V_CVT_PKACCUM_U8_F32.
+  // ABucNToU1: convert 'i' to an unsigned integer, limit it to 255 and write it into byte N of 'd';
+  // the other three bytes of 'd' are passed through unchanged.
+  AU1 ABuc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i),255u)    )&(0x000000ffu));}
+  AU1 ABuc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i),255u)<< 8)&(0x0000ff00u));}
+  AU1 ABuc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i),255u)<<16)&(0x00ff0000u));}
+  AU1 ABuc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i),255u)<<24)&(0xff000000u));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Designed to be one V_CVT_F32_UBYTE*.
+  // ABucNFromU1: extract byte N of 'i' and convert it to float.
+  AF1 ABuc0FromU1(AU1 i){return AF1((i    )&255u);}
+  AF1 ABuc1FromU1(AU1 i){return AF1((i>> 8)&255u);}
+  AF1 ABuc2FromU1(AU1 i){return AF1((i>>16)&255u);}
+  AF1 ABuc3FromU1(AU1 i){return AF1((i>>24)&255u);}
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}.
+  // Both inputs are scaled by 1/32768 so each value's byte sits in the low 8 bits of its 16-bit pattern.
+  AW2 ABuc01ToW2(AH2 x,AH2 y){x*=AH2_(1.0/32768.0);y*=AH2_(1.0/32768.0);
+   return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Designed for 3 ops to do SOA to AOS and conversion.
+  // ABucNToU2: write the byte for i.x into byte N of d.x and the byte for i.y into byte N of d.y.
+  // 'b' carries those two bytes as its bytes 0 and 2, i.e. APerm letters A and C when 'b' is the first source.
+  // It therefore has to be passed first, with 'd' second supplying the three preserved bytes (E..H).
+  // With the operands the other way round the result keeps one byte of 'd' and takes the other three from 'b',
+  // and ABucNFromU2() would not read back what was written.
+  AU2 ABuc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   return AU2(APermHGFA(AU2(b,d.x)),APermHGFC(AU2(b,d.y)));}
+  AU2 ABuc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   return AU2(APermHGAE(AU2(b,d.x)),APermHGCE(AU2(b,d.y)));}
+  AU2 ABuc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   return AU2(APermHAFE(AU2(b,d.x)),APermHCFE(AU2(b,d.y)));}
+  AU2 ABuc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   return AU2(APermAGFE(AU2(b,d.x)),APermCGFE(AU2(b,d.y)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Designed for 2 ops to do both AOS to SOA, and conversion.
+  // ABucNFromU2: gather byte N of i.x and byte N of i.y into a 16-bit pair, reinterpret as halves, scale by 32768.
+  AH2 ABuc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0);}
+  AH2 ABuc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0);}
+  AH2 ABuc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0);}
+  AH2 ABuc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                 [BSC] BYTE SIGNED CONVERSION
+//------------------------------------------------------------------------------------------------------------------------------
+// Similar to [BUC].
+// Works on a range of {-/+ A_BSC_<32,16>}, for <32-bit, and 16-bit> respectively.
+//------------------------------------------------------------------------------------------------------------------------------
+// ENCODING (without zero-based encoding)
+// ========
+//   0 = unused (can be used to mean something else)
+//   1 = lowest value 
+// 128 = exact zero center (stored as 0 when the zero-based encoding is used)
+// 255 = highest value
+//------------------------------------------------------------------------------------------------------------------------------
+// Zero-based [Zb] flips the MSB bit of the byte (making 128 "exact zero" actually zero).
+// This is useful if there is a desire for cleared values to decode as zero.
+//------------------------------------------------------------------------------------------------------------------------------
+// BYTE : FLOAT - ABsc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32.
+// ====   =====
+//    0 : -128/512 (unused)
+//    1 : -127/512
+//    2 : -126/512
+//     ...
+//  128 : 0 
+//     ... 
+//  255 : 127/512
+//      : 1/4 (just outside the encoding range)
+//==============================================================================================================================
+ // Peak range for 32-bit and 16-bit operations.
+ // The signed encodings are symmetric around zero, so these are magnitudes: {-/+ 127} and {-/+ 127/512}.
+ #define A_BSC_32 (127.0)
+ #define A_BSC_16 (127.0/512.0)
+//==============================================================================================================================
+ #if 1
+  // ABscNToU1: bias 'i' by +128, convert to unsigned, limit to 255 and write the result into byte N of 'd'.
+  AU1 ABsc0ToU1(AU1 d,AF1 i){AU1 b=min(AU1(i+128.0),255u);return (d&0xffffff00u)|((b    )&0x000000ffu);}
+  AU1 ABsc1ToU1(AU1 d,AF1 i){AU1 b=min(AU1(i+128.0),255u);return (d&0xffff00ffu)|((b<< 8)&0x0000ff00u);}
+  AU1 ABsc2ToU1(AU1 d,AF1 i){AU1 b=min(AU1(i+128.0),255u);return (d&0xff00ffffu)|((b<<16)&0x00ff0000u);}
+  AU1 ABsc3ToU1(AU1 d,AF1 i){AU1 b=min(AU1(i+128.0),255u);return (d&0x00ffffffu)|((b<<24)&0xff000000u);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Zero-based variants: 'i' is truncated before the bias, and the top bit of the written byte is flipped afterwards.
+  AU1 ABsc0ToZbU1(AU1 d,AF1 i){AU1 b=min(AU1(trunc(i)+128.0),255u);return ((d&0xffffff00u)|((b    )&0x000000ffu))^0x00000080u;}
+  AU1 ABsc1ToZbU1(AU1 d,AF1 i){AU1 b=min(AU1(trunc(i)+128.0),255u);return ((d&0xffff00ffu)|((b<< 8)&0x0000ff00u))^0x00008000u;}
+  AU1 ABsc2ToZbU1(AU1 d,AF1 i){AU1 b=min(AU1(trunc(i)+128.0),255u);return ((d&0xff00ffffu)|((b<<16)&0x00ff0000u))^0x00800000u;}
+  AU1 ABsc3ToZbU1(AU1 d,AF1 i){AU1 b=min(AU1(trunc(i)+128.0),255u);return ((d&0x00ffffffu)|((b<<24)&0xff000000u))^0x80000000u;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // ABscNFromU1: extract byte N of 'i', convert to float and remove the 128 bias.
+  AF1 ABsc0FromU1(AU1 i){AU1 b=(i    )&255u;return AF1(b)-128.0;}
+  AF1 ABsc1FromU1(AU1 i){AU1 b=(i>> 8)&255u;return AF1(b)-128.0;}
+  AF1 ABsc2FromU1(AU1 i){AU1 b=(i>>16)&255u;return AF1(b)-128.0;}
+  AF1 ABsc3FromU1(AU1 i){AU1 b=(i>>24)&255u;return AF1(b)-128.0;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Zero-based variants: flip the top bit of the extracted byte before converting.
+  AF1 ABsc0FromZbU1(AU1 i){AU1 b=((i    )&255u)^0x80u;return AF1(b)-128.0;}
+  AF1 ABsc1FromZbU1(AU1 i){AU1 b=((i>> 8)&255u)^0x80u;return AF1(b)-128.0;}
+  AF1 ABsc2FromZbU1(AU1 i){AU1 b=((i>>16)&255u)^0x80u;return AF1(b)-128.0;}
+  AF1 ABsc3FromZbU1(AU1 i){AU1 b=((i>>24)&255u)^0x80u;return AF1(b)-128.0;}
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}.
+  AW2 ABsc01ToW2(AH2 x,AH2 y){x=x*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);y=y*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);
+   return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AU2 ABsc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));}
+  AU2 ABsc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));}
+  AU2 ABsc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));}
+  AU2 ABsc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AU2 ABsc0ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
+   return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));}
+  AU2 ABsc1ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
+   return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));}
+  AU2 ABsc2ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
+   return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));}
+  AU2 ABsc3ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
+   return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH2 ABsc0FromU2(AU2 i){AH2 h=AH2_AW2(AW2_AU1(APerm0E0A(i)));return h*AH2_(32768.0)-AH2_(0.25);} // Reinterpret the selected bytes as a half pair, then multiply by 32768 and subtract 0.25.
+  AH2 ABsc1FromU2(AU2 i){AH2 h=AH2_AW2(AW2_AU1(APerm0F0B(i)));return h*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc2FromU2(AU2 i){AH2 h=AH2_AW2(AW2_AU1(APerm0G0C(i)));return h*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc3FromU2(AU2 i){AH2 h=AH2_AW2(AW2_AU1(APerm0H0D(i)));return h*AH2_(32768.0)-AH2_(0.25);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH2 ABsc0FromZbU2(AU2 i){AH2 h=AH2_AW2(AW2_AU1(APerm0E0A(i)^0x00800080u));return h*AH2_(32768.0)-AH2_(0.25);} // Flip bit 7 of each 16-bit lane first, then decode exactly like ABsc0FromU2.
+  AH2 ABsc1FromZbU2(AU2 i){AH2 h=AH2_AW2(AW2_AU1(APerm0F0B(i)^0x00800080u));return h*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc2FromZbU2(AU2 i){AH2 h=AH2_AW2(AW2_AU1(APerm0G0C(i)^0x00800080u));return h*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc3FromZbU2(AU2 i){AH2 h=AH2_AW2(AW2_AU1(APerm0H0D(i)^0x00800080u));return h*AH2_(32768.0)-AH2_(0.25);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     HALF APPROXIMATIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// These support only positive inputs.
+// Did not see value yet in specialization for range.
+// Using quick testing, ended up mostly getting the same "best" approximation for various ranges.
+// With hardware that can co-execute transcendentals, the value in approximations could be less than expected.
+// However from a latency perspective, if execution of a transcendental is 4 clk, with no packed support, -> 8 clk total.
+// And co-execution would require a compiler interleaving a lot of independent work for packed usage.
+//------------------------------------------------------------------------------------------------------------------------------
+// The one Newton Raphson iteration form of rsq() was skipped (requires 6 ops total).
+// Same with sqrt(), as this could be x*rsq() (7 ops).
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Minimize squared error across full positive range, 2 ops.
+  // The 0x1de2 based approximation maps {0 to 1} input to < 1 output.
+  AH1 APrxLoSqrtH1(AH1 a){AW1 bits=AW1_AH1(a);bits=(bits>>AW1_(1))+AW1_(0x1de2);return AH1_AW1(bits);} // Shift the raw half bits right by one, add the tuned constant, reinterpret as half.
+  AH2 APrxLoSqrtH2(AH2 a){AW2 bits=AW2_AH2(a);bits=(bits>>AW2_(1))+AW2_(0x1de2);return AH2_AW2(bits);}
+  AH3 APrxLoSqrtH3(AH3 a){AW3 bits=AW3_AH3(a);bits=(bits>>AW3_(1))+AW3_(0x1de2);return AH3_AW3(bits);}
+  AH4 APrxLoSqrtH4(AH4 a){AW4 bits=AW4_AH4(a);bits=(bits>>AW4_(1))+AW4_(0x1de2);return AH4_AW4(bits);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Lower precision estimation, 1 op.
+  // Minimize squared error across {smallest normal to 16384.0}.
+  AH1 APrxLoRcpH1(AH1 a){AW1 bits=AW1_(0x7784)-AW1_AH1(a);return AH1_AW1(bits);} // Tuned constant minus the raw half bits, reinterpreted as half.
+  AH2 APrxLoRcpH2(AH2 a){AW2 bits=AW2_(0x7784)-AW2_AH2(a);return AH2_AW2(bits);}
+  AH3 APrxLoRcpH3(AH3 a){AW3 bits=AW3_(0x7784)-AW3_AH3(a);return AH3_AW3(bits);}
+  AH4 APrxLoRcpH4(AH4 a){AW4 bits=AW4_(0x7784)-AW4_AH4(a);return AH4_AW4(bits);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Medium precision estimation, one Newton Raphson iteration, 3 ops.
+  AH1 APrxMedRcpH1(AH1 a){AH1 r0=AH1_AW1(AW1_(0x778d)-AW1_AH1(a));AH1 nr=-r0*a+AH1_(2.0);return r0*nr;} // Bit-trick seed r0, refined once as r0*(2-r0*a).
+  AH2 APrxMedRcpH2(AH2 a){AH2 r0=AH2_AW2(AW2_(0x778d)-AW2_AH2(a));AH2 nr=-r0*a+AH2_(2.0);return r0*nr;}
+  AH3 APrxMedRcpH3(AH3 a){AH3 r0=AH3_AW3(AW3_(0x778d)-AW3_AH3(a));AH3 nr=-r0*a+AH3_(2.0);return r0*nr;}
+  AH4 APrxMedRcpH4(AH4 a){AH4 r0=AH4_AW4(AW4_(0x778d)-AW4_AH4(a));AH4 nr=-r0*a+AH4_(2.0);return r0*nr;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Minimize squared error across {smallest normal to 16384.0}, 2 ops.
+  AH1 APrxLoRsqH1(AH1 a){AW1 sh=AW1_AH1(a)>>AW1_(1);return AH1_AW1(AW1_(0x59a3)-sh);} // Tuned constant minus the right-shifted raw half bits.
+  AH2 APrxLoRsqH2(AH2 a){AW2 sh=AW2_AH2(a)>>AW2_(1);return AH2_AW2(AW2_(0x59a3)-sh);}
+  AH3 APrxLoRsqH3(AH3 a){AW3 sh=AW3_AH3(a)>>AW3_(1);return AH3_AW3(AW3_(0x59a3)-sh);}
+  AH4 APrxLoRsqH4(AH4 a){AW4 sh=AW4_AH4(a)>>AW4_(1);return AH4_AW4(AW4_(0x59a3)-sh);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                    FLOAT APPROXIMATIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// Michal Drobot has an excellent presentation on these: "Low Level Optimizations For GCN",
+//  - Idea dates back to SGI, then to Quake 3, etc.
+//  - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf
+//     - sqrt(x)=rsqrt(x)*x
+//     - rcp(x)=rsqrt(x)*rsqrt(x) for positive x
+//  - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h
+//------------------------------------------------------------------------------------------------------------------------------
+// These below are from perhaps less complete searching for optimal.
+// Used FP16 normal range for testing with +4096 32-bit step size for sampling error.
+// So these match up well with the half approximations.
+//==============================================================================================================================
+ AF1 APrxLoSqrtF1(AF1 a){AU1 bits=(AU1_AF1(a)>>AU1_(1))+AU1_(0x1fbc4639);return AF1_AU1(bits);} // Approximate sqrt: halve the raw float bits and add a tuned constant.
+ AF1 APrxLoRcpF1(AF1 a){AU1 bits=AU1_(0x7ef07ebb)-AU1_AF1(a);return AF1_AU1(bits);} // Approximate 1/a: tuned constant minus the raw float bits.
+ AF1 APrxMedRcpF1(AF1 a){AF1 r0=AF1_AU1(AU1_(0x7ef19fff)-AU1_AF1(a));AF1 nr=-r0*a+AF1_(2.0);return r0*nr;} // Bit-trick seed refined once as r0*(2-r0*a).
+ AF1 APrxLoRsqF1(AF1 a){AU1 sh=AU1_AF1(a)>>AU1_(1);return AF1_AU1(AU1_(0x5f347d74)-sh);} // Approximate 1/sqrt(a): tuned constant minus the halved raw bits.
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 APrxLoSqrtF2(AF2 a){AU2 bits=(AU2_AF2(a)>>AU2_(1))+AU2_(0x1fbc4639);return AF2_AU2(bits);} // Approximate sqrt: halve the raw float bits and add a tuned constant.
+ AF2 APrxLoRcpF2(AF2 a){AU2 bits=AU2_(0x7ef07ebb)-AU2_AF2(a);return AF2_AU2(bits);} // Approximate 1/a: tuned constant minus the raw float bits.
+ AF2 APrxMedRcpF2(AF2 a){AF2 r0=AF2_AU2(AU2_(0x7ef19fff)-AU2_AF2(a));AF2 nr=-r0*a+AF2_(2.0);return r0*nr;} // Bit-trick seed refined once as r0*(2-r0*a).
+ AF2 APrxLoRsqF2(AF2 a){AU2 sh=AU2_AF2(a)>>AU2_(1);return AF2_AU2(AU2_(0x5f347d74)-sh);} // Approximate 1/sqrt(a): tuned constant minus the halved raw bits.
+//------------------------------------------------------------------------------------------------------------------------------
+ AF3 APrxLoSqrtF3(AF3 a){AU3 bits=(AU3_AF3(a)>>AU3_(1))+AU3_(0x1fbc4639);return AF3_AU3(bits);} // Approximate sqrt: halve the raw float bits and add a tuned constant.
+ AF3 APrxLoRcpF3(AF3 a){AU3 bits=AU3_(0x7ef07ebb)-AU3_AF3(a);return AF3_AU3(bits);} // Approximate 1/a: tuned constant minus the raw float bits.
+ AF3 APrxMedRcpF3(AF3 a){AF3 r0=AF3_AU3(AU3_(0x7ef19fff)-AU3_AF3(a));AF3 nr=-r0*a+AF3_(2.0);return r0*nr;} // Bit-trick seed refined once as r0*(2-r0*a).
+ AF3 APrxLoRsqF3(AF3 a){AU3 sh=AU3_AF3(a)>>AU3_(1);return AF3_AU3(AU3_(0x5f347d74)-sh);} // Approximate 1/sqrt(a): tuned constant minus the halved raw bits.
+//------------------------------------------------------------------------------------------------------------------------------
+ AF4 APrxLoSqrtF4(AF4 a){AU4 bits=(AU4_AF4(a)>>AU4_(1))+AU4_(0x1fbc4639);return AF4_AU4(bits);} // Approximate sqrt: halve the raw float bits and add a tuned constant.
+ AF4 APrxLoRcpF4(AF4 a){AU4 bits=AU4_(0x7ef07ebb)-AU4_AF4(a);return AF4_AU4(bits);} // Approximate 1/a: tuned constant minus the raw float bits.
+ AF4 APrxMedRcpF4(AF4 a){AF4 r0=AF4_AU4(AU4_(0x7ef19fff)-AU4_AF4(a));AF4 nr=-r0*a+AF4_(2.0);return r0*nr;} // Bit-trick seed refined once as r0*(2-r0*a).
+ AF4 APrxLoRsqF4(AF4 a){AU4 sh=AU4_AF4(a)>>AU4_(1);return AF4_AU4(AU4_(0x5f347d74)-sh);} // Approximate 1/sqrt(a): tuned constant minus the halved raw bits.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                    PQ APPROXIMATIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// PQ is very close to x^(1/8). The functions below use the fast float approximation method to do
+// PQ<~>Gamma2 (4th power and fast 4th root) and PQ<~>Linear (8th power and fast 8th root). Maximum error is ~0.2%.
+//==============================================================================================================================
+// Helpers
+ AF1 Quart(AF1 a) { AF1 p2 = a * a; return p2 * p2; } // a^4 by squaring twice.
+ AF1 Oct(AF1 a) { AF1 p2 = a * a; AF1 p4 = p2 * p2; return p4 * p4; } // a^8 by squaring three times.
+ AF2 Quart(AF2 a) { AF2 p2 = a * a; return p2 * p2; }
+ AF2 Oct(AF2 a) { AF2 p2 = a * a; AF2 p4 = p2 * p2; return p4 * p4; }
+ AF3 Quart(AF3 a) { AF3 p2 = a * a; return p2 * p2; }
+ AF3 Oct(AF3 a) { AF3 p2 = a * a; AF3 p4 = p2 * p2; return p4 * p4; }
+ AF4 Quart(AF4 a) { AF4 p2 = a * a; return p2 * p2; }
+ AF4 Oct(AF4 a) { AF4 p2 = a * a; AF4 p4 = p2 * p2; return p4 * p4; }
+ //------------------------------------------------------------------------------------------------------------------------------
+ AF1 APrxPQToGamma2(AF1 a) { AF1 g = Quart(a); return g; } // 4th power.
+ AF1 APrxPQToLinear(AF1 a) { AF1 l = Oct(a); return l; } // 8th power.
+ AF1 APrxLoGamma2ToPQ(AF1 a) { AU1 bits = (AU1_AF1(a) >> AU1_(2)) + AU1_(0x2F9A4E46); return AF1_AU1(bits); } // Fast 4th root from the raw float bits.
+ AF1 APrxMedGamma2ToPQ(AF1 a) { AF1 r0 = AF1_AU1((AU1_AF1(a) >> AU1_(2)) + AU1_(0x2F9A4E46)); AF1 r4 = Quart(r0); return r0 - r0 * (r4 - a) / (AF1_(4.0) * r4); } // Fast 4th root plus one correction step.
+ AF1 APrxHighGamma2ToPQ(AF1 a) { AF1 root2 = sqrt(a); return sqrt(root2); } // 4th root via two sqrt calls.
+ AF1 APrxLoLinearToPQ(AF1 a) { AU1 bits = (AU1_AF1(a) >> AU1_(3)) + AU1_(0x378D8723); return AF1_AU1(bits); } // Fast 8th root from the raw float bits.
+ AF1 APrxMedLinearToPQ(AF1 a) { AF1 r0 = AF1_AU1((AU1_AF1(a) >> AU1_(3)) + AU1_(0x378D8723)); AF1 r8 = Oct(r0); return r0 - r0 * (r8 - a) / (AF1_(8.0) * r8); } // Fast 8th root plus one correction step.
+ AF1 APrxHighLinearToPQ(AF1 a) { AF1 root2 = sqrt(a); AF1 root4 = sqrt(root2); return sqrt(root4); } // 8th root via three sqrt calls.
+ //------------------------------------------------------------------------------------------------------------------------------
+ AF2 APrxPQToGamma2(AF2 a) { AF2 g = Quart(a); return g; } // 4th power.
+ AF2 APrxPQToLinear(AF2 a) { AF2 l = Oct(a); return l; } // 8th power.
+ AF2 APrxLoGamma2ToPQ(AF2 a) { AU2 bits = (AU2_AF2(a) >> AU2_(2)) + AU2_(0x2F9A4E46); return AF2_AU2(bits); } // Fast 4th root from the raw float bits.
+ AF2 APrxMedGamma2ToPQ(AF2 a) { AF2 r0 = AF2_AU2((AU2_AF2(a) >> AU2_(2)) + AU2_(0x2F9A4E46)); AF2 r4 = Quart(r0); return r0 - r0 * (r4 - a) / (AF1_(4.0) * r4); } // Fast 4th root plus one correction step.
+ AF2 APrxHighGamma2ToPQ(AF2 a) { AF2 root2 = sqrt(a); return sqrt(root2); } // 4th root via two sqrt calls.
+ AF2 APrxLoLinearToPQ(AF2 a) { AU2 bits = (AU2_AF2(a) >> AU2_(3)) + AU2_(0x378D8723); return AF2_AU2(bits); } // Fast 8th root from the raw float bits.
+ AF2 APrxMedLinearToPQ(AF2 a) { AF2 r0 = AF2_AU2((AU2_AF2(a) >> AU2_(3)) + AU2_(0x378D8723)); AF2 r8 = Oct(r0); return r0 - r0 * (r8 - a) / (AF1_(8.0) * r8); } // Fast 8th root plus one correction step.
+ AF2 APrxHighLinearToPQ(AF2 a) { AF2 root2 = sqrt(a); AF2 root4 = sqrt(root2); return sqrt(root4); } // 8th root via three sqrt calls.
+ //------------------------------------------------------------------------------------------------------------------------------
+ AF3 APrxPQToGamma2(AF3 a) { AF3 g = Quart(a); return g; } // 4th power.
+ AF3 APrxPQToLinear(AF3 a) { AF3 l = Oct(a); return l; } // 8th power.
+ AF3 APrxLoGamma2ToPQ(AF3 a) { AU3 bits = (AU3_AF3(a) >> AU3_(2)) + AU3_(0x2F9A4E46); return AF3_AU3(bits); } // Fast 4th root from the raw float bits.
+ AF3 APrxMedGamma2ToPQ(AF3 a) { AF3 r0 = AF3_AU3((AU3_AF3(a) >> AU3_(2)) + AU3_(0x2F9A4E46)); AF3 r4 = Quart(r0); return r0 - r0 * (r4 - a) / (AF1_(4.0) * r4); } // Fast 4th root plus one correction step.
+ AF3 APrxHighGamma2ToPQ(AF3 a) { AF3 root2 = sqrt(a); return sqrt(root2); } // 4th root via two sqrt calls.
+ AF3 APrxLoLinearToPQ(AF3 a) { AU3 bits = (AU3_AF3(a) >> AU3_(3)) + AU3_(0x378D8723); return AF3_AU3(bits); } // Fast 8th root from the raw float bits.
+ AF3 APrxMedLinearToPQ(AF3 a) { AF3 r0 = AF3_AU3((AU3_AF3(a) >> AU3_(3)) + AU3_(0x378D8723)); AF3 r8 = Oct(r0); return r0 - r0 * (r8 - a) / (AF1_(8.0) * r8); } // Fast 8th root plus one correction step.
+ AF3 APrxHighLinearToPQ(AF3 a) { AF3 root2 = sqrt(a); AF3 root4 = sqrt(root2); return sqrt(root4); } // 8th root via three sqrt calls.
+ //------------------------------------------------------------------------------------------------------------------------------
+ AF4 APrxPQToGamma2(AF4 a) { AF4 g = Quart(a); return g; } // 4th power.
+ AF4 APrxPQToLinear(AF4 a) { AF4 l = Oct(a); return l; } // 8th power.
+ AF4 APrxLoGamma2ToPQ(AF4 a) { AU4 bits = (AU4_AF4(a) >> AU4_(2)) + AU4_(0x2F9A4E46); return AF4_AU4(bits); } // Fast 4th root from the raw float bits.
+ AF4 APrxMedGamma2ToPQ(AF4 a) { AF4 r0 = AF4_AU4((AU4_AF4(a) >> AU4_(2)) + AU4_(0x2F9A4E46)); AF4 r4 = Quart(r0); return r0 - r0 * (r4 - a) / (AF1_(4.0) * r4); } // Fast 4th root plus one correction step.
+ AF4 APrxHighGamma2ToPQ(AF4 a) { AF4 root2 = sqrt(a); return sqrt(root2); } // 4th root via two sqrt calls.
+ AF4 APrxLoLinearToPQ(AF4 a) { AU4 bits = (AU4_AF4(a) >> AU4_(3)) + AU4_(0x378D8723); return AF4_AU4(bits); } // Fast 8th root from the raw float bits.
+ AF4 APrxMedLinearToPQ(AF4 a) { AF4 r0 = AF4_AU4((AU4_AF4(a) >> AU4_(3)) + AU4_(0x378D8723)); AF4 r8 = Oct(r0); return r0 - r0 * (r8 - a) / (AF1_(8.0) * r8); } // Fast 8th root plus one correction step.
+ AF4 APrxHighLinearToPQ(AF4 a) { AF4 root2 = sqrt(a); AF4 root4 = sqrt(root2); return sqrt(root4); } // 8th root via three sqrt calls.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                    PARABOLIC SIN & COS
+//------------------------------------------------------------------------------------------------------------------------------
+// Approximate answers to transcendental questions.
+//------------------------------------------------------------------------------------------------------------------------------
+//==============================================================================================================================
+ #if 1
+  // Valid input range is {-1 to 1} representing {0 to 2 pi}.
+  // Output range is {-1/4 to 1/4} representing {-1 to 1}.
+  AF1 APSinF1(AF1 x){AF1 m=abs(x);return x*m-x;} // Parabola x*|x|-x (MAD).
+  AF2 APSinF2(AF2 x){AF2 m=abs(x);return x*m-x;}
+  AF1 APCosF1(AF1 x){AF1 t=AFractF1(x*AF1_(0.5)+AF1_(0.75));AF1 s=t*AF1_(2.0)-AF1_(1.0);return APSinF1(s);} // Shift and wrap the phase, then reuse the sine parabola (3x MAD, FRACT).
+  AF2 APCosF2(AF2 x){AF2 t=AFractF2(x*AF2_(0.5)+AF2_(0.75));AF2 s=t*AF2_(2.0)-AF2_(1.0);return APSinF2(s);}
+  AF2 APSinCosF1(AF1 x){AF1 t=AFractF1(x*AF1_(0.5)+AF1_(0.75));AF1 s=t*AF1_(2.0)-AF1_(1.0);return APSinF2(AF2(x,s));} // .x is the sine of x, .y the cosine of x.
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_HALF
+  // For a packed {sin,cos} pair,
+  //  - Native takes 16 clocks and 4 issue slots (no packed transcendentals).
+  //  - Parabolic takes 8 clocks and 8 issue slots (only fract is non-packed).
+  AH1 APSinH1(AH1 x){AH1 m=abs(x);return x*m-x;} // Parabola x*|x|-x.
+  AH2 APSinH2(AH2 x){AH2 m=abs(x);return x*m-x;} // AND,FMA
+  AH1 APCosH1(AH1 x){AH1 t=AFractH1(x*AH1_(0.5)+AH1_(0.75));AH1 s=t*AH1_(2.0)-AH1_(1.0);return APSinH1(s);} // Shift and wrap the phase, then reuse the sine parabola.
+  AH2 APCosH2(AH2 x){AH2 t=AFractH2(x*AH2_(0.5)+AH2_(0.75));AH2 s=t*AH2_(2.0)-AH2_(1.0);return APSinH2(s);} // 3x FMA, 2xFRACT, AND
+  AH2 APSinCosH1(AH1 x){AH1 t=AFractH1(x*AH1_(0.5)+AH1_(0.75));AH1 s=t*AH1_(2.0)-AH1_(1.0);return APSinH2(AH2(x,s));} // .x is the sine of x, .y the cosine of x.
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     [ZOL] ZERO ONE LOGIC
+//------------------------------------------------------------------------------------------------------------------------------
+// Conditional free logic designed for easy 16-bit packing, and backwards porting to 32-bit.
+//------------------------------------------------------------------------------------------------------------------------------
+// 0 := false
+// 1 := true
+//------------------------------------------------------------------------------------------------------------------------------
+// AndNot(x,y)   -> !(x&y) .... One op.
+// AndOr(x,y,z)  -> (x&y)|z ... One op.
+// GtZero(x)     -> x>0.0 ..... One op.
+// Sel(x,y,z)    -> x?y:z ..... Two ops, has no precision loss.
+// Signed(x)     -> x<0.0 ..... One op.
+// ZeroPass(x,y) -> x?0:y ..... Two ops, 'y' is a pass through safe for aliasing as integer.
+//------------------------------------------------------------------------------------------------------------------------------
+// OPTIMIZATION NOTES
+// ==================
+// - On Vega to use 2 constants in a packed op, pass in as one AW2 or one AH2 'k.xy' and use as 'k.xx' and 'k.yy'.
+//   For example 'a.xy*k.xx+k.yy'.
+//==============================================================================================================================
+ #if 1
+  AU1 AZolAndU1(AU1 x,AU1 y){AU1 r=min(x,y);return r;} // AND of {0,1} values via min.
+  AU2 AZolAndU2(AU2 x,AU2 y){AU2 r=min(x,y);return r;}
+  AU3 AZolAndU3(AU3 x,AU3 y){AU3 r=min(x,y);return r;}
+  AU4 AZolAndU4(AU4 x,AU4 y){AU4 r=min(x,y);return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AU1 AZolNotU1(AU1 x){AU1 one=AU1_(1);return x^one;} // NOT of a {0,1} value by flipping the low bit.
+  AU2 AZolNotU2(AU2 x){AU2 one=AU2_(1);return x^one;}
+  AU3 AZolNotU3(AU3 x){AU3 one=AU3_(1);return x^one;}
+  AU4 AZolNotU4(AU4 x){AU4 one=AU4_(1);return x^one;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AU1 AZolOrU1(AU1 x,AU1 y){AU1 r=max(x,y);return r;} // OR of {0,1} values via max.
+  AU2 AZolOrU2(AU2 x,AU2 y){AU2 r=max(x,y);return r;}
+  AU3 AZolOrU3(AU3 x,AU3 y){AU3 r=max(x,y);return r;}
+  AU4 AZolOrU4(AU4 x,AU4 y){AU4 r=max(x,y);return r;}
+//==============================================================================================================================
+  AU1 AZolF1ToU1(AF1 x){AU1 u=AU1(x);return u;} // Value conversion float -> uint (not a bit reinterpret).
+  AU2 AZolF2ToU2(AF2 x){AU2 u=AU2(x);return u;}
+  AU3 AZolF3ToU3(AF3 x){AU3 u=AU3(x);return u;}
+  AU4 AZolF4ToU4(AF4 x){AU4 u=AU4(x);return u;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // 2 ops, denormals don't work in 32-bit on PC (and if they are enabled, OMOD is disabled).
+  AU1 AZolNotF1ToU1(AF1 x){AF1 inv=AF1_(1.0)-x;return AU1(inv);} // Logical NOT in float (1-x), then value conversion to uint.
+  AU2 AZolNotF2ToU2(AF2 x){AF2 inv=AF2_(1.0)-x;return AU2(inv);}
+  AU3 AZolNotF3ToU3(AF3 x){AF3 inv=AF3_(1.0)-x;return AU3(inv);}
+  AU4 AZolNotF4ToU4(AF4 x){AF4 inv=AF4_(1.0)-x;return AU4(inv);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AZolU1ToF1(AU1 x){AF1 f=AF1(x);return f;} // Value conversion uint -> float (not a bit reinterpret).
+  AF2 AZolU2ToF2(AU2 x){AF2 f=AF2(x);return f;}
+  AF3 AZolU3ToF3(AU3 x){AF3 f=AF3(x);return f;}
+  AF4 AZolU4ToF4(AU4 x){AF4 f=AF4(x);return f;}
+//==============================================================================================================================
+  AF1 AZolAndF1(AF1 x,AF1 y){AF1 r=min(x,y);return r;} // AND of {0.0,1.0} values via min.
+  AF2 AZolAndF2(AF2 x,AF2 y){AF2 r=min(x,y);return r;}
+  AF3 AZolAndF3(AF3 x,AF3 y){AF3 r=min(x,y);return r;}
+  AF4 AZolAndF4(AF4 x,AF4 y){AF4 r=min(x,y);return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 ASolAndNotF1(AF1 x,AF1 y){AF1 nx=-x;return nx*y+AF1_(1.0);} // 1-x*y, i.e. !(x&y). NOTE(review): prefix is 'ASol' while every sibling uses 'AZol' - possibly a typo, kept for callers.
+  AF2 ASolAndNotF2(AF2 x,AF2 y){AF2 nx=-x;return nx*y+AF2_(1.0);}
+  AF3 ASolAndNotF3(AF3 x,AF3 y){AF3 nx=-x;return nx*y+AF3_(1.0);}
+  AF4 ASolAndNotF4(AF4 x,AF4 y){AF4 nx=-x;return nx*y+AF4_(1.0);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AZolAndOrF1(AF1 x,AF1 y,AF1 z){AF1 t=x*y+z;return ASatF1(t);} // (x&y)|z as a saturated multiply-add.
+  AF2 AZolAndOrF2(AF2 x,AF2 y,AF2 z){AF2 t=x*y+z;return ASatF2(t);}
+  AF3 AZolAndOrF3(AF3 x,AF3 y,AF3 z){AF3 t=x*y+z;return ASatF3(t);}
+  AF4 AZolAndOrF4(AF4 x,AF4 y,AF4 z){AF4 t=x*y+z;return ASatF4(t);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AZolGtZeroF1(AF1 x){AF1 k=AF1_(A_INFP_F);return ASatF1(x*k);} // Scale by A_INFP_F and saturate. NOTE(review): the x==0 result depends on how ASat* treats 0*INF - confirm.
+  AF2 AZolGtZeroF2(AF2 x){AF2 k=AF2_(A_INFP_F);return ASatF2(x*k);}
+  AF3 AZolGtZeroF3(AF3 x){AF3 k=AF3_(A_INFP_F);return ASatF3(x*k);}
+  AF4 AZolGtZeroF4(AF4 x){AF4 k=AF4_(A_INFP_F);return ASatF4(x*k);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AZolNotF1(AF1 x){AF1 one=AF1_(1.0);return one-x;} // NOT of a {0.0,1.0} value.
+  AF2 AZolNotF2(AF2 x){AF2 one=AF2_(1.0);return one-x;}
+  AF3 AZolNotF3(AF3 x){AF3 one=AF3_(1.0);return one-x;}
+  AF4 AZolNotF4(AF4 x){AF4 one=AF4_(1.0);return one-x;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AZolOrF1(AF1 x,AF1 y){AF1 r=max(x,y);return r;} // OR of {0.0,1.0} values via max.
+  AF2 AZolOrF2(AF2 x,AF2 y){AF2 r=max(x,y);return r;}
+  AF3 AZolOrF3(AF3 x,AF3 y){AF3 r=max(x,y);return r;}
+  AF4 AZolOrF4(AF4 x,AF4 y){AF4 r=max(x,y);return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AZolSelF1(AF1 x,AF1 y,AF1 z){AF1 keep=(-x)*z+z;AF1 pick=x*y;return pick+keep;} // x?y:z for x in {0,1}: keep=(1-x)*z, pick=x*y.
+  AF2 AZolSelF2(AF2 x,AF2 y,AF2 z){AF2 keep=(-x)*z+z;AF2 pick=x*y;return pick+keep;}
+  AF3 AZolSelF3(AF3 x,AF3 y,AF3 z){AF3 keep=(-x)*z+z;AF3 pick=x*y;return pick+keep;}
+  AF4 AZolSelF4(AF4 x,AF4 y,AF4 z){AF4 keep=(-x)*z+z;AF4 pick=x*y;return pick+keep;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AZolSignedF1(AF1 x){AF1 k=AF1_(A_INFN_F);return ASatF1(x*k);} // Scale by A_INFN_F and saturate. NOTE(review): the x==0 result depends on how ASat* treats 0*INF - confirm.
+  AF2 AZolSignedF2(AF2 x){AF2 k=AF2_(A_INFN_F);return ASatF2(x*k);}
+  AF3 AZolSignedF3(AF3 x){AF3 k=AF3_(A_INFN_F);return ASatF3(x*k);}
+  AF4 AZolSignedF4(AF4 x){AF4 k=AF4_(A_INFN_F);return ASatF4(x*k);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AZolZeroPassF1(AF1 x,AF1 y){return AF1_AU1((AU1_AF1(x)!=AU1_(0))?AU1_(0):AU1_AF1(y));} // Any non-zero bit pattern in x yields 0; otherwise the bits of y pass through unchanged.
+  AF2 AZolZeroPassF2(AF2 x,AF2 y){return AF2_AU2((AU2_AF2(x)!=AU2_(0))?AU2_(0):AU2_AF2(y));} // NOTE(review): on vectors, whether '!=' and '?:' act per component depends on the shading language (GLSL '!=' yields one bool) - confirm intended.
+  AF3 AZolZeroPassF3(AF3 x,AF3 y){return AF3_AU3((AU3_AF3(x)!=AU3_(0))?AU3_(0):AU3_AF3(y));} // Same caveat as the 2-wide version.
+  AF4 AZolZeroPassF4(AF4 x,AF4 y){return AF4_AU4((AU4_AF4(x)!=AU4_(0))?AU4_(0):AU4_AF4(y));} // Same caveat as the 2-wide version.
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+  AW1 AZolAndW1(AW1 x,AW1 y){AW1 r=min(x,y);return r;} // AND of {0,1} values via min.
+  AW2 AZolAndW2(AW2 x,AW2 y){AW2 r=min(x,y);return r;}
+  AW3 AZolAndW3(AW3 x,AW3 y){AW3 r=min(x,y);return r;}
+  AW4 AZolAndW4(AW4 x,AW4 y){AW4 r=min(x,y);return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AZolNotW1(AW1 x){AW1 one=AW1_(1);return x^one;} // NOT of a {0,1} value by flipping the low bit.
+  AW2 AZolNotW2(AW2 x){AW2 one=AW2_(1);return x^one;}
+  AW3 AZolNotW3(AW3 x){AW3 one=AW3_(1);return x^one;}
+  AW4 AZolNotW4(AW4 x){AW4 one=AW4_(1);return x^one;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AZolOrW1(AW1 x,AW1 y){AW1 r=max(x,y);return r;} // OR of {0,1} values via max.
+  AW2 AZolOrW2(AW2 x,AW2 y){AW2 r=max(x,y);return r;}
+  AW3 AZolOrW3(AW3 x,AW3 y){AW3 r=max(x,y);return r;}
+  AW4 AZolOrW4(AW4 x,AW4 y){AW4 r=max(x,y);return r;}
+//==============================================================================================================================
+  // Uses denormal trick.
+  AW1 AZolH1ToW1(AH1 x){AH1 tiny=AH1_AW1(AW1_(1));return AW1_AH1(x*tiny);} // 'tiny' is the half whose bit pattern is 1; the product's bits are returned as the integer.
+  AW2 AZolH2ToW2(AH2 x){AH2 tiny=AH2_AW2(AW2_(1));return AW2_AH2(x*tiny);}
+  AW3 AZolH3ToW3(AH3 x){AH3 tiny=AH3_AW3(AW3_(1));return AW3_AH3(x*tiny);}
+  AW4 AZolH4ToW4(AH4 x){AH4 tiny=AH4_AW4(AW4_(1));return AW4_AH4(x*tiny);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // AMD arch lacks a packed conversion opcode.
+  AH1 AZolW1ToH1(AW1 x){return AH1_AW1(x*AW1_AH1(AH1_(1.0)));} // Integer multiply by the bit pattern of half 1.0, then reinterpret the bits as half.
+  AH2 AZolW2ToH2(AW2 x){return AH2_AW2(x*AW2_AH2(AH2_(1.0)));}
+  AH3 AZolW3ToH3(AW3 x){return AH3_AW3(x*AW3_AH3(AH3_(1.0)));} AH3 AZolW1ToH3(AW3 x){return AZolW3ToH3(x);} // Second name is the original misnamed entry point, kept for existing callers.
+  AH4 AZolW4ToH4(AW4 x){return AH4_AW4(x*AW4_AH4(AH4_(1.0)));} AH4 AZolW2ToH4(AW4 x){return AZolW4ToH4(x);} // Second name is the original misnamed entry point, kept for existing callers.
+//==============================================================================================================================
+  AH1 AZolAndH1(AH1 x,AH1 y){AH1 r=min(x,y);return r;} // AND of {0.0,1.0} values via min.
+  AH2 AZolAndH2(AH2 x,AH2 y){AH2 r=min(x,y);return r;}
+  AH3 AZolAndH3(AH3 x,AH3 y){AH3 r=min(x,y);return r;}
+  AH4 AZolAndH4(AH4 x,AH4 y){AH4 r=min(x,y);return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ASolAndNotH1(AH1 x,AH1 y){AH1 nx=-x;return nx*y+AH1_(1.0);} // 1-x*y, i.e. !(x&y). NOTE(review): prefix is 'ASol' while every sibling uses 'AZol' - possibly a typo, kept for callers.
+  AH2 ASolAndNotH2(AH2 x,AH2 y){AH2 nx=-x;return nx*y+AH2_(1.0);}
+  AH3 ASolAndNotH3(AH3 x,AH3 y){AH3 nx=-x;return nx*y+AH3_(1.0);}
+  AH4 ASolAndNotH4(AH4 x,AH4 y){AH4 nx=-x;return nx*y+AH4_(1.0);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AZolAndOrH1(AH1 x,AH1 y,AH1 z){AH1 t=x*y+z;return ASatH1(t);} // (x&y)|z as a saturated multiply-add.
+  AH2 AZolAndOrH2(AH2 x,AH2 y,AH2 z){AH2 t=x*y+z;return ASatH2(t);}
+  AH3 AZolAndOrH3(AH3 x,AH3 y,AH3 z){AH3 t=x*y+z;return ASatH3(t);}
+  AH4 AZolAndOrH4(AH4 x,AH4 y,AH4 z){AH4 t=x*y+z;return ASatH4(t);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AZolGtZeroH1(AH1 x){AH1 k=AH1_(A_INFP_H);return ASatH1(x*k);} // Scale by A_INFP_H and saturate. NOTE(review): the x==0 result depends on how ASat* treats 0*INF - confirm.
+  AH2 AZolGtZeroH2(AH2 x){AH2 k=AH2_(A_INFP_H);return ASatH2(x*k);}
+  AH3 AZolGtZeroH3(AH3 x){AH3 k=AH3_(A_INFP_H);return ASatH3(x*k);}
+  AH4 AZolGtZeroH4(AH4 x){AH4 k=AH4_(A_INFP_H);return ASatH4(x*k);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AZolNotH1(AH1 x){AH1 one=AH1_(1.0);return one-x;} // NOT of a {0.0,1.0} value.
+  AH2 AZolNotH2(AH2 x){AH2 one=AH2_(1.0);return one-x;}
+  AH3 AZolNotH3(AH3 x){AH3 one=AH3_(1.0);return one-x;}
+  AH4 AZolNotH4(AH4 x){AH4 one=AH4_(1.0);return one-x;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AZolOrH1(AH1 x,AH1 y){AH1 r=max(x,y);return r;} // OR of {0.0,1.0} values via max.
+  AH2 AZolOrH2(AH2 x,AH2 y){AH2 r=max(x,y);return r;}
+  AH3 AZolOrH3(AH3 x,AH3 y){AH3 r=max(x,y);return r;}
+  AH4 AZolOrH4(AH4 x,AH4 y){AH4 r=max(x,y);return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AZolSelH1(AH1 x,AH1 y,AH1 z){AH1 keep=(-x)*z+z;AH1 pick=x*y;return pick+keep;} // x?y:z for x in {0,1}: keep=(1-x)*z, pick=x*y.
+  AH2 AZolSelH2(AH2 x,AH2 y,AH2 z){AH2 keep=(-x)*z+z;AH2 pick=x*y;return pick+keep;}
+  AH3 AZolSelH3(AH3 x,AH3 y,AH3 z){AH3 keep=(-x)*z+z;AH3 pick=x*y;return pick+keep;}
+  AH4 AZolSelH4(AH4 x,AH4 y,AH4 z){AH4 keep=(-x)*z+z;AH4 pick=x*y;return pick+keep;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AZolSignedH1(AH1 x){AH1 k=AH1_(A_INFN_H);return ASatH1(x*k);} // Scale by A_INFN_H and saturate. NOTE(review): the x==0 result depends on how ASat* treats 0*INF - confirm.
+  AH2 AZolSignedH2(AH2 x){AH2 k=AH2_(A_INFN_H);return ASatH2(x*k);}
+  AH3 AZolSignedH3(AH3 x){AH3 k=AH3_(A_INFN_H);return ASatH3(x*k);}
+  AH4 AZolSignedH4(AH4 x){AH4 k=AH4_(A_INFN_H);return ASatH4(x*k);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                      COLOR CONVERSIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// These are all linear to/from some other space (where 'linear' has been shortened out of the function name).
+// So 'ToGamma' is 'LinearToGamma', and 'FromGamma' is 'LinearFromGamma'.
+// These are branch free implementations.
+// The AToSrgbF1() function is useful for stores for compute shaders for GPUs without hardware linear->sRGB store conversion.
+//------------------------------------------------------------------------------------------------------------------------------
+// TRANSFER FUNCTIONS
+// ==================
+// 709 ..... Rec709 used for some HDTVs
+// Gamma ... Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native
+// Pq ...... PQ native for HDR10
+// Srgb .... The sRGB output, typical of PC displays, useful for 10-bit output, or storing to 8-bit UNORM without SRGB type
+// Two ..... Gamma 2.0, fastest conversion (useful for intermediate pass approximations)
+// Three ... Gamma 3.0, less fast, but good for HDR.
+//------------------------------------------------------------------------------------------------------------------------------
+// KEEPING TO SPEC
+// ===============
+// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times.
+//  (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range).
+//  (b.) For 8-bit  709, steps {0 to 20.7} are in the linear region (8% of the encoding range).
+// Also there is a slight step in the transition regions.
+// Precision of the coefficients in the spec being the likely cause.
+// Main usage case of the sRGB code is to do the linear->sRGB conversion in a compute shader before store.
+// This is to work around lack of hardware (typically only ROP does the conversion for free).
+// To "correct" the linear segment, would be to introduce error, because hardware decode of sRGB->linear is fixed (and free).
+// So this header keeps with the spec.
+// For linear->sRGB transforms, the linear segment in some respects reduces error, because rounding in that region is linear.
+// Rounding in the curved region in hardware (and fast software code) introduces error due to rounding in non-linear.
+//------------------------------------------------------------------------------------------------------------------------------
+// FOR PQ
+// ======
+// Both input and output is {0.0-1.0}, and where output 1.0 represents 10000.0 cd/m^2.
+// All constants are only specified to FP32 precision.
+// External PQ source reference,
+//  - https://github.com/ampas/aces-dev/blob/master/transforms/ctl/utilities/ACESlib.Utilities_Color.a1.0.1.ctl
+//------------------------------------------------------------------------------------------------------------------------------
+// PACKED VERSIONS
+// ===============
+// These are the A*H2() functions.
+// There are no PQ functions as FP16 seemed to not have enough precision for the conversion.
+// The remaining functions are "good enough" for 8-bit, and maybe 10-bit if not concerned about a few 1-bit errors.
+// Precision is lowest in the 709 conversion, higher in sRGB, higher still in Two and Gamma (when using 2.2 at least).
+//------------------------------------------------------------------------------------------------------------------------------
+// NOTES
+// =====
+// Could be faster for PQ conversions to be in ALU or a texture lookup depending on usage case.
+//==============================================================================================================================
+ #if 1
+  AF1 ATo709F1(AF1 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);AF1 l=c*j.y;AF1 g=pow(c,j.z)*k.x+k.y; // Linear -> Rec709. j.x: encoded knee, l: linear segment, g: power segment.
+   return max(min(j.x,l),min(max(j.x,l),g));} // True median of {j.x,l,g}. clamp(j.x,l,g) was used as a median, but clamp() is undefined when its min bound exceeds its max bound (l>g here).
+  AF2 ATo709F2(AF2 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);AF2 l=c*j.yy;AF2 g=pow(c,j.zz)*k.xx+k.yy;
+   return max(min(j.xx,l),min(max(j.xx,l),g));} // Component-wise median, as in the scalar version.
+  AF3 ATo709F3(AF3 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);AF3 l=c*j.yyy;AF3 g=pow(c,j.zzz)*k.xxx+k.yyy;
+   return max(min(j.xxx,l),min(max(j.xxx,l),g));} // Component-wise median, as in the scalar version.
+//------------------------------------------------------------------------------------------------------------------------------
+  // Note 'rcpX' is '1/x', where the 'x' is what would be used in AFromGamma().
+  AF1 AToGammaF1(AF1 c,AF1 rcpX){AF1 e=AF1_(rcpX);return pow(c,e);} // c raised to the reciprocal gamma exponent.
+  AF2 AToGammaF2(AF2 c,AF1 rcpX){AF2 e=AF2_(rcpX);return pow(c,e);} // Scalar exponent broadcast to every component.
+  AF3 AToGammaF3(AF3 c,AF1 rcpX){AF3 e=AF3_(rcpX);return pow(c,e);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AToPqF1(AF1 x){AF1 p=pow(x,AF1_(0.159302)); // Linear {0..1} -> PQ, where 1.0 represents 10000 cd/m^2.
+   return pow((AF1_(0.835938)+AF1_(18.8516)*p)/(AF1_(1.0)+AF1_(18.6875)*p),AF1_(78.8438));}
+  AF2 AToPqF2(AF2 x){AF2 p=pow(x,AF2_(0.159302)); // 2-wide version under the suffix used by the sibling families.
+   return pow((AF2_(0.835938)+AF2_(18.8516)*p)/(AF2_(1.0)+AF2_(18.6875)*p),AF2_(78.8438));} AF2 AToPqF1(AF2 x){return AToPqF2(x);} // Original 'F1' overload kept for existing callers.
+  AF3 AToPqF3(AF3 x){AF3 p=pow(x,AF3_(0.159302)); // 3-wide version under the suffix used by the sibling families.
+   return pow((AF3_(0.835938)+AF3_(18.8516)*p)/(AF3_(1.0)+AF3_(18.6875)*p),AF3_(78.8438));} AF3 AToPqF1(AF3 x){return AToPqF3(x);} // Original 'F1' overload kept for existing callers.
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AToSrgbF1(AF1 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
+   return clamp(j.x  ,c*j.y  ,pow(c,j.z  )*k.x  +k.y  );}
+  AF2 AToSrgbF2(AF2 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
+   return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );}
+  AF3 AToSrgbF3(AF3 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
+   return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AToTwoF1(AF1 c){return sqrt(c);}
+  AF2 AToTwoF2(AF2 c){return sqrt(c);}
+  AF3 AToTwoF3(AF3 c){return sqrt(c);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AToThreeF1(AF1 c){return pow(c,AF1_(1.0/3.0));}
+  AF2 AToThreeF2(AF2 c){return pow(c,AF2_(1.0/3.0));}
+  AF3 AToThreeF3(AF3 c){return pow(c,AF3_(1.0/3.0));}
+ #endif
+//==============================================================================================================================
+ #if 1 // AFrom*: encoded -> linear counterparts of ATo*, in scalar/2/3-component float forms.
+  // Unfortunately median won't work here; AZolSel*/AZolSigned* (defined elsewhere) combine the two branches using c-j.x.
+  AF1 AFrom709F1(AF1 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
+   return AZolSelF1(AZolSignedF1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
+  AF2 AFrom709F2(AF2 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
+   return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
+  AF3 AFrom709F3(AF3 c){AF3 j=AF3(0.081/4.5,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
+   return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AFromGammaF1(AF1 c,AF1 x){return pow(c,AF1_(x));} // 'x' is the gamma exponent itself (not its reciprocal).
+  AF2 AFromGammaF2(AF2 c,AF1 x){return pow(c,AF2_(x));}
+  AF3 AFromGammaF3(AF3 c,AF1 x){return pow(c,AF3_(x));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AFromPqF1(AF1 x){AF1 p=pow(x,AF1_(0.0126833));
+   return pow(ASatF1(p-AF1_(0.835938))/(AF1_(18.8516)-AF1_(18.6875)*p),AF1_(6.27739));}
+  AF2 AFromPqF2(AF2 x){AF2 p=pow(x,AF2_(0.0126833)); // F2/F3 suffixes match the other families in this section.
+   return pow(ASatF2(p-AF2_(0.835938))/(AF2_(18.8516)-AF2_(18.6875)*p),AF2_(6.27739));} AF2 AFromPqF1(AF2 x){return AFromPqF2(x);}
+  AF3 AFromPqF3(AF3 x){AF3 p=pow(x,AF3_(0.0126833)); // The AFromPqF1(AF2/AF3) overloads remain for existing callers.
+   return pow(ASatF3(p-AF3_(0.835938))/(AF3_(18.8516)-AF3_(18.6875)*p),AF3_(6.27739));} AF3 AFromPqF1(AF3 x){return AFromPqF3(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Unfortunately median won't work here.
+  AF1 AFromSrgbF1(AF1 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
+   return AZolSelF1(AZolSignedF1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
+  AF2 AFromSrgbF2(AF2 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
+   return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
+  AF3 AFromSrgbF3(AF3 c){AF3 j=AF3(0.04045/12.92,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
+   return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AFromTwoF1(AF1 c){return c*c;} // Gamma 2.0 decode is a square.
+  AF2 AFromTwoF2(AF2 c){return c*c;}
+  AF3 AFromTwoF3(AF3 c){return c*c;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AFromThreeF1(AF1 c){return c*c*c;} // Gamma 3.0 decode is a cube.
+  AF2 AFromThreeF2(AF2 c){return c*c*c;}
+  AF3 AFromThreeF3(AF3 c){return c*c*c;}
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF // FP16 ATo*: 709/sRGB take an explicit median(j.x,linear,power); clamp() is undefined for min>max. No PQ here.
+  AH1 ATo709H1(AH1 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
+   AH1 l=c*j.y  ;AH1 p=pow(c,j.z  )*k.x  +k.y  ;return max(min(j.x  ,l),min(max(j.x  ,l),p));}
+  AH2 ATo709H2(AH2 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
+   AH2 l=c*j.yy ;AH2 p=pow(c,j.zz )*k.xx +k.yy ;return max(min(j.xx ,l),min(max(j.xx ,l),p));}
+  AH3 ATo709H3(AH3 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
+   AH3 l=c*j.yyy;AH3 p=pow(c,j.zzz)*k.xxx+k.yyy;return max(min(j.xxx,l),min(max(j.xxx,l),p));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AToGammaH1(AH1 c,AH1 rcpX){return pow(c,AH1_(rcpX));} // 'rcpX' is the reciprocal of the gamma exponent.
+  AH2 AToGammaH2(AH2 c,AH1 rcpX){return pow(c,AH2_(rcpX));}
+  AH3 AToGammaH3(AH3 c,AH1 rcpX){return pow(c,AH3_(rcpX));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AToSrgbH1(AH1 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
+   AH1 l=c*j.y  ;AH1 p=pow(c,j.z  )*k.x  +k.y  ;return max(min(j.x  ,l),min(max(j.x  ,l),p));}
+  AH2 AToSrgbH2(AH2 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
+   AH2 l=c*j.yy ;AH2 p=pow(c,j.zz )*k.xx +k.yy ;return max(min(j.xx ,l),min(max(j.xx ,l),p));}
+  AH3 AToSrgbH3(AH3 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
+   AH3 l=c*j.yyy;AH3 p=pow(c,j.zzz)*k.xxx+k.yyy;return max(min(j.xxx,l),min(max(j.xxx,l),p));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AToTwoH1(AH1 c){return sqrt(c);}
+  AH2 AToTwoH2(AH2 c){return sqrt(c);}
+  AH3 AToTwoH3(AH3 c){return sqrt(c);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AToThreeH1(AH1 c){return pow(c,AH1_(1.0/3.0));} AH1 AToThreeF1(AH1 c){return AToThreeH1(c);} // F-suffixed name kept
+  AH2 AToThreeH2(AH2 c){return pow(c,AH2_(1.0/3.0));} AH2 AToThreeF2(AH2 c){return AToThreeH2(c);} // for existing callers;
+  AH3 AToThreeH3(AH3 c){return pow(c,AH3_(1.0/3.0));} AH3 AToThreeF3(AH3 c){return AToThreeH3(c);} // H names are preferred.
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF // FP16 AFrom*: encoded -> linear. AHromSrgbF* (original misspelling) forward to the AFromSrgbH* names.
+  AH1 AFrom709H1(AH1 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
+   return AZolSelH1(AZolSignedH1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
+  AH2 AFrom709H2(AH2 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
+   return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
+  AH3 AFrom709H3(AH3 c){AH3 j=AH3(0.081/4.5,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
+   return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AFromGammaH1(AH1 c,AH1 x){return pow(c,AH1_(x));} // 'x' is the gamma exponent itself (not its reciprocal).
+  AH2 AFromGammaH2(AH2 c,AH1 x){return pow(c,AH2_(x));}
+  AH3 AFromGammaH3(AH3 c,AH1 x){return pow(c,AH3_(x));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AFromSrgbH1(AH1 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
+   return AZolSelH1(AZolSignedH1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));} AH1 AHromSrgbF1(AH1 c){return AFromSrgbH1(c);}
+  AH2 AFromSrgbH2(AH2 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
+   return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} AH2 AHromSrgbF2(AH2 c){return AFromSrgbH2(c);}
+  AH3 AFromSrgbH3(AH3 c){AH3 j=AH3(0.04045/12.92,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
+   return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));} AH3 AHromSrgbF3(AH3 c){return AFromSrgbH3(c);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AFromTwoH1(AH1 c){return c*c;}
+  AH2 AFromTwoH2(AH2 c){return c*c;}
+  AH3 AFromTwoH3(AH3 c){return c*c;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AFromThreeH1(AH1 c){return c*c*c;}
+  AH2 AFromThreeH2(AH2 c){return c*c*c;}
+  AH3 AFromThreeH3(AH3 c){return c*c*c;}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                          CS REMAP
+//==============================================================================================================================
+ // Simple remap of a 64x1 index 'a' to 8x8 (x,y), with rotated 2x2 pixel quads in quad linear.
+ //  543210  <- bit of 'a'
+ //  ======
+ //  ..xxx.  <- bits feeding x
+ //  yy...y  <- bits feeding y
+ AU2 ARmp8x8(AU1 a){AU1 x=ABfe(a,1u,3u);AU1 y=ABfiM(ABfe(a,3u,3u),a,1u);return AU2(x,y);}
+//==============================================================================================================================
+ // More complex remap of a 64x1 index 'a' to 8x8 (x,y), which is necessary for 2D wave reductions.
+ //  543210  <- bit of 'a'
+ //  ======
+ //  .xx..x  <- bits feeding x
+ //  y..yy.  <- bits feeding y
+ // Details (each entry is the hex lane index that lands on that cell),
+ //  LANE TO 8x8 MAPPING
+ //  ===================
+ //  00 01 08 09 10 11 18 19
+ //  02 03 0a 0b 12 13 1a 1b
+ //  04 05 0c 0d 14 15 1c 1d
+ //  06 07 0e 0f 16 17 1e 1f
+ //  20 21 28 29 30 31 38 39
+ //  22 23 2a 2b 32 33 3a 3b
+ //  24 25 2c 2d 34 35 3c 3d
+ //  26 27 2e 2f 36 37 3e 3f
+ AU2 ARmpRed8x8(AU1 a){AU1 x=ABfiM(ABfe(a,2u,3u),a,1u);AU1 y=ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u);return AU2(x,y);}
+//==============================================================================================================================
+ #ifdef A_HALF // Same two remaps as ARmp8x8/ARmpRed8x8, returned as AW2.
+  AW2 ARmp8x8H(AU1 a){AU1 x=ABfe(a,1u,3u);AU1 y=ABfiM(ABfe(a,3u,3u),a,1u);return AW2(x,y);}
+  AW2 ARmpRed8x8H(AU1 a){AU1 x=ABfiM(ABfe(a,2u,3u),a,1u);AU1 y=ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u);return AW2(x,y);}
+ #endif
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                                          REFERENCE
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// IEEE FLOAT RULES
+// ================
+//  - saturate(NaN)=0, saturate(-INF)=0, saturate(+INF)=1
+//  - {+/-}0 * {+/-}INF = NaN
+//  - -INF + (+INF) = NaN
+//  - {+/-}0 / {+/-}0 = NaN
+//  - {+/-}INF / {+/-}INF = NaN
+//  - a<(-0) := sqrt(a) = NaN (a=-0.0 won't NaN)
+//  - 0 == -0
+//  - 4/0 = +INF
+//  - 4/-0 = -INF
+//  - 4+INF = +INF
+//  - 4-INF = -INF
+//  - 4*(+INF) = +INF
+//  - 4*(-INF) = -INF
+//  - -4*(+INF) = -INF
+//  - sqrt(+INF) = +INF
+//------------------------------------------------------------------------------------------------------------------------------
+// FP16 ENCODING
+// =============
+// fedcba9876543210
+// ----------------
+// ......mmmmmmmmmm  10-bit mantissa (encodes 11-bit 0.5 to 1.0 except for denormals)
+// .eeeee..........  5-bit exponent
+// .00000..........  denormals
+// .00001..........  -14 exponent
+// .11110..........   15 exponent
+// .111110000000000  infinity
+// .11111nnnnnnnnnn  NaN with n!=0
+// s...............  sign
+//------------------------------------------------------------------------------------------------------------------------------
+// FP16/INT16 ALIASING DENORMAL
+// ============================
+// 11-bit unsigned integers alias with half float denormal/normal values,
+//     1 = 2^(-24) = 1/16777216 ....................... first denormal value
+//     2 = 2^(-23)
+//   ...
+//  1023 = 2^(-14)*(1-2^(-10)) = 2^(-14)*(1-1/1024) ... last denormal value
+//  1024 = 2^(-14) = 1/16384 .......................... first normal value that still maps to integers
+//  2047 .............................................. last normal value that still maps to integers 
+// Scaling limits,
+//  2^15 = 32768 ...................................... largest power of 2 scaling
+// Largest pow2 conversion mapping is at *32768,
+//     1 : 2^(-9) = 1/512
+//     2 : 1/256
+//     4 : 1/128
+//     8 : 1/64
+//    16 : 1/32
+//    32 : 1/16
+//    64 : 1/8
+//   128 : 1/4
+//   256 : 1/2
+//   512 : 1
+//  1024 : 2
+//  2047 : a little less than 4
+//==============================================================================================================================
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+//                                                     GPU/CPU PORTABILITY
+//
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// This is the GPU implementation.
+// See the CPU implementation for docs.
+//==============================================================================================================================
+#ifdef A_GPU // GPU side of the portability layer; this region is closed by the '#endif' after the op*F* vector ops.
+ #define A_TRUE true // Boolean literals map straight onto the shader language keywords.
+ #define A_FALSE false
+ #define A_STATIC // Expands to nothing on GPU.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                     VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
+//==============================================================================================================================
+ #define retAD2 AD2 // ret*: vector return type; on GPU it is simply the vector type.
+ #define retAD3 AD3
+ #define retAD4 AD4
+ #define retAF2 AF2
+ #define retAF3 AF3
+ #define retAF4 AF4
+ #define retAL2 AL2
+ #define retAL3 AL3
+ #define retAL4 AL4
+ #define retAU2 AU2
+ #define retAU3 AU3
+ #define retAU4 AU4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define inAD2 in AD2 // in*: input vector parameter, the 'in' qualifier followed by the type.
+ #define inAD3 in AD3
+ #define inAD4 in AD4
+ #define inAF2 in AF2
+ #define inAF3 in AF3
+ #define inAF4 in AF4
+ #define inAL2 in AL2
+ #define inAL3 in AL3
+ #define inAL4 in AL4
+ #define inAU2 in AU2
+ #define inAU3 in AU3
+ #define inAU4 in AU4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define inoutAD2 inout AD2 // inout*: read/write vector parameter, the 'inout' qualifier followed by the type.
+ #define inoutAD3 inout AD3
+ #define inoutAD4 inout AD4
+ #define inoutAF2 inout AF2
+ #define inoutAF3 inout AF3
+ #define inoutAF4 inout AF4
+ #define inoutAL2 inout AL2
+ #define inoutAL3 inout AL3
+ #define inoutAL4 inout AL4
+ #define inoutAU2 inout AU2
+ #define inoutAU3 inout AU3
+ #define inoutAU4 inout AU4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define outAD2 out AD2 // out*: output vector parameter, the 'out' qualifier followed by the type.
+ #define outAD3 out AD3
+ #define outAD4 out AD4
+ #define outAF2 out AF2
+ #define outAF3 out AF3
+ #define outAF4 out AF4
+ #define outAL2 out AL2
+ #define outAL3 out AL3
+ #define outAL4 out AL4
+ #define outAU2 out AU2
+ #define outAU3 out AU3
+ #define outAU4 out AU4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define varAD2(x) AD2 x // var*(x): declares a vector variable named 'x'.
+ #define varAD3(x) AD3 x
+ #define varAD4(x) AD4 x
+ #define varAF2(x) AF2 x
+ #define varAF3(x) AF3 x
+ #define varAF4(x) AF4 x
+ #define varAL2(x) AL2 x
+ #define varAL3(x) AL3 x
+ #define varAL4(x) AL4 x
+ #define varAU2(x) AU2 x
+ #define varAU3(x) AU3 x
+ #define varAU4(x) AU4 x
+//------------------------------------------------------------------------------------------------------------------------------
+ #define initAD2(x,y) AD2(x,y) // init*(...): builds the vector by calling the type's constructor with the components.
+ #define initAD3(x,y,z) AD3(x,y,z)
+ #define initAD4(x,y,z,w) AD4(x,y,z,w)
+ #define initAF2(x,y) AF2(x,y)
+ #define initAF3(x,y,z) AF3(x,y,z)
+ #define initAF4(x,y,z,w) AF4(x,y,z,w)
+ #define initAL2(x,y) AL2(x,y)
+ #define initAL3(x,y,z) AL3(x,y,z)
+ #define initAL4(x,y,z,w) AL4(x,y,z,w)
+ #define initAU2(x,y) AU2(x,y)
+ #define initAU3(x,y,z) AU3(x,y,z)
+ #define initAU4(x,y,z,w) AU4(x,y,z,w)
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     SCALAR RETURN OPS
+//==============================================================================================================================
+ #define AAbsD1(a) abs(AD1(a)) // Scalar wrappers: the argument is converted to the suffix type, then the built-in is called.
+ #define AAbsF1(a) abs(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ACosD1(a) cos(AD1(a))
+ #define ACosF1(a) cos(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ADotD2(a,b) dot(AD2(a),AD2(b)) // Dot takes vectors but returns a scalar, hence its place in this section.
+ #define ADotD3(a,b) dot(AD3(a),AD3(b))
+ #define ADotD4(a,b) dot(AD4(a),AD4(b))
+ #define ADotF2(a,b) dot(AF2(a),AF2(b))
+ #define ADotF3(a,b) dot(AF3(a),AF3(b))
+ #define ADotF4(a,b) dot(AF4(a),AF4(b))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AExp2D1(a) exp2(AD1(a))
+ #define AExp2F1(a) exp2(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AFloorD1(a) floor(AD1(a))
+ #define AFloorF1(a) floor(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ALog2D1(a) log2(AD1(a))
+ #define ALog2F1(a) log2(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AMaxD1(a,b) max(a,b) // Max/Min forward their arguments as-is, with no conversion to the suffix type.
+ #define AMaxF1(a,b) max(a,b)
+ #define AMaxL1(a,b) max(a,b)
+ #define AMaxU1(a,b) max(a,b)
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AMinD1(a,b) min(a,b)
+ #define AMinF1(a,b) min(a,b)
+ #define AMinL1(a,b) min(a,b)
+ #define AMinU1(a,b) min(a,b)
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ASinD1(a) sin(AD1(a))
+ #define ASinF1(a) sin(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ASqrtD1(a) sqrt(AD1(a))
+ #define ASqrtF1(a) sqrt(AF1(a))
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                               SCALAR RETURN OPS - DEPENDENT
+//==============================================================================================================================
+ #define APowD1(a,b) pow(AD1(a),AD1(b)) // Base and exponent are both converted to the suffix type, as in APowF1.
+ #define APowF1(a,b) pow(AF1(a),AF1(b))
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                         VECTOR OPS
+//------------------------------------------------------------------------------------------------------------------------------
+// These are added as needed for production or prototyping, so not necessarily a complete set.
+// They follow a convention of taking in a destination and also returning the destination value to increase utility.
+//==============================================================================================================================
+ #ifdef A_DUBL // op*D*: each helper stores its result in the destination 'd' and returns that same value.
+  AD2 opAAbsD2(outAD2 d,inAD2 a){return d=abs(a);}
+  AD3 opAAbsD3(outAD3 d,inAD3 a){return d=abs(a);}
+  AD4 opAAbsD4(outAD4 d,inAD4 a){return d=abs(a);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){return d=a+b;}
+  AD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){return d=a+b;}
+  AD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){return d=a+b;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){return d=a+AD2_(b);} // *One variants take a scalar second operand.
+  AD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){return d=a+AD3_(b);}
+  AD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){return d=a+AD4_(b);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD2 opACpyD2(outAD2 d,inAD2 a){return d=a;}
+  AD3 opACpyD3(outAD3 d,inAD3 a){return d=a;}
+  AD4 opACpyD4(outAD4 d,inAD4 a){return d=a;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){return d=ALerpD2(a,b,c);}
+  AD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){return d=ALerpD3(a,b,c);}
+  AD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){return d=ALerpD4(a,b,c);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){return d=ALerpD2(a,b,AD2_(c));}
+  AD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){return d=ALerpD3(a,b,AD3_(c));}
+  AD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){return d=ALerpD4(a,b,AD4_(c));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){return d=max(a,b);}
+  AD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){return d=max(a,b);}
+  AD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){return d=max(a,b);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){return d=min(a,b);}
+  AD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){return d=min(a,b);}
+  AD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){return d=min(a,b);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){return d=a*b;}
+  AD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){return d=a*b;}
+  AD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){return d=a*b;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){return d=a*AD2_(b);}
+  AD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){return d=a*AD3_(b);}
+  AD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){return d=a*AD4_(b);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD2 opANegD2(outAD2 d,inAD2 a){return d=-a;}
+  AD3 opANegD3(outAD3 d,inAD3 a){return d=-a;}
+  AD4 opANegD4(outAD4 d,inAD4 a){return d=-a;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD2 opARcpD2(outAD2 d,inAD2 a){return d=ARcpD2(a);}
+  AD3 opARcpD3(outAD3 d,inAD3 a){return d=ARcpD3(a);}
+  AD4 opARcpD4(outAD4 d,inAD4 a){return d=ARcpD4(a);}
+ #endif
+//==============================================================================================================================
+ AF2 opAAbsF2(outAF2 d,inAF2 a){return d=abs(a);} // op*F*: each helper stores its result in 'd' and returns that same value.
+ AF3 opAAbsF3(outAF3 d,inAF3 a){return d=abs(a);}
+ AF4 opAAbsF4(outAF4 d,inAF4 a){return d=abs(a);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){return d=a+b;}
+ AF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){return d=a+b;}
+ AF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){return d=a+b;}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){return d=a+AF2_(b);} // *One variants take a scalar second operand.
+ AF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){return d=a+AF3_(b);}
+ AF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){return d=a+AF4_(b);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 opACpyF2(outAF2 d,inAF2 a){return d=a;}
+ AF3 opACpyF3(outAF3 d,inAF3 a){return d=a;}
+ AF4 opACpyF4(outAF4 d,inAF4 a){return d=a;}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){return d=ALerpF2(a,b,c);}
+ AF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){return d=ALerpF3(a,b,c);}
+ AF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){return d=ALerpF4(a,b,c);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){return d=ALerpF2(a,b,AF2_(c));}
+ AF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){return d=ALerpF3(a,b,AF3_(c));}
+ AF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){return d=ALerpF4(a,b,AF4_(c));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){return d=max(a,b);}
+ AF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){return d=max(a,b);}
+ AF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){return d=max(a,b);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){return d=min(a,b);}
+ AF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){return d=min(a,b);}
+ AF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){return d=min(a,b);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){return d=a*b;}
+ AF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){return d=a*b;}
+ AF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){return d=a*b;}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){return d=a*AF2_(b);}
+ AF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){return d=a*AF3_(b);}
+ AF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){return d=a*AF4_(b);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 opANegF2(outAF2 d,inAF2 a){return d=-a;}
+ AF3 opANegF3(outAF3 d,inAF3 a){return d=-a;}
+ AF4 opANegF4(outAF4 d,inAF4 a){return d=-a;}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 opARcpF2(outAF2 d,inAF2 a){return d=ARcpF2(a);}
+ AF3 opARcpF3(outAF3 d,inAF3 a){return d=ARcpF3(a);}
+ AF4 opARcpF4(outAF4 d,inAF4 a){return d=ARcpF4(a);}
+#endif
diff --git a/Ryujinx.Graphics.OpenGL/Effects/Shaders/ffx_fsr1.h b/Ryujinx.Graphics.OpenGL/Effects/Shaders/ffx_fsr1.h
new file mode 100644
index 0000000000..4e0b3d5485
--- /dev/null
+++ b/Ryujinx.Graphics.OpenGL/Effects/Shaders/ffx_fsr1.h
@@ -0,0 +1,1199 @@
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+//                    AMD FidelityFX SUPER RESOLUTION [FSR 1] ::: SPATIAL SCALING & EXTRAS - v1.20210629
+//
+//
+//------------------------------------------------------------------------------------------------------------------------------
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//------------------------------------------------------------------------------------------------------------------------------
+// FidelityFX Super Resolution Sample
+//
+// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files(the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions :
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//------------------------------------------------------------------------------------------------------------------------------
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//------------------------------------------------------------------------------------------------------------------------------
+// ABOUT
+// =====
+// FSR is a collection of algorithms relating to generating a higher resolution image.
+// This specific header focuses on single-image non-temporal image scaling, and related tools.
+// 
+// The core functions are EASU and RCAS:
+//  [EASU] Edge Adaptive Spatial Upsampling ....... 1x to 4x area range spatial scaling, clamped adaptive elliptical filter.
+//  [RCAS] Robust Contrast Adaptive Sharpening .... A non-scaling variation on CAS.
+// RCAS needs to be applied after EASU as a separate pass.
+// 
+// Optional utility functions are:
+//  [LFGA] Linear Film Grain Applicator ........... Tool to apply film grain after scaling.
+//  [SRTM] Simple Reversible Tone-Mapper .......... Linear HDR {0 to FP16_MAX} to {0 to 1} and back.
+//  [TEPD] Temporal Energy Preserving Dither ...... Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion.
+// See each individual sub-section for inline documentation.
+//------------------------------------------------------------------------------------------------------------------------------
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//------------------------------------------------------------------------------------------------------------------------------
+// FUNCTION PERMUTATIONS
+// =====================
+// *F() ..... Single item computation with 32-bit.
+// *H() ..... Single item computation with 16-bit, with packing (aka two 16-bit ops in parallel) when possible.
+// *Hx2() ... Processing two items in parallel with 16-bit, easier packing.
+//            Not all interfaces in this file have a *Hx2() form.
+//==============================================================================================================================
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                        FSR - [EASU] EDGE ADAPTIVE SPATIAL UPSAMPLING
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// EASU provides a high quality spatial-only scaling at relatively low cost.
+// Meaning EASU is appropriate for laptops and other low-end GPUs.
+// Quality from 1x to 4x area scaling is good.
+//------------------------------------------------------------------------------------------------------------------------------
+// The scaler uses a modified fast approximation to the standard lanczos(size=2) kernel.
+// EASU runs in a single pass, so it applies a directionally and anisotropically adaptive radial lanczos.
+// This is also kept as simple as possible to have minimum runtime.
+//------------------------------------------------------------------------------------------------------------------------------
+// The lanczos filter has negative lobes, so by itself it will introduce ringing.
+// To remove all ringing, the algorithm uses the nearest 2x2 input texels as a neighborhood,
+// and limits output to the minimum and maximum of that neighborhood.
+//------------------------------------------------------------------------------------------------------------------------------
+// Input image requirements:
+// 
+// Color needs to be encoded as 3 channels [red, green, blue] (e.g. XYZ is not supported)
+// Each channel needs to be in the range [0, 1]
+// Any color primaries are supported
+// Display / tonemapping curve needs to be as if presenting to an sRGB display or similar (e.g. Gamma 2.0)
+// There should be no banding in the input
+// There should be no high amplitude noise in the input
+// There should be no noise in the input that is not at input pixel granularity
+// For performance purposes, use 32bpp formats
+//------------------------------------------------------------------------------------------------------------------------------
+// Best to apply EASU at the end of the frame after tonemapping 
+// but before film grain or composite of the UI.
+//------------------------------------------------------------------------------------------------------------------------------
+// Example of including this header for D3D HLSL :
+// 
+//  #define A_GPU 1
+//  #define A_HLSL 1
+//  #define A_HALF 1
+//  #include "ffx_a.h"
+//  #define FSR_EASU_H 1
+//  #define FSR_RCAS_H 1
+//  //declare input callbacks
+//  #include "ffx_fsr1.h"
+// 
+// Example of including this header for Vulkan GLSL :
+// 
+//  #define A_GPU 1
+//  #define A_GLSL 1
+//  #define A_HALF 1
+//  #include "ffx_a.h"
+//  #define FSR_EASU_H 1
+//  #define FSR_RCAS_H 1
+//  //declare input callbacks
+//  #include "ffx_fsr1.h"
+// 
+// Example of including this header for Vulkan HLSL :
+// 
+//  #define A_GPU 1
+//  #define A_HLSL 1
+//  #define A_HLSL_6_2 1
+//  #define A_NO_16_BIT_CAST 1
+//  #define A_HALF 1
+//  #include "ffx_a.h"
+//  #define FSR_EASU_H 1
+//  #define FSR_RCAS_H 1
+//  //declare input callbacks
+//  #include "ffx_fsr1.h"
+// 
+//  Example of declaring the required input callbacks for GLSL :
+//  The callbacks need to gather4 for each color channel using the specified texture coordinate 'p'.
+//  EASU uses gather4 to reduce position computation logic and for free Arrays of Structures to Structures of Arrays conversion.
+// 
+//  AH4 FsrEasuRH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,0));}
+//  AH4 FsrEasuGH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,1));}
+//  AH4 FsrEasuBH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,2));}
+//  ...
+//  The FsrEasuCon function needs to be called from the CPU or GPU to set up constants.
+//  The difference in viewport and input image size is there to support Dynamic Resolution Scaling.
+//  To use FsrEasuCon() on the CPU, define A_CPU before including ffx_a and ffx_fsr1.
+//  Including a GPU example here, the 'con0' through 'con3' values would be stored out to a constant buffer.
+//  AU4 con0,con1,con2,con3;
+//  FsrEasuCon(con0,con1,con2,con3,
+//    1920.0,1080.0,  // Viewport size (top left aligned) in the input image which is to be scaled.
+//    3840.0,2160.0,  // The size of the input image.
+//    2560.0,1440.0); // The output resolution.
+//==============================================================================================================================
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                      CONSTANT SETUP
+//==============================================================================================================================
+// Call to set up the constant values consumed by FsrEasuF()/FsrEasuH() (works on CPU or GPU).
+A_STATIC void FsrEasuCon(
+outAU4 con0,
+outAU4 con1,
+outAU4 con2,
+outAU4 con3,
+// This is the rendered image resolution being upscaled
+AF1 inputViewportInPixelsX,
+AF1 inputViewportInPixelsY,
+// This is the resolution of the resource containing the input image (useful for dynamic resolution)
+AF1 inputSizeInPixelsX,
+AF1 inputSizeInPixelsY,
+// This is the display resolution which the input image gets upscaled to
+AF1 outputSizeInPixelsX,
+AF1 outputSizeInPixelsY){
+ // Output integer position to a pixel position in viewport (values are stored via AU1_AF1() and read back with AF2_AU2()).
+ con0[0]=AU1_AF1(inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)); // con0.xy: scale multiplied with the output pixel position.
+ con0[1]=AU1_AF1(inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY));
+ con0[2]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)-AF1_(0.5)); // con0.zw: bias added after the scale.
+ con0[3]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY)-AF1_(0.5));
+ // Viewport pixel position to normalized image space.
+ // This is used to get upper-left of 'F' tap.
+ con1[0]=AU1_AF1(ARcpF1(inputSizeInPixelsX));
+ con1[1]=AU1_AF1(ARcpF1(inputSizeInPixelsY));
+ // Centers of gather4, first offset from upper-left of 'F'.
+ //      +---+---+
+ //      |   |   |
+ //      +--(0)--+
+ //      | b | c |
+ //  +---F---+---+---+
+ //  | e | f | g | h |
+ //  +--(1)--+--(2)--+
+ //  | i | j | k | l |
+ //  +---+---+---+---+
+ //      | n | o |
+ //      +--(3)--+
+ //      |   |   |
+ //      +---+---+
+ con1[2]=AU1_AF1(AF1_( 1.0)*ARcpF1(inputSizeInPixelsX)); // con1.zw: offset from 'F' to gather center (0).
+ con1[3]=AU1_AF1(AF1_(-1.0)*ARcpF1(inputSizeInPixelsY));
+ // These are from (0) instead of 'F'.
+ con2[0]=AU1_AF1(AF1_(-1.0)*ARcpF1(inputSizeInPixelsX)); // con2.xy: offset from (0) to (1).
+ con2[1]=AU1_AF1(AF1_( 2.0)*ARcpF1(inputSizeInPixelsY));
+ con2[2]=AU1_AF1(AF1_( 1.0)*ARcpF1(inputSizeInPixelsX)); // con2.zw: offset from (0) to (2).
+ con2[3]=AU1_AF1(AF1_( 2.0)*ARcpF1(inputSizeInPixelsY));
+ con3[0]=AU1_AF1(AF1_( 0.0)*ARcpF1(inputSizeInPixelsX)); // con3.xy: offset from (0) to (3).
+ con3[1]=AU1_AF1(AF1_( 4.0)*ARcpF1(inputSizeInPixelsY));
+ con3[2]=con3[3]=0;} // con3.zw is not read by FsrEasuF()/FsrEasuH().
+
+// Variant of FsrEasuCon() for when the input image sits at an offset inside the input image resource.
+A_STATIC void FsrEasuConOffset(
+    outAU4 con0,
+    outAU4 con1,
+    outAU4 con2,
+    outAU4 con3,
+    // This is the rendered image resolution being upscaled
+    AF1 inputViewportInPixelsX,
+    AF1 inputViewportInPixelsY,
+    // This is the resolution of the resource containing the input image (useful for dynamic resolution)
+    AF1 inputSizeInPixelsX,
+    AF1 inputSizeInPixelsY,
+    // This is the display resolution which the input image gets upscaled to
+    AF1 outputSizeInPixelsX,
+    AF1 outputSizeInPixelsY,
+    // This is the input image offset into the resource containing it (useful for dynamic resolution)
+    AF1 inputOffsetInPixelsX,
+    AF1 inputOffsetInPixelsY) {
+    FsrEasuCon(con0, con1, con2, con3, inputViewportInPixelsX, inputViewportInPixelsY, inputSizeInPixelsX, inputSizeInPixelsY, outputSizeInPixelsX, outputSizeInPixelsY); // Fill all constants as in the no-offset case.
+    con0[2] = AU1_AF1(AF1_(0.5) * inputViewportInPixelsX * ARcpF1(outputSizeInPixelsX) - AF1_(0.5) + inputOffsetInPixelsX); // Overwrite the bias: same expression as FsrEasuCon() plus the X offset.
+    con0[3] = AU1_AF1(AF1_(0.5) * inputViewportInPixelsY * ARcpF1(outputSizeInPixelsY) - AF1_(0.5) + inputOffsetInPixelsY); // Same for Y.
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                   NON-PACKED 32-BIT VERSION
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(FSR_EASU_F)
+ // Input callback prototypes, need to be implemented by calling shader
+ AF4 FsrEasuRF(AF2 p);
+ AF4 FsrEasuGF(AF2 p);
+ AF4 FsrEasuBF(AF2 p);
+//------------------------------------------------------------------------------------------------------------------------------
+ // Filtering for a given tap for the scaler: adds the tap's weighted color to 'aC' and its weight to 'aW'.
+ void FsrEasuTapF(
+ inout AF3 aC, // Accumulated color, with negative lobe.
+ inout AF1 aW, // Accumulated weight.
+ AF2 off, // Pixel offset from resolve position to tap.
+ AF2 dir, // Gradient direction.
+ AF2 len, // Length.
+ AF1 lob, // Negative lobe strength.
+ AF1 clp, // Clipping point.
+ AF3 c){ // Tap color.
+  // Rotate offset by direction.
+  AF2 v;
+  v.x=(off.x*( dir.x))+(off.y*dir.y);
+  v.y=(off.x*(-dir.y))+(off.y*dir.x);
+  // Anisotropy.
+  v*=len;
+  // Compute distance^2.
+  AF1 d2=v.x*v.x+v.y*v.y;
+  // Limit to the window as at corner, 2 taps can easily be outside.
+  d2=min(d2,clp);
+  // Approximation of lanczos2 without sin() or rcp(), or sqrt() to get x.
+  //  (25/16 * (2/5 * x^2 - 1)^2 - (25/16 - 1)) * (1/4 * x^2 - 1)^2
+  //  |_______________________________________|   |_______________|
+  //                   base                             window
+  // The general form of the 'base' is,
+  //  (a*(b*x^2-1)^2-(a-1))
+  // Where 'a=1/(2*b-b^2)' and 'b' moves around the negative lobe.
+  AF1 wB=AF1_(2.0/5.0)*d2+AF1_(-1.0); // Inner term of 'base' (d2 is x^2).
+  AF1 wA=lob*d2+AF1_(-1.0); // Inner term of 'window', with 'lob' in place of the fixed 1/4.
+  wB*=wB;
+  wA*=wA;
+  wB=AF1_(25.0/16.0)*wB+AF1_(-(25.0/16.0-1.0));
+  AF1 w=wB*wA; // Final tap weight = base * window.
+  // Do weighted average.
+  aC+=c*w;aW+=w;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Accumulate direction and length.
+ void FsrEasuSetF(
+ inout AF2 dir, // Accumulated (weighted) direction.
+ inout AF1 len, // Accumulated (weighted) length.
+ AF2 pp, // Position used for the bilinear weights (FsrEasuF() passes the fractional part of the sample position).
+ AP1 biS,AP1 biT,AP1 biU,AP1 biV, // Selects which of the four bilinear weights (s,t,u,v) this call uses.
+ AF1 lA,AF1 lB,AF1 lC,AF1 lD,AF1 lE){ // Values of the '+' pattern a..e shown below (FsrEasuF() passes lumas).
+  // Compute bilinear weight, branches factor out as predicates are compile-time immediates.
+  //  s t
+  //  u v
+  AF1 w = AF1_(0.0);
+  if(biS)w=(AF1_(1.0)-pp.x)*(AF1_(1.0)-pp.y);
+  if(biT)w=           pp.x *(AF1_(1.0)-pp.y);
+  if(biU)w=(AF1_(1.0)-pp.x)*           pp.y ;
+  if(biV)w=           pp.x *           pp.y ;
+  // Direction is the '+' diff.
+  //    a
+  //  b c d
+  //    e
+  // Then takes magnitude from abs average of both sides of 'c'.
+  // Length converts gradient reversal to 0, smoothly to non-reversal at 1, shaped, then adding horz and vert terms.
+  AF1 dc=lD-lC;
+  AF1 cb=lC-lB;
+  AF1 lenX=max(abs(dc),abs(cb));
+  lenX=APrxLoRcpF1(lenX); // APrxLoRcpF1() is defined outside this chunk (presumably a low-precision reciprocal).
+  AF1 dirX=lD-lB;
+  dir.x+=dirX*w;
+  lenX=ASatF1(abs(dirX)*lenX);
+  lenX*=lenX;
+  len+=lenX*w;
+  // Repeat for the y axis.
+  AF1 ec=lE-lC;
+  AF1 ca=lC-lA;
+  AF1 lenY=max(abs(ec),abs(ca));
+  lenY=APrxLoRcpF1(lenY);
+  AF1 dirY=lE-lA;
+  dir.y+=dirY*w;
+  lenY=ASatF1(abs(dirY)*lenY);
+  lenY*=lenY;
+  len+=lenY*w;}
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrEasuF(
+ out AF3 pix, // Resulting color, clamped to the min/max of the 4 nearest taps (f,g,j,k).
+ AU2 ip, // Integer pixel position in output.
+ AU4 con0, // Constants generated by FsrEasuCon().
+ AU4 con1,
+ AU4 con2,
+ AU4 con3){
+//------------------------------------------------------------------------------------------------------------------------------
+  // Get position of 'f'.
+  AF2 pp=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw);
+  AF2 fp=floor(pp); // Integer part; 'pp' keeps only the fractional part from here on.
+  pp-=fp;
+//------------------------------------------------------------------------------------------------------------------------------
+  // 12-tap kernel.
+  //    b c
+  //  e f g h
+  //  i j k l
+  //    n o
+  // Gather 4 ordering.
+  //  a b
+  //  r g
+  // For packed FP16, need either {rg} or {ab} so using the following setup for gather in all versions,
+  //    a b    <- unused (z)
+  //    r g
+  //  a b a b
+  //  r g r g
+  //    a b
+  //    r g    <- unused (z)
+  // Allowing dead-code removal to remove the 'z's.
+  AF2 p0=fp*AF2_AU2(con1.xy)+AF2_AU2(con1.zw);
+  // These are from p0 to avoid pulling two constants on pre-Navi hardware.
+  AF2 p1=p0+AF2_AU2(con2.xy);
+  AF2 p2=p0+AF2_AU2(con2.zw);
+  AF2 p3=p0+AF2_AU2(con3.xy);
+  AF4 bczzR=FsrEasuRF(p0); // Variable names list the taps held in .x,.y,.z,.w ('z' = unused).
+  AF4 bczzG=FsrEasuGF(p0);
+  AF4 bczzB=FsrEasuBF(p0);
+  AF4 ijfeR=FsrEasuRF(p1);
+  AF4 ijfeG=FsrEasuGF(p1);
+  AF4 ijfeB=FsrEasuBF(p1);
+  AF4 klhgR=FsrEasuRF(p2);
+  AF4 klhgG=FsrEasuGF(p2);
+  AF4 klhgB=FsrEasuBF(p2);
+  AF4 zzonR=FsrEasuRF(p3);
+  AF4 zzonG=FsrEasuGF(p3);
+  AF4 zzonB=FsrEasuBF(p3);
+//------------------------------------------------------------------------------------------------------------------------------
+  // Simplest multi-channel approximate luma possible (luma times 2, in 2 FMA/MAD).
+  AF4 bczzL=bczzB*AF4_(0.5)+(bczzR*AF4_(0.5)+bczzG);
+  AF4 ijfeL=ijfeB*AF4_(0.5)+(ijfeR*AF4_(0.5)+ijfeG);
+  AF4 klhgL=klhgB*AF4_(0.5)+(klhgR*AF4_(0.5)+klhgG);
+  AF4 zzonL=zzonB*AF4_(0.5)+(zzonR*AF4_(0.5)+zzonG);
+  // Rename.
+  AF1 bL=bczzL.x;
+  AF1 cL=bczzL.y;
+  AF1 iL=ijfeL.x;
+  AF1 jL=ijfeL.y;
+  AF1 fL=ijfeL.z;
+  AF1 eL=ijfeL.w;
+  AF1 kL=klhgL.x;
+  AF1 lL=klhgL.y;
+  AF1 hL=klhgL.z;
+  AF1 gL=klhgL.w;
+  AF1 oL=zzonL.z;
+  AF1 nL=zzonL.w;
+  // Accumulate for bilinear interpolation (one call per 2x2 corner, centered on f, g, j and k respectively).
+  AF2 dir=AF2_(0.0);
+  AF1 len=AF1_(0.0);
+  FsrEasuSetF(dir,len,pp,true, false,false,false,bL,eL,fL,gL,jL);
+  FsrEasuSetF(dir,len,pp,false,true ,false,false,cL,fL,gL,hL,kL);
+  FsrEasuSetF(dir,len,pp,false,false,true ,false,fL,iL,jL,kL,nL);
+  FsrEasuSetF(dir,len,pp,false,false,false,true ,gL,jL,kL,lL,oL);
+//------------------------------------------------------------------------------------------------------------------------------
+  // Normalize with approximation, and cleanup close to zero.
+  AF2 dir2=dir*dir;
+  AF1 dirR=dir2.x+dir2.y;
+  AP1 zro=dirR<AF1_(1.0/32768.0); // True when the squared direction length is below 1/32768.
+  dirR=APrxLoRsqF1(dirR);
+  dirR=zro?AF1_(1.0):dirR; // When 'zro': use a scale of 1.0 and force dir.x to 1.0.
+  dir.x=zro?AF1_(1.0):dir.x;
+  dir*=AF2_(dirR);
+  // Transform from {0 to 2} to {0 to 1} range, and shape with square.
+  len=len*AF1_(0.5);
+  len*=len;
+  // Stretch kernel {1.0 vert|horz, to sqrt(2.0) on diagonal}.
+  AF1 stretch=(dir.x*dir.x+dir.y*dir.y)*APrxLoRcpF1(max(abs(dir.x),abs(dir.y)));
+  // Anisotropic length after rotation,
+  //  x := 1.0 lerp to 'stretch' on edges
+  //  y := 1.0 lerp to 2x on edges
+  AF2 len2=AF2(AF1_(1.0)+(stretch-AF1_(1.0))*len,AF1_(1.0)+AF1_(-0.5)*len);
+  // Based on the amount of 'edge',
+  // the window shifts from +/-{sqrt(2.0) to slightly beyond 2.0}.
+  AF1 lob=AF1_(0.5)+AF1_((1.0/4.0-0.04)-0.5)*len;
+  // Set distance^2 clipping point to the end of the adjustable window.
+  AF1 clp=APrxLoRcpF1(lob);
+//------------------------------------------------------------------------------------------------------------------------------
+  // Accumulation mixed with min/max of 4 nearest.
+  //    b c
+  //  e f g h
+  //  i j k l
+  //    n o
+  AF3 min4=min(AMin3F3(AF3(ijfeR.z,ijfeG.z,ijfeB.z),AF3(klhgR.w,klhgG.w,klhgB.w),AF3(ijfeR.y,ijfeG.y,ijfeB.y)),
+               AF3(klhgR.x,klhgG.x,klhgB.x));
+  AF3 max4=max(AMax3F3(AF3(ijfeR.z,ijfeG.z,ijfeB.z),AF3(klhgR.w,klhgG.w,klhgB.w),AF3(ijfeR.y,ijfeG.y,ijfeB.y)),
+               AF3(klhgR.x,klhgG.x,klhgB.x));
+  // Accumulation.
+  AF3 aC=AF3_(0.0);
+  AF1 aW=AF1_(0.0);
+  FsrEasuTapF(aC,aW,AF2( 0.0,-1.0)-pp,dir,len2,lob,clp,AF3(bczzR.x,bczzG.x,bczzB.x)); // b
+  FsrEasuTapF(aC,aW,AF2( 1.0,-1.0)-pp,dir,len2,lob,clp,AF3(bczzR.y,bczzG.y,bczzB.y)); // c
+  FsrEasuTapF(aC,aW,AF2(-1.0, 1.0)-pp,dir,len2,lob,clp,AF3(ijfeR.x,ijfeG.x,ijfeB.x)); // i
+  FsrEasuTapF(aC,aW,AF2( 0.0, 1.0)-pp,dir,len2,lob,clp,AF3(ijfeR.y,ijfeG.y,ijfeB.y)); // j
+  FsrEasuTapF(aC,aW,AF2( 0.0, 0.0)-pp,dir,len2,lob,clp,AF3(ijfeR.z,ijfeG.z,ijfeB.z)); // f
+  FsrEasuTapF(aC,aW,AF2(-1.0, 0.0)-pp,dir,len2,lob,clp,AF3(ijfeR.w,ijfeG.w,ijfeB.w)); // e
+  FsrEasuTapF(aC,aW,AF2( 1.0, 1.0)-pp,dir,len2,lob,clp,AF3(klhgR.x,klhgG.x,klhgB.x)); // k
+  FsrEasuTapF(aC,aW,AF2( 2.0, 1.0)-pp,dir,len2,lob,clp,AF3(klhgR.y,klhgG.y,klhgB.y)); // l
+  FsrEasuTapF(aC,aW,AF2( 2.0, 0.0)-pp,dir,len2,lob,clp,AF3(klhgR.z,klhgG.z,klhgB.z)); // h
+  FsrEasuTapF(aC,aW,AF2( 1.0, 0.0)-pp,dir,len2,lob,clp,AF3(klhgR.w,klhgG.w,klhgB.w)); // g
+  FsrEasuTapF(aC,aW,AF2( 1.0, 2.0)-pp,dir,len2,lob,clp,AF3(zzonR.z,zzonG.z,zzonB.z)); // o
+  FsrEasuTapF(aC,aW,AF2( 0.0, 2.0)-pp,dir,len2,lob,clp,AF3(zzonR.w,zzonG.w,zzonB.w)); // n
+//------------------------------------------------------------------------------------------------------------------------------
+  // Normalize and dering.
+  pix=min(max4,max(min4,aC*AF3_(ARcpF1(aW))));}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                    PACKED 16-BIT VERSION
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_EASU_H)
+// Input callback prototypes, need to be implemented by calling shader
+ AH4 FsrEasuRH(AF2 p);
+ AH4 FsrEasuGH(AF2 p);
+ AH4 FsrEasuBH(AF2 p);
+//------------------------------------------------------------------------------------------------------------------------------
+ // This runs 2 taps in parallel (packed 16-bit counterpart of FsrEasuTapF(); each AH2 lane is one tap).
+ void FsrEasuTapH(
+ inout AH2 aCR,inout AH2 aCG,inout AH2 aCB, // Accumulated color, one AH2 per channel.
+ inout AH2 aW, // Accumulated weight.
+ AH2 offX,AH2 offY, // X and Y offsets of the two taps.
+ AH2 dir, // Gradient direction (shared by both taps).
+ AH2 len, // Length (shared by both taps).
+ AH1 lob, // Negative lobe strength.
+ AH1 clp, // Clipping point.
+ AH2 cR,AH2 cG,AH2 cB){ // Tap colors, one AH2 per channel.
+  AH2 vX,vY;
+  vX=offX*  dir.xx +offY*dir.yy; // Rotate offsets by direction.
+  vY=offX*(-dir.yy)+offY*dir.xx;
+  vX*=len.x;vY*=len.y; // Anisotropy.
+  AH2 d2=vX*vX+vY*vY; // Distance^2.
+  d2=min(d2,AH2_(clp));
+  AH2 wB=AH2_(2.0/5.0)*d2+AH2_(-1.0); // Same weight computation as in FsrEasuTapF().
+  AH2 wA=AH2_(lob)*d2+AH2_(-1.0);
+  wB*=wB;
+  wA*=wA;
+  wB=AH2_(25.0/16.0)*wB+AH2_(-(25.0/16.0-1.0));
+  AH2 w=wB*wA;
+  aCR+=cR*w;aCG+=cG*w;aCB+=cB*w;aW+=w;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // This runs 2 taps in parallel (packed 16-bit counterpart of FsrEasuSetF(); each call handles one row of bilinear weights).
+ void FsrEasuSetH(
+ inout AH2 dirPX,inout AH2 dirPY, // Accumulated direction, X and Y kept as separate packed pairs.
+ inout AH2 lenP, // Accumulated length, per lane.
+ AH2 pp, // Position used for the bilinear weights.
+ AP1 biST,AP1 biUV, // Selects the weight row: (s,t) or (u,v).
+ AH2 lA,AH2 lB,AH2 lC,AH2 lD,AH2 lE){ // '+' pattern values a..e for the two lanes.
+  AH2 w = AH2_(0.0);
+  if(biST)w=(AH2(1.0,0.0)+AH2(-pp.x,pp.x))*AH2_(AH1_(1.0)-pp.y); // (1-x,x)*(1-y)
+  if(biUV)w=(AH2(1.0,0.0)+AH2(-pp.x,pp.x))*AH2_(          pp.y); // (1-x,x)*y
+  // ABS is not free in the packed FP16 path.
+  AH2 dc=lD-lC;
+  AH2 cb=lC-lB;
+  AH2 lenX=max(abs(dc),abs(cb));
+  lenX=ARcpH2(lenX); // Note: uses ARcpH2() where FsrEasuSetF() uses APrxLoRcpF1().
+  AH2 dirX=lD-lB;
+  dirPX+=dirX*w;
+  lenX=ASatH2(abs(dirX)*lenX);
+  lenX*=lenX;
+  lenP+=lenX*w;
+  AH2 ec=lE-lC; // Repeat for the y axis.
+  AH2 ca=lC-lA;
+  AH2 lenY=max(abs(ec),abs(ca));
+  lenY=ARcpH2(lenY);
+  AH2 dirY=lE-lA;
+  dirPY+=dirY*w;
+  lenY=ASatH2(abs(dirY)*lenY);
+  lenY*=lenY;
+  lenP+=lenY*w;}
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrEasuH(
+ out AH3 pix, // Resulting color, clamped to the min/max of taps f,g,j,k.
+ AU2 ip, // Integer pixel position in output.
+ AU4 con0, // Constants generated by FsrEasuCon().
+ AU4 con1,
+ AU4 con2,
+ AU4 con3){
+//------------------------------------------------------------------------------------------------------------------------------
+  AF2 pp=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw); // Position math is done with AF2 (not AH2) values.
+  AF2 fp=floor(pp);
+  pp-=fp;
+  AH2 ppp=AH2(pp); // Fractional position converted to AH2 for the packed math below.
+//------------------------------------------------------------------------------------------------------------------------------
+  AF2 p0=fp*AF2_AU2(con1.xy)+AF2_AU2(con1.zw);
+  AF2 p1=p0+AF2_AU2(con2.xy);
+  AF2 p2=p0+AF2_AU2(con2.zw);
+  AF2 p3=p0+AF2_AU2(con3.xy);
+  AH4 bczzR=FsrEasuRH(p0); // Variable names list the taps held in .x,.y,.z,.w ('z' = unused).
+  AH4 bczzG=FsrEasuGH(p0);
+  AH4 bczzB=FsrEasuBH(p0);
+  AH4 ijfeR=FsrEasuRH(p1);
+  AH4 ijfeG=FsrEasuGH(p1);
+  AH4 ijfeB=FsrEasuBH(p1);
+  AH4 klhgR=FsrEasuRH(p2);
+  AH4 klhgG=FsrEasuGH(p2);
+  AH4 klhgB=FsrEasuBH(p2);
+  AH4 zzonR=FsrEasuRH(p3);
+  AH4 zzonG=FsrEasuGH(p3);
+  AH4 zzonB=FsrEasuBH(p3);
+//------------------------------------------------------------------------------------------------------------------------------
+  AH4 bczzL=bczzB*AH4_(0.5)+(bczzR*AH4_(0.5)+bczzG); // Approximate luma times 2, same form as in FsrEasuF().
+  AH4 ijfeL=ijfeB*AH4_(0.5)+(ijfeR*AH4_(0.5)+ijfeG);
+  AH4 klhgL=klhgB*AH4_(0.5)+(klhgR*AH4_(0.5)+klhgG);
+  AH4 zzonL=zzonB*AH4_(0.5)+(zzonR*AH4_(0.5)+zzonG);
+  AH1 bL=bczzL.x;
+  AH1 cL=bczzL.y;
+  AH1 iL=ijfeL.x;
+  AH1 jL=ijfeL.y;
+  AH1 fL=ijfeL.z;
+  AH1 eL=ijfeL.w;
+  AH1 kL=klhgL.x;
+  AH1 lL=klhgL.y;
+  AH1 hL=klhgL.z;
+  AH1 gL=klhgL.w;
+  AH1 oL=zzonL.z;
+  AH1 nL=zzonL.w;
+  // This part is different, accumulating 2 taps in parallel.
+  AH2 dirPX=AH2_(0.0);
+  AH2 dirPY=AH2_(0.0);
+  AH2 lenP=AH2_(0.0);
+  FsrEasuSetH(dirPX,dirPY,lenP,ppp,true, false,AH2(bL,cL),AH2(eL,fL),AH2(fL,gL),AH2(gL,hL),AH2(jL,kL)); // Centers f,g.
+  FsrEasuSetH(dirPX,dirPY,lenP,ppp,false,true ,AH2(fL,gL),AH2(iL,jL),AH2(jL,kL),AH2(kL,lL),AH2(nL,oL)); // Centers j,k.
+  AH2 dir=AH2(dirPX.r+dirPX.g,dirPY.r+dirPY.g); // Sum the two packed lanes.
+  AH1 len=lenP.r+lenP.g;
+//------------------------------------------------------------------------------------------------------------------------------
+  AH2 dir2=dir*dir;
+  AH1 dirR=dir2.x+dir2.y;
+  AP1 zro=dirR<AH1_(1.0/32768.0);
+  dirR=APrxLoRsqH1(dirR);
+  dirR=zro?AH1_(1.0):dirR;
+  dir.x=zro?AH1_(1.0):dir.x;
+  dir*=AH2_(dirR);
+  len=len*AH1_(0.5);
+  len*=len;
+  AH1 stretch=(dir.x*dir.x+dir.y*dir.y)*APrxLoRcpH1(max(abs(dir.x),abs(dir.y)));
+  AH2 len2=AH2(AH1_(1.0)+(stretch-AH1_(1.0))*len,AH1_(1.0)+AH1_(-0.5)*len);
+  AH1 lob=AH1_(0.5)+AH1_((1.0/4.0-0.04)-0.5)*len;
+  AH1 clp=APrxLoRcpH1(lob);
+//------------------------------------------------------------------------------------------------------------------------------
+  // FP16 is different, using packed trick to do min and max in same operation: .x = max of negated taps (-min), .y = max.
+  AH2 bothR=max(max(AH2(-ijfeR.z,ijfeR.z),AH2(-klhgR.w,klhgR.w)),max(AH2(-ijfeR.y,ijfeR.y),AH2(-klhgR.x,klhgR.x)));
+  AH2 bothG=max(max(AH2(-ijfeG.z,ijfeG.z),AH2(-klhgG.w,klhgG.w)),max(AH2(-ijfeG.y,ijfeG.y),AH2(-klhgG.x,klhgG.x)));
+  AH2 bothB=max(max(AH2(-ijfeB.z,ijfeB.z),AH2(-klhgB.w,klhgB.w)),max(AH2(-ijfeB.y,ijfeB.y),AH2(-klhgB.x,klhgB.x)));
+  // This part is different for FP16, working pairs of taps at a time.
+  AH2 pR=AH2_(0.0);
+  AH2 pG=AH2_(0.0);
+  AH2 pB=AH2_(0.0);
+  AH2 pW=AH2_(0.0);
+  FsrEasuTapH(pR,pG,pB,pW,AH2( 0.0, 1.0)-ppp.xx,AH2(-1.0,-1.0)-ppp.yy,dir,len2,lob,clp,bczzR.xy,bczzG.xy,bczzB.xy); // b,c
+  FsrEasuTapH(pR,pG,pB,pW,AH2(-1.0, 0.0)-ppp.xx,AH2( 1.0, 1.0)-ppp.yy,dir,len2,lob,clp,ijfeR.xy,ijfeG.xy,ijfeB.xy); // i,j
+  FsrEasuTapH(pR,pG,pB,pW,AH2( 0.0,-1.0)-ppp.xx,AH2( 0.0, 0.0)-ppp.yy,dir,len2,lob,clp,ijfeR.zw,ijfeG.zw,ijfeB.zw); // f,e
+  FsrEasuTapH(pR,pG,pB,pW,AH2( 1.0, 2.0)-ppp.xx,AH2( 1.0, 1.0)-ppp.yy,dir,len2,lob,clp,klhgR.xy,klhgG.xy,klhgB.xy); // k,l
+  FsrEasuTapH(pR,pG,pB,pW,AH2( 2.0, 1.0)-ppp.xx,AH2( 0.0, 0.0)-ppp.yy,dir,len2,lob,clp,klhgR.zw,klhgG.zw,klhgB.zw); // h,g
+  FsrEasuTapH(pR,pG,pB,pW,AH2( 1.0, 0.0)-ppp.xx,AH2( 2.0, 2.0)-ppp.yy,dir,len2,lob,clp,zzonR.zw,zzonG.zw,zzonB.zw); // o,n
+  AH3 aC=AH3(pR.x+pR.y,pG.x+pG.y,pB.x+pB.y); // Sum the two packed lanes.
+  AH1 aW=pW.x+pW.y;
+//------------------------------------------------------------------------------------------------------------------------------
+  // Slightly different for FP16 version due to combined min and max.
+  pix=min(AH3(bothR.y,bothG.y,bothB.y),max(-AH3(bothR.x,bothG.x,bothB.x),aC*AH3_(ARcpH1(aW))));}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                      FSR - [RCAS] ROBUST CONTRAST ADAPTIVE SHARPENING
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// CAS uses a simplified mechanism to convert local contrast into a variable amount of sharpness.
+// RCAS uses a more exact mechanism, solving for the maximum local sharpness possible before clipping.
+// RCAS also has a built in process to limit sharpening of what it detects as possible noise.
+// RCAS sharpening does not support scaling, as it should be applied after EASU scaling.
+// Pass EASU output straight into RCAS, no color conversions necessary.
+//------------------------------------------------------------------------------------------------------------------------------
+// RCAS is based on the following logic.
+// RCAS uses a 5 tap filter in a cross pattern (same as CAS),
+//    w                n
+//  w 1 w  for taps  w m e 
+//    w                s
+// Where 'w' is the negative lobe weight.
+//  output = (w*(n+e+w+s)+m)/(4*w+1)
+// RCAS solves for 'w' by seeing where the signal might clip out of the {0 to 1} input range,
+//  0 == (w*(n+e+w+s)+m)/(4*w+1) -> w = -m/(n+e+w+s)
+//  1 == (w*(n+e+w+s)+m)/(4*w+1) -> w = (1-m)/(n+e+w+s-4*1)
+// Then chooses the 'w' which results in no clipping, limits 'w', and multiplies by the 'sharp' amount.
+// This solution above has issues with MSAA input as the steps along the gradient cause edge detection issues.
+// So RCAS uses 4x the maximum and 4x the minimum (depending on equation) in place of the individual taps.
+// As well as switching from 'm' to either the minimum or maximum (depending on side), to help in energy conservation.
+// This stabilizes RCAS.
+// RCAS does a simple highpass which is normalized against the local contrast then shaped,
+//       0.25
+//  0.25  -1  0.25
+//       0.25
+// This is used as a noise detection filter, to reduce the effect of RCAS on grain, and focus on real edges.
+//
+//  GLSL example for the required callbacks :
+// 
+//  AH4 FsrRcasLoadH(ASW2 p){return AH4(imageLoad(imgSrc,ASU2(p)));}
+//  void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b)
+//  {
+//    //do any simple input color conversions here or leave empty if none needed
+//  }
+//  
+//  FsrRcasCon needs to be called from the CPU or GPU to set up constants.
+//  Including a GPU example here, the 'con' value would be stored out to a constant buffer.
+// 
+//  AU4 con;
+//  FsrRcasCon(con,
+//   0.0); // The scale is {0.0 := maximum sharpness, to N>0, where N is the number of stops (halving) of the reduction of sharpness}.
+// ---------------
+// RCAS sharpening supports a CAS-like pass-through alpha via,
+//  #define FSR_RCAS_PASSTHROUGH_ALPHA 1
+// RCAS also supports a define to enable a more expensive path to avoid some sharpening of noise.
+// Would suggest it is better to apply film grain after RCAS sharpening (and after scaling) instead of using this define,
+//  #define FSR_RCAS_DENOISE 1
+//==============================================================================================================================
+// This is set at the limit of providing unnatural results for sharpening.
+#define FSR_RCAS_LIMIT (0.25-(1.0/16.0))
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                      CONSTANT SETUP
+//==============================================================================================================================
+// Call to setup required constant values (works on CPU or GPU).
+A_STATIC void FsrRcasCon(
+outAU4 con, // Receives the constants: [0] via AU1_AF1 (read as con.x), [1] via AU1_AH2_AF2 (read as con.y), [2..3] zero.
+// Sharpness is expressed in stops: 0.0 is the maximum, and each step N>0 is one more halving of the sharpening.
+AF1 sharpness){
+ // Convert the stop count into the linear multiplier that the filters apply to the lobe.
+ AF1 linearSharp=AExp2F1(-sharpness);
+ varAF2(packedSharp)=initAF2(linearSharp,linearSharp);
+ con[2]=0;
+ con[3]=0;
+ con[0]=AU1_AF1(linearSharp);
+ con[1]=AU1_AH2_AF2(packedSharp);}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                   NON-PACKED 32-BIT VERSION
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(FSR_RCAS_F)
+ // Callbacks the including shader must implement: a load by integer pixel position and an in-place RGB input transform.
+ AF4 FsrRcasLoadF(ASU2 p);
+ void FsrRcasInputF(inout AF1 r,inout AF1 g,inout AF1 b);
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrRcasF( // Sharpens the pixel at 'ip' from a 5-tap cross (b,d,e,f,h) fetched through FsrRcasLoadF().
+ out AF1 pixR, // Output values, non-vector so port between FsrRcasF() and FsrRcasH() is easy.
+ out AF1 pixG,
+ out AF1 pixB,
+ #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
+  out AF1 pixA, // Only present with FSR_RCAS_PASSTHROUGH_ALPHA: receives the center tap's alpha.
+ #endif
+ AU2 ip, // Integer pixel position in output.
+ AU4 con){ // Constant generated by FsrRcasCon(); only con.x is read here.
+  // Algorithm uses minimal 3x3 pixel neighborhood (corners are not sampled).
+  //    b 
+  //  d e f
+  //    h
+  ASU2 sp=ASU2(ip);
+  AF3 b=FsrRcasLoadF(sp+ASU2( 0,-1)).rgb;
+  AF3 d=FsrRcasLoadF(sp+ASU2(-1, 0)).rgb;
+  #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
+   AF4 ee=FsrRcasLoadF(sp);
+   AF3 e=ee.rgb;pixA=ee.a; // Alpha is copied straight from the center tap, it is not sharpened.
+  #else
+   AF3 e=FsrRcasLoadF(sp).rgb;
+  #endif
+  AF3 f=FsrRcasLoadF(sp+ASU2( 1, 0)).rgb;
+  AF3 h=FsrRcasLoadF(sp+ASU2( 0, 1)).rgb;
+  // Rename (32-bit) or regroup (16-bit): split every tap into scalar channels.
+  AF1 bR=b.r;
+  AF1 bG=b.g;
+  AF1 bB=b.b;
+  AF1 dR=d.r;
+  AF1 dG=d.g;
+  AF1 dB=d.b;
+  AF1 eR=e.r;
+  AF1 eG=e.g;
+  AF1 eB=e.b;
+  AF1 fR=f.r;
+  AF1 fG=f.g;
+  AF1 fB=f.b;
+  AF1 hR=h.r;
+  AF1 hG=h.g;
+  AF1 hB=h.b;
+  // Run optional input transform on each of the five taps.
+  FsrRcasInputF(bR,bG,bB);
+  FsrRcasInputF(dR,dG,dB);
+  FsrRcasInputF(eR,eG,eB);
+  FsrRcasInputF(fR,fG,fB);
+  FsrRcasInputF(hR,hG,hB);
+  // Luma times 2 (0.5*B + 0.5*R + G); these values only feed the noise detector below.
+  AF1 bL=bB*AF1_(0.5)+(bR*AF1_(0.5)+bG);
+  AF1 dL=dB*AF1_(0.5)+(dR*AF1_(0.5)+dG);
+  AF1 eL=eB*AF1_(0.5)+(eR*AF1_(0.5)+eG);
+  AF1 fL=fB*AF1_(0.5)+(fR*AF1_(0.5)+fG);
+  AF1 hL=hB*AF1_(0.5)+(hR*AF1_(0.5)+hG);
+  // Noise detection: |ring average - center| over the local luma range, saturated, then remapped to {1.0 down to 0.5}.
+  AF1 nz=AF1_(0.25)*bL+AF1_(0.25)*dL+AF1_(0.25)*fL+AF1_(0.25)*hL-eL;
+  nz=ASatF1(abs(nz)*APrxMedRcpF1(AMax3F1(AMax3F1(bL,dL,eL),fL,hL)-AMin3F1(AMin3F1(bL,dL,eL),fL,hL)));
+  nz=AF1_(-0.5)*nz+AF1_(1.0);
+  // Min and max of ring (the four outer taps, center excluded).
+  AF1 mn4R=min(AMin3F1(bR,dR,fR),hR);
+  AF1 mn4G=min(AMin3F1(bG,dG,fG),hG);
+  AF1 mn4B=min(AMin3F1(bB,dB,fB),hB);
+  AF1 mx4R=max(AMax3F1(bR,dR,fR),hR);
+  AF1 mx4G=max(AMax3F1(bG,dG,fG),hG);
+  AF1 mx4B=max(AMax3F1(bB,dB,fB),hB);
+  // Immediate constants for peak range: x is the signal peak (1.0), y is minus four times the peak.
+  AF2 peakC=AF2(1.0,-1.0*4.0);
+  // Limiters, these need to be high precision RCPs.
+  // hitMin/hitMax are the per channel lobe magnitudes at which the output would clip at 0 and at the peak.
+  AF1 hitMinR=min(mn4R,eR)*ARcpF1(AF1_(4.0)*mx4R);
+  AF1 hitMinG=min(mn4G,eG)*ARcpF1(AF1_(4.0)*mx4G);
+  AF1 hitMinB=min(mn4B,eB)*ARcpF1(AF1_(4.0)*mx4B);
+  AF1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpF1(AF1_(4.0)*mn4R+peakC.y);
+  AF1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpF1(AF1_(4.0)*mn4G+peakC.y);
+  AF1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpF1(AF1_(4.0)*mn4B+peakC.y);
+  AF1 lobeR=max(-hitMinR,hitMaxR);
+  AF1 lobeG=max(-hitMinG,hitMaxG);
+  AF1 lobeB=max(-hitMinB,hitMaxB);
+  AF1 lobe=max(AF1_(-FSR_RCAS_LIMIT),min(AMax3F1(lobeR,lobeG,lobeB),AF1_(0.0)))*AF1_AU1(con.x); // Clamp to {-FSR_RCAS_LIMIT to 0}, scale by con.x.
+  // Apply noise removal (only with FSR_RCAS_DENOISE; otherwise 'nz' is computed but unused).
+  #ifdef FSR_RCAS_DENOISE
+   lobe*=nz;
+  #endif
+  // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
+  // Per channel this is (lobe*(b+d+h+f)+e)/(4*lobe+1).
+  AF1 rcpL=APrxMedRcpF1(AF1_(4.0)*lobe+AF1_(1.0));
+  pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL;
+  pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL;
+  pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;
+  return;} 
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                  NON-PACKED 16-BIT VERSION
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_H)
+ // Callbacks the including shader must implement: a load by integer pixel position and an in-place RGB input transform.
+ AH4 FsrRcasLoadH(ASW2 p);
+ void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b);
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrRcasH( // 16-bit counterpart of FsrRcasF(): sharpens the pixel at 'ip' from a 5-tap cross (b,d,e,f,h).
+ out AH1 pixR, // Output values, non-vector so port between FsrRcasF() and FsrRcasH() is easy.
+ out AH1 pixG,
+ out AH1 pixB,
+ #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
+  out AH1 pixA, // Only present with FSR_RCAS_PASSTHROUGH_ALPHA: receives the center tap's alpha.
+ #endif
+ AU2 ip, // Integer pixel position in output.
+ AU4 con){ // Constant generated by FsrRcasCon(); only con.y is read here.
+  // Sharpening algorithm uses minimal 3x3 pixel neighborhood (corners are not sampled).
+  //    b 
+  //  d e f
+  //    h
+  ASW2 sp=ASW2(ip);
+  AH3 b=FsrRcasLoadH(sp+ASW2( 0,-1)).rgb;
+  AH3 d=FsrRcasLoadH(sp+ASW2(-1, 0)).rgb;
+  #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
+   AH4 ee=FsrRcasLoadH(sp);
+   AH3 e=ee.rgb;pixA=ee.a; // Alpha is copied straight from the center tap, it is not sharpened.
+  #else
+   AH3 e=FsrRcasLoadH(sp).rgb;
+  #endif
+  AH3 f=FsrRcasLoadH(sp+ASW2( 1, 0)).rgb;
+  AH3 h=FsrRcasLoadH(sp+ASW2( 0, 1)).rgb;
+  // Rename (32-bit) or regroup (16-bit): split every tap into scalar channels.
+  AH1 bR=b.r;
+  AH1 bG=b.g;
+  AH1 bB=b.b;
+  AH1 dR=d.r;
+  AH1 dG=d.g;
+  AH1 dB=d.b;
+  AH1 eR=e.r;
+  AH1 eG=e.g;
+  AH1 eB=e.b;
+  AH1 fR=f.r;
+  AH1 fG=f.g;
+  AH1 fB=f.b;
+  AH1 hR=h.r;
+  AH1 hG=h.g;
+  AH1 hB=h.b;
+  // Run optional input transform on each of the five taps.
+  FsrRcasInputH(bR,bG,bB);
+  FsrRcasInputH(dR,dG,dB);
+  FsrRcasInputH(eR,eG,eB);
+  FsrRcasInputH(fR,fG,fB);
+  FsrRcasInputH(hR,hG,hB);
+  // Luma times 2 (0.5*B + 0.5*R + G); these values only feed the noise detector below.
+  AH1 bL=bB*AH1_(0.5)+(bR*AH1_(0.5)+bG);
+  AH1 dL=dB*AH1_(0.5)+(dR*AH1_(0.5)+dG);
+  AH1 eL=eB*AH1_(0.5)+(eR*AH1_(0.5)+eG);
+  AH1 fL=fB*AH1_(0.5)+(fR*AH1_(0.5)+fG);
+  AH1 hL=hB*AH1_(0.5)+(hR*AH1_(0.5)+hG);
+  // Noise detection: |ring average - center| over the local luma range, saturated, then remapped to {1.0 down to 0.5}.
+  AH1 nz=AH1_(0.25)*bL+AH1_(0.25)*dL+AH1_(0.25)*fL+AH1_(0.25)*hL-eL;
+  nz=ASatH1(abs(nz)*APrxMedRcpH1(AMax3H1(AMax3H1(bL,dL,eL),fL,hL)-AMin3H1(AMin3H1(bL,dL,eL),fL,hL)));
+  nz=AH1_(-0.5)*nz+AH1_(1.0);
+  // Min and max of ring (the four outer taps, center excluded).
+  AH1 mn4R=min(AMin3H1(bR,dR,fR),hR);
+  AH1 mn4G=min(AMin3H1(bG,dG,fG),hG);
+  AH1 mn4B=min(AMin3H1(bB,dB,fB),hB);
+  AH1 mx4R=max(AMax3H1(bR,dR,fR),hR);
+  AH1 mx4G=max(AMax3H1(bG,dG,fG),hG);
+  AH1 mx4B=max(AMax3H1(bB,dB,fB),hB);
+  // Immediate constants for peak range: x is the signal peak (1.0), y is minus four times the peak.
+  AH2 peakC=AH2(1.0,-1.0*4.0);
+  // Limiters, these need to be high precision RCPs.
+  // hitMin/hitMax are the per channel lobe magnitudes at which the output would clip at 0 and at the peak.
+  AH1 hitMinR=min(mn4R,eR)*ARcpH1(AH1_(4.0)*mx4R);
+  AH1 hitMinG=min(mn4G,eG)*ARcpH1(AH1_(4.0)*mx4G);
+  AH1 hitMinB=min(mn4B,eB)*ARcpH1(AH1_(4.0)*mx4B);
+  AH1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH1(AH1_(4.0)*mn4R+peakC.y);
+  AH1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH1(AH1_(4.0)*mn4G+peakC.y);
+  AH1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH1(AH1_(4.0)*mn4B+peakC.y);
+  AH1 lobeR=max(-hitMinR,hitMaxR);
+  AH1 lobeG=max(-hitMinG,hitMaxG);
+  AH1 lobeB=max(-hitMinB,hitMaxB);
+  AH1 lobe=max(AH1_(-FSR_RCAS_LIMIT),min(AMax3H1(lobeR,lobeG,lobeB),AH1_(0.0)))*AH2_AU1(con.y).x; // Clamp to {-FSR_RCAS_LIMIT to 0}, scale by con.y's first half.
+  // Apply noise removal (only with FSR_RCAS_DENOISE; otherwise 'nz' is computed but unused).
+  #ifdef FSR_RCAS_DENOISE
+   lobe*=nz;
+  #endif
+  // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
+  // Per channel this is (lobe*(b+d+h+f)+e)/(4*lobe+1).
+  AH1 rcpL=APrxMedRcpH1(AH1_(4.0)*lobe+AH1_(1.0));
+  pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL;
+  pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL;
+  pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     PACKED 16-BIT VERSION
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_HX2)
+ // Input callback prototypes that need to be implemented by the calling shader
+ AH4 FsrRcasLoadHx2(ASW2 p);
+ void FsrRcasInputHx2(inout AH2 r,inout AH2 g,inout AH2 b);
+//------------------------------------------------------------------------------------------------------------------------------
+ // Can be used to convert from packed Structures of Arrays to Arrays of Structures for store.
+ void FsrRcasDepackHx2(out AH4 pix0,out AH4 pix1,AH2 pixR,AH2 pixG,AH2 pixB){
+  // The .x lanes of the packed channels become pix0 and the .y lanes become pix1.
+  #ifdef A_HLSL
+   pix1.a=0.0;pix0.a=0.0; // DX-only slower path: it won't allow uninitialized output values, so alpha is zeroed.
+  #endif
+  pix1.rgb=AH3(pixR.y,pixG.y,pixB.y);
+  pix0.rgb=AH3(pixR.x,pixG.x,pixB.x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrRcasHx2( // Packed 16-bit RCAS: sharpens the pixel at 'ip' and the pixel at 'ip+{8,0}' in one call.
+ // Output values are for 2 8x8 tiles in a 16x8 region.
+ //  pix<R,G,B>.x =  left 8x8 tile
+ //  pix<R,G,B>.y = right 8x8 tile
+ // This enables later processing to easily be packed as well.
+ out AH2 pixR,
+ out AH2 pixG,
+ out AH2 pixB,
+ #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
+  out AH2 pixA, // Only present with FSR_RCAS_PASSTHROUGH_ALPHA: receives the two center taps' alpha values.
+ #endif
+ AU2 ip, // Integer pixel position in output.
+ AU4 con){ // Constant generated by FsrRcasCon(); only con.y is read here.
+  // Sharpening (no scaling) uses a minimal 3x3 pixel neighborhood; taps suffixed 0 are for 'ip', 1 for 'ip+{8,0}'.
+  ASW2 sp0=ASW2(ip);
+  AH3 b0=FsrRcasLoadHx2(sp0+ASW2( 0,-1)).rgb;
+  AH3 d0=FsrRcasLoadHx2(sp0+ASW2(-1, 0)).rgb;
+  #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
+   AH4 ee0=FsrRcasLoadHx2(sp0);
+   AH3 e0=ee0.rgb;pixA.r=ee0.a; // Alpha of the first pixel is passed through unsharpened.
+  #else
+   AH3 e0=FsrRcasLoadHx2(sp0).rgb;
+  #endif
+  AH3 f0=FsrRcasLoadHx2(sp0+ASW2( 1, 0)).rgb;
+  AH3 h0=FsrRcasLoadHx2(sp0+ASW2( 0, 1)).rgb;
+  ASW2 sp1=sp0+ASW2(8,0);
+  AH3 b1=FsrRcasLoadHx2(sp1+ASW2( 0,-1)).rgb;
+  AH3 d1=FsrRcasLoadHx2(sp1+ASW2(-1, 0)).rgb;
+  #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
+   AH4 ee1=FsrRcasLoadHx2(sp1);
+   AH3 e1=ee1.rgb;pixA.g=ee1.a; // Alpha of the second pixel is passed through unsharpened.
+  #else
+   AH3 e1=FsrRcasLoadHx2(sp1).rgb;
+  #endif
+  AH3 f1=FsrRcasLoadHx2(sp1+ASW2( 1, 0)).rgb;
+  AH3 h1=FsrRcasLoadHx2(sp1+ASW2( 0, 1)).rgb;
+  // Arrays of Structures to Structures of Arrays conversion (lane x = first pixel, lane y = second pixel).
+  AH2 bR=AH2(b0.r,b1.r);
+  AH2 bG=AH2(b0.g,b1.g);
+  AH2 bB=AH2(b0.b,b1.b);
+  AH2 dR=AH2(d0.r,d1.r);
+  AH2 dG=AH2(d0.g,d1.g);
+  AH2 dB=AH2(d0.b,d1.b);
+  AH2 eR=AH2(e0.r,e1.r);
+  AH2 eG=AH2(e0.g,e1.g);
+  AH2 eB=AH2(e0.b,e1.b);
+  AH2 fR=AH2(f0.r,f1.r);
+  AH2 fG=AH2(f0.g,f1.g);
+  AH2 fB=AH2(f0.b,f1.b);
+  AH2 hR=AH2(h0.r,h1.r);
+  AH2 hG=AH2(h0.g,h1.g);
+  AH2 hB=AH2(h0.b,h1.b);
+  // Run optional input transform on each of the five packed taps.
+  FsrRcasInputHx2(bR,bG,bB);
+  FsrRcasInputHx2(dR,dG,dB);
+  FsrRcasInputHx2(eR,eG,eB);
+  FsrRcasInputHx2(fR,fG,fB);
+  FsrRcasInputHx2(hR,hG,hB);
+  // Luma times 2 (0.5*B + 0.5*R + G); these values only feed the noise detector below.
+  AH2 bL=bB*AH2_(0.5)+(bR*AH2_(0.5)+bG);
+  AH2 dL=dB*AH2_(0.5)+(dR*AH2_(0.5)+dG);
+  AH2 eL=eB*AH2_(0.5)+(eR*AH2_(0.5)+eG);
+  AH2 fL=fB*AH2_(0.5)+(fR*AH2_(0.5)+fG);
+  AH2 hL=hB*AH2_(0.5)+(hR*AH2_(0.5)+hG);
+  // Noise detection: |ring average - center| over the local luma range, saturated, then remapped to {1.0 down to 0.5}.
+  AH2 nz=AH2_(0.25)*bL+AH2_(0.25)*dL+AH2_(0.25)*fL+AH2_(0.25)*hL-eL;
+  nz=ASatH2(abs(nz)*APrxMedRcpH2(AMax3H2(AMax3H2(bL,dL,eL),fL,hL)-AMin3H2(AMin3H2(bL,dL,eL),fL,hL)));
+  nz=AH2_(-0.5)*nz+AH2_(1.0);
+  // Min and max of ring (the four outer taps, center excluded).
+  AH2 mn4R=min(AMin3H2(bR,dR,fR),hR);
+  AH2 mn4G=min(AMin3H2(bG,dG,fG),hG);
+  AH2 mn4B=min(AMin3H2(bB,dB,fB),hB);
+  AH2 mx4R=max(AMax3H2(bR,dR,fR),hR);
+  AH2 mx4G=max(AMax3H2(bG,dG,fG),hG);
+  AH2 mx4B=max(AMax3H2(bB,dB,fB),hB);
+  // Immediate constants for peak range: x is the signal peak (1.0), y is minus four times the peak.
+  AH2 peakC=AH2(1.0,-1.0*4.0);
+  // Limiters, these need to be high precision RCPs.
+  // hitMin/hitMax are the per channel lobe magnitudes at which the output would clip at 0 and at the peak.
+  AH2 hitMinR=min(mn4R,eR)*ARcpH2(AH2_(4.0)*mx4R);
+  AH2 hitMinG=min(mn4G,eG)*ARcpH2(AH2_(4.0)*mx4G);
+  AH2 hitMinB=min(mn4B,eB)*ARcpH2(AH2_(4.0)*mx4B);
+  AH2 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH2(AH2_(4.0)*mn4R+peakC.y);
+  AH2 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH2(AH2_(4.0)*mn4G+peakC.y);
+  AH2 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH2(AH2_(4.0)*mn4B+peakC.y);
+  AH2 lobeR=max(-hitMinR,hitMaxR);
+  AH2 lobeG=max(-hitMinG,hitMaxG);
+  AH2 lobeB=max(-hitMinB,hitMaxB);
+  AH2 lobe=max(AH2_(-FSR_RCAS_LIMIT),min(AMax3H2(lobeR,lobeG,lobeB),AH2_(0.0)))*AH2_(AH2_AU1(con.y).x); // Clamp to {-FSR_RCAS_LIMIT to 0}, scale by con.y's first half.
+  // Apply noise removal (only with FSR_RCAS_DENOISE; otherwise 'nz' is computed but unused).
+  #ifdef FSR_RCAS_DENOISE
+   lobe*=nz;
+  #endif
+  // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
+  // Per channel and per lane this is (lobe*(b+d+h+f)+e)/(4*lobe+1).
+  AH2 rcpL=APrxMedRcpH2(AH2_(4.0)*lobe+AH2_(1.0));
+  pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL;
+  pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL;
+  pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                          FSR - [LFGA] LINEAR FILM GRAIN APPLICATOR
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// Adding output-resolution film grain after scaling is a good way to mask both rendering and scaling artifacts.
+// Suggest using tiled blue noise as film grain input, with peak noise frequency set for a specific look and feel.
+// The 'Lfga*()' functions provide a convenient way to introduce grain.
+// These functions limit grain based on distance to signal limits.
+// This is done so that the grain is temporally energy preserving, and thus won't modify image tonality.
+// Grain application should be done in a linear colorspace.
+// The grain should be temporally changing, but have a temporal sum per pixel that adds to zero (non-biased).
+//------------------------------------------------------------------------------------------------------------------------------
+// Usage,
+//   FsrLfga*(
+//    color, // In/out linear colorspace color {0 to 1} ranged.
+//    grain, // Per pixel grain texture value {-0.5 to 0.5} ranged, input is 3-channel to support colored grain.
+//    amount); // Amount of grain {0 to 1} ranged.
+//------------------------------------------------------------------------------------------------------------------------------
+// Example if grain texture is monochrome: 'FsrLfgaF(color,AF3_(grain),amount)'
+//==============================================================================================================================
+#if defined(A_GPU)
+ // Adds grain 't' scaled by 'a' to 'c', limited per channel by the smaller distance from 'c' to 0.0 or 1.0.
+ void FsrLfgaF(inout AF3 c,AF3 t,AF1 a){AF3 headroom=min(AF3_(1.0)-c,c);AF3 grain=t*AF3_(a);c+=grain*headroom;}
+#endif
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(A_HALF)
+ // Half precision version (slower): same per channel limit as FsrLfgaF(), min(1.0-c,c).
+ void FsrLfgaH(inout AH3 c,AH3 t,AH1 a){AH3 headroom=min(AH3_(1.0)-c,c);AH3 grain=t*AH3_(a);c+=grain*headroom;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Packed half precision version (faster): each AH2 carries one color channel of two pixels.
+ void FsrLfgaHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 tR,AH2 tG,AH2 tB,AH1 a){
+  AH2 amt=AH2_(a);cR+=(tR*amt)*min(AH2_(1.0)-cR,cR);cG+=(tG*amt)*min(AH2_(1.0)-cG,cG);cB+=(tB*amt)*min(AH2_(1.0)-cB,cB);}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                          FSR - [SRTM] SIMPLE REVERSIBLE TONE-MAPPER
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// This provides a way to take linear HDR color {0 to FP16_MAX} and convert it into a temporary {0 to 1} ranged post-tonemapped linear.
+// The tonemapper preserves RGB ratio, which helps maintain HDR color bleed during filtering.
+//------------------------------------------------------------------------------------------------------------------------------
+// Reversible tonemapper usage,
+//  FsrSrtm*(color); // {0 to FP16_MAX} converted to {0 to 1}.
+//  FsrSrtmInv*(color); // {0 to 1} converted into {0 to 32768, output peak safe for FP16}.
+//==============================================================================================================================
+#if defined(A_GPU)
+ void FsrSrtmF(inout AF3 c){AF1 peak=AMax3F1(c.r,c.g,c.b);c*=AF3_(ARcpF1(peak+AF1_(1.0)));}
+ // Inverse: divide by (1.0 - largest channel); the max() keeps the divisor at 1.0/32768.0 or more, avoiding the /0 at c=1.0.
+ void FsrSrtmInvF(inout AF3 c){AF1 room=max(AF1_(1.0/32768.0),AF1_(1.0)-AMax3F1(c.r,c.g,c.b));c*=AF3_(ARcpF1(room));}
+#endif
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(A_HALF)
+ void FsrSrtmH(inout AH3 c){AH1 peak=AMax3H1(c.r,c.g,c.b);c*=AH3_(ARcpH1(peak+AH1_(1.0)));}
+ void FsrSrtmInvH(inout AH3 c){AH1 room=max(AH1_(1.0/32768.0),AH1_(1.0)-AMax3H1(c.r,c.g,c.b));c*=AH3_(ARcpH1(room));}
+// Packed variants: each AH2 carries one color channel of two pixels, and all three channels share one scale per pixel.
+ void FsrSrtmHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){
+  AH2 scale=ARcpH2(AMax3H2(cR,cG,cB)+AH2_(1.0));cR*=scale;cG*=scale;cB*=scale;}
+ void FsrSrtmInvHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){
+  AH2 scale=ARcpH2(max(AH2_(1.0/32768.0),AH2_(1.0)-AMax3H2(cR,cG,cB)));cR*=scale;cG*=scale;cB*=scale;}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                       FSR - [TEPD] TEMPORAL ENERGY PRESERVING DITHER
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion.
+// Gamma 2.0 is used so that the conversion back to linear is just to square the color.
+// The conversion comes in 8-bit and 10-bit modes, designed for output to 8-bit UNORM or 10:10:10:2 respectively.
+// Given good non-biased temporal blue noise as dither input,
+// the output dither will temporally conserve energy.
+// This is done by choosing the linear nearest step point instead of perceptual nearest.
+// See code below for details.
+//------------------------------------------------------------------------------------------------------------------------------
+// DX SPEC RULES FOR FLOAT->UNORM 8-BIT CONVERSION
+// ===============================================
+// - Output is 'uint(floor(saturate(n)*255.0+0.5))'.
+// - Thus rounding is to nearest.
+// - NaN gets converted to zero.
+// - INF is clamped to {0.0 to 1.0}.
+//==============================================================================================================================
+#if defined(A_GPU)
+ // Hand tuned mapping from integer position 'p' (X offset by 'f') to a dither value, richer than a simple checkerboard.
+ // Only 32-bit has enough precision for this computation.
+ // Output is {0 to <1}.
+ AF1 FsrTepdDitF(AU2 p,AU1 f){
+  AF1 col=AF1_(p.x+f);
+  AF1 row=AF1_(p.y);
+  // Column weight: the 1.61803 golden ratio.
+  AF1 golden=AF1_((1.0+sqrt(5.0))/2.0);
+  // Row weight: a number designed to provide a good visual pattern.
+  AF1 rowStep=AF1_(1.0/3.69);
+  // Only the fractional part of the weighted sum is returned.
+  return AFractF1(col*golden+(row*rowStep));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // 8-bit gamma 2.0 version.
+ // Input 'c' is {0 to 1}; 'dit' is the dither value used as the cutoff between the two nearest steps.
+ // Output 'c' is {0 to 1} ready for image store.
+ void FsrTepdC8F(inout AF3 c,AF1 dit){
+  AF3 lo=sqrt(c);
+  lo=floor(lo*AF3_(255.0))*AF3_(1.0/255.0);
+  AF3 loSq=lo*lo;
+  AF3 hiSq=lo+AF3_(1.0/255.0);hiSq=hiSq*hiSq;
+  // Ratio of 'loSq' to 'hiSq' required to produce 'c'.
+  // APrxLoRcpF1() won't work here (at least for very high dynamic ranges).
+  // APrxMedRcpF1() is an IADD,FMA,MUL.
+  AF3 ratio=(c-hiSq)*APrxMedRcpF3(loSq-hiSq);
+  // Use the ratio as a cutoff to choose the lower step 'lo' or the next step above it.
+  // AGtZeroF1() is a MUL.
+  c=ASatF3(lo+AGtZeroF3(AF3_(dit)-ratio)*AF3_(1.0/255.0));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // 10-bit gamma 2.0 version, identical to the 8-bit one except that the step size is 1.0/1023.0.
+ // Input 'c' is {0 to 1}; 'dit' is the dither value used as the cutoff between the two nearest steps.
+ // Output 'c' is {0 to 1} ready for image store.
+ void FsrTepdC10F(inout AF3 c,AF1 dit){
+  AF3 lo=sqrt(c);
+  lo=floor(lo*AF3_(1023.0))*AF3_(1.0/1023.0);
+  AF3 loSq=lo*lo;
+  AF3 hiSq=lo+AF3_(1.0/1023.0);hiSq=hiSq*hiSq;
+  AF3 ratio=(c-hiSq)*APrxMedRcpF3(loSq-hiSq);
+  c=ASatF3(lo+AGtZeroF3(AF3_(dit)-ratio)*AF3_(1.0/1023.0));}
+#endif
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(A_HALF)
+ // Half precision flavor of the dither value: evaluated with 32-bit floats, converted to AH1 only on return.
+ AH1 FsrTepdDitH(AU2 p,AU1 f){
+  AF1 col=AF1_(p.x+f);
+  AF1 row=AF1_(p.y);
+  AF1 golden=AF1_((1.0+sqrt(5.0))/2.0);
+  AF1 rowStep=AF1_(1.0/3.69);
+  return AH1(AFractF1(col*golden+(row*rowStep)));}
+ // 8-bit gamma 2.0 dither of 'c'; 'dit' is the cutoff that selects the lower step or the next one above it.
+ void FsrTepdC8H(inout AH3 c,AH1 dit){
+  AH3 lo=sqrt(c);
+  lo=floor(lo*AH3_(255.0))*AH3_(1.0/255.0);
+  AH3 loSq=lo*lo;
+  AH3 hiSq=lo+AH3_(1.0/255.0);hiSq=hiSq*hiSq;
+  AH3 ratio=(c-hiSq)*APrxMedRcpH3(loSq-hiSq);
+  c=ASatH3(lo+AGtZeroH3(AH3_(dit)-ratio)*AH3_(1.0/255.0));}
+ // 10-bit gamma 2.0 dither of 'c'; same steps as the 8-bit version with a step size of 1.0/1023.0.
+ void FsrTepdC10H(inout AH3 c,AH1 dit){
+  AH3 lo=sqrt(c);
+  lo=floor(lo*AH3_(1023.0))*AH3_(1.0/1023.0);
+  AH3 loSq=lo*lo;
+  AH3 hiSq=lo+AH3_(1.0/1023.0);hiSq=hiSq*hiSq;
+  AH3 ratio=(c-hiSq)*APrxMedRcpH3(loSq-hiSq);
+  c=ASatH3(lo+AGtZeroH3(AH3_(dit)-ratio)*AH3_(1.0/1023.0));}
+//==============================================================================================================================
+ // This computes dither for positions 'p' and 'p+{8,0}'.
+ AH2 FsrTepdDitHx2(AU2 p,AU1 f){
+  // Lane x is the column of 'p' offset by 'f'; lane y is the column 8 pixels further along X.
+  AF1 col0=AF1_(p.x+f);
+  AF2 cols=AF2(col0,col0+AF1_(8.0));
+  AF1 row=AF1_(p.y);
+  AF1 golden=AF1_((1.0+sqrt(5.0))/2.0);
+  AF1 rowStep=AF1_(1.0/3.69);
+  // Same weights as the scalar dither, applied to both lanes before taking the fractional part.
+  return AH2(AFractF2(cols*AF2_(golden)+AF2_(row*rowStep)));}
+ // Packed 8-bit gamma 2.0 dither: each AH2 carries one channel of two pixels; channels are processed independently.
+ void FsrTepdC8Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){
+  AH2 loR=sqrt(cR);
+  loR=floor(loR*AH2_(255.0))*AH2_(1.0/255.0);
+  AH2 loSqR=loR*loR;
+  AH2 hiSqR=loR+AH2_(1.0/255.0);hiSqR=hiSqR*hiSqR;
+  AH2 ratioR=(cR-hiSqR)*APrxMedRcpH2(loSqR-hiSqR);
+  cR=ASatH2(loR+AGtZeroH2(dit-ratioR)*AH2_(1.0/255.0));
+  AH2 loG=sqrt(cG);
+  loG=floor(loG*AH2_(255.0))*AH2_(1.0/255.0);
+  AH2 loSqG=loG*loG;
+  AH2 hiSqG=loG+AH2_(1.0/255.0);hiSqG=hiSqG*hiSqG;
+  AH2 ratioG=(cG-hiSqG)*APrxMedRcpH2(loSqG-hiSqG);
+  cG=ASatH2(loG+AGtZeroH2(dit-ratioG)*AH2_(1.0/255.0));
+  AH2 loB=sqrt(cB);
+  loB=floor(loB*AH2_(255.0))*AH2_(1.0/255.0);
+  AH2 loSqB=loB*loB;
+  AH2 hiSqB=loB+AH2_(1.0/255.0);hiSqB=hiSqB*hiSqB;
+  AH2 ratioB=(cB-hiSqB)*APrxMedRcpH2(loSqB-hiSqB);
+  cB=ASatH2(loB+AGtZeroH2(dit-ratioB)*AH2_(1.0/255.0));}
+ // Packed 10-bit gamma 2.0 dither: same as the packed 8-bit version with a step size of 1.0/1023.0.
+ void FsrTepdC10Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){
+  AH2 loR=sqrt(cR);
+  loR=floor(loR*AH2_(1023.0))*AH2_(1.0/1023.0);
+  AH2 loSqR=loR*loR;
+  AH2 hiSqR=loR+AH2_(1.0/1023.0);hiSqR=hiSqR*hiSqR;
+  AH2 ratioR=(cR-hiSqR)*APrxMedRcpH2(loSqR-hiSqR);
+  cR=ASatH2(loR+AGtZeroH2(dit-ratioR)*AH2_(1.0/1023.0));
+  AH2 loG=sqrt(cG);
+  loG=floor(loG*AH2_(1023.0))*AH2_(1.0/1023.0);
+  AH2 loSqG=loG*loG;
+  AH2 hiSqG=loG+AH2_(1.0/1023.0);hiSqG=hiSqG*hiSqG;
+  AH2 ratioG=(cG-hiSqG)*APrxMedRcpH2(loSqG-hiSqG);
+  cG=ASatH2(loG+AGtZeroH2(dit-ratioG)*AH2_(1.0/1023.0));
+  AH2 loB=sqrt(cB);
+  loB=floor(loB*AH2_(1023.0))*AH2_(1.0/1023.0);
+  AH2 loSqB=loB*loB;
+  AH2 hiSqB=loB+AH2_(1.0/1023.0);hiSqB=hiSqB*hiSqB;
+  AH2 ratioB=(cB-hiSqB)*APrxMedRcpH2(loSqB-hiSqB);
+  cB=ASatH2(loB+AGtZeroH2(dit-ratioB)*AH2_(1.0/1023.0));}
+#endif
diff --git a/Ryujinx.Graphics.OpenGL/Effects/Shaders/fsr_scaling.glsl b/Ryujinx.Graphics.OpenGL/Effects/Shaders/fsr_scaling.glsl
new file mode 100644
index 0000000000..8e8755db20
--- /dev/null
+++ b/Ryujinx.Graphics.OpenGL/Effects/Shaders/fsr_scaling.glsl
@@ -0,0 +1,88 @@
+#version 430 core
+precision mediump float;
+layout (local_size_x = 64) in; // 64 invocations per work group; main() handles four pixels per invocation
+layout(rgba8, binding = 0, location=0) uniform image2D imgOutput; // destination image, written with imageStore()
+layout( location=1 ) uniform sampler2D Source; // input texture, read by the FsrEasu*F gather callbacks
+layout( location=2 ) uniform float srcX0; // source extents: main() only uses |srcX1 - srcX0| and |srcY1 - srcY0|
+layout( location=3 ) uniform float srcX1;
+layout( location=4 ) uniform float srcY0;
+layout( location=5 ) uniform float srcY1;
+layout( location=6 ) uniform float dstX0; // destination extents: their order selects mirroring in translateDest()
+layout( location=7 ) uniform float dstX1;
+layout( location=8 ) uniform float dstY0;
+layout( location=9 ) uniform float dstY1;
+layout( location=10 ) uniform float scaleX; // factors applied to gather positions in translate()
+layout( location=11 ) uniform float scaleY;
+
+#define A_GPU 1
+#define A_GLSL 1
+#include "ffx_a.h"
+
+#define FSR_EASU_F 1
+AU4 con0, con1, con2, con3; // EASU constants, filled by FsrEasuCon() in main()
+float srcW, srcH, dstW, dstH; // absolute source and destination sizes, computed in main()
+vec2 bLeft, tRight; // destination rectangle as (min corner, max corner), stored by setBounds()
+
+AF2 translate(AF2 pos) {
+    return pos * AF2(scaleX, scaleY); // component-wise scale of the position by the scaleX / scaleY uniforms
+}
+
+void setBounds(vec2 bottomLeft, vec2 topRight) {
+    tRight = topRight;   // upper corner, excluded by insideBox()
+    bLeft = bottomLeft;  // lower corner, included by insideBox() and subtracted from positions in CurrFilter()
+}
+
+// Returns the store position for 'pos', mirrored inside the destination rectangle on every axis whose extents are reversed.
+AF2 translateDest(AF2 pos) {
+    // Mirroring is (lo + hi - p - 1) so the result stays inside [lo, hi); a bare (dstX1 - p) lands at or below zero.
+    AF2 mirrored = AF2(dstX0 + dstX1 - pos.x - 1, dstY0 + dstY1 - pos.y - 1);
+    return AF2(dstX1 < dstX0 ? mirrored.x : pos.x, dstY0 > dstY1 ? mirrored.y : pos.y);
+}
+
+AF4 FsrEasuRF(AF2 p) { return textureGather(Source, translate(p), 0); } // EASU callback: gathers component 0 (red)
+AF4 FsrEasuGF(AF2 p) { return textureGather(Source, translate(p), 1); } // EASU callback: gathers component 1 (green)
+AF4 FsrEasuBF(AF2 p) { return textureGather(Source, translate(p), 2); } // EASU callback: gathers component 2 (blue)
+
+#include "ffx_fsr1.h"
+
+float insideBox(vec2 v) {
+    float inX = step(bLeft.x, v.x) - step(tRight.x, v.x); // 1.0 when bLeft.x <= v.x < tRight.x, otherwise 0.0
+    return inX * (step(bLeft.y, v.y) - step(tRight.y, v.y)); // same test on Y; the product is 1.0 only if both pass
+}
+
+// Writes one output pixel: the EASU result inside the destination rectangle, opaque black everywhere else.
+void CurrFilter(AU2 pos) {
+    if (insideBox(vec2(pos)) != 0) {
+        AF3 color;
+        FsrEasuF(color, AU2(pos.x - bLeft.x, pos.y - bLeft.y), con0, con1, con2, con3);
+        imageStore(imgOutput, ASU2(translateDest(pos)), AF4(color, 1));
+    } else {
+        imageStore(imgOutput, ASU2(pos), AF4(0, 0, 0, 1));
+    }
+}
+
+void main() {
+    // Sizes ignore direction; reversed extents are handled separately by translateDest().
+    srcW = abs(srcX1 - srcX0);
+    srcH = abs(srcY1 - srcY0);
+    dstW = abs(dstX1 - dstX0);
+    dstH = abs(dstY1 - dstY0);
+
+    // Store the destination rectangle as (min corner, max corner) whatever the order of its extents.
+    vec2 dstStart = vec2(dstX0, dstY0);
+    vec2 dstEnd = vec2(dstX1, dstY1);
+    setBounds(min(dstEnd, dstStart), max(dstStart, dstEnd));
+
+    // Upscaling constants.
+    FsrEasuCon(con0, con1, con2, con3,
+        srcW, srcH,  // Viewport size (top left aligned) in the input image which is to be scaled.
+        srcW, srcH,  // The size of the input image.
+        dstW, dstH); // The output resolution.
+
+    // Work group origin is its ID times 16; the four pixels handled here are 8 apart in X and/or Y.
+    AU2 base = ARmp8x8(gl_LocalInvocationID.x) + AU2(gl_WorkGroupID.x << 4u, gl_WorkGroupID.y << 4u);
+    CurrFilter(base);
+    CurrFilter(base + AU2(8u, 0u));
+    CurrFilter(base + AU2(8u, 8u));
+    CurrFilter(base + AU2(0u, 8u));
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.OpenGL/Effects/Shaders/fsr_sharpening.glsl b/Ryujinx.Graphics.OpenGL/Effects/Shaders/fsr_sharpening.glsl
new file mode 100644
index 0000000000..d3b98729a9
--- /dev/null
+++ b/Ryujinx.Graphics.OpenGL/Effects/Shaders/fsr_sharpening.glsl
@@ -0,0 +1,37 @@
+#version 430 core
+precision mediump float;
+layout (local_size_x = 64) in; // 64 invocations per workgroup along x
+layout(rgba8, binding = 0, location=0) uniform image2D imgOutput; // rgba8 destination image, written with imageStore
+layout( location=1 ) uniform sampler2D source; // input texture, read with texelFetch in FsrRcasLoadF
+layout( location=2 ) uniform float sharpening; // handed to FsrRcasCon in main() to build con0
+
+#define A_GPU 1
+#define A_GLSL 1
+#include "ffx_a.h"
+
+#define FSR_RCAS_F 1
+AU4 con0;
+
+AF4 FsrRcasLoadF(ASU2 p) { return AF4(texelFetch(source, clamp(p, ASU2(0), textureSize(source, 0) - ASU2(1)), 0)); } // clamp to edge: texelFetch is undefined outside the texture, and neighbour taps at the borders land there
+void FsrRcasInputF(inout AF1 r, inout AF1 g, inout AF1 b) {} // intentionally empty: r, g and b are left unchanged
+
+#include "ffx_fsr1.h"
+
+// Runs FsrRcasF for the texel at pos and stores the result, with alpha 1, at the same coordinate.
+void CurrFilter(AU2 pos)
+{
+    AF1 red, green, blue; FsrRcasF(red, green, blue, pos, con0);
+    imageStore(imgOutput, ASU2(pos), AF4(red, green, blue, 1));
+}
+
+// Entry point: builds the RCAS constants, then filters four texels of this workgroup's 16x16 tile.
+void main() {
+    FsrRcasCon(con0, sharpening);
+
+    // Base texel: value returned by ARmp8x8 for this invocation plus the tile origin (workgroup id * 16).
+    AU2 base = ARmp8x8(gl_LocalInvocationID.x) + AU2(gl_WorkGroupID.x << 4u, gl_WorkGroupID.y << 4u);
+    CurrFilter(base);               // offset (0, 0)
+    CurrFilter(base + AU2(8u, 0u)); // offset (8, 0)
+    CurrFilter(base + AU2(8u, 8u)); // offset (8, 8)
+    CurrFilter(base + AU2(0u, 8u)); // offset (0, 8)
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.OpenGL/Effects/Shaders/fxaa.glsl b/Ryujinx.Graphics.OpenGL/Effects/Shaders/fxaa.glsl
new file mode 100644
index 0000000000..8bdcbca693
--- /dev/null
+++ b/Ryujinx.Graphics.OpenGL/Effects/Shaders/fxaa.glsl
@@ -0,0 +1,1174 @@
+/*============================================================================
+
+
+                    NVIDIA FXAA 3.11 by TIMOTHY LOTTES
+
+
+------------------------------------------------------------------------------
+COPYRIGHT (C) 2010, 2011 NVIDIA CORPORATION. ALL RIGHTS RESERVED.
+------------------------------------------------------------------------------
+TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, THIS SOFTWARE IS PROVIDED
+*AS IS* AND NVIDIA AND ITS SUPPLIERS DISCLAIM ALL WARRANTIES, EITHER EXPRESS
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT SHALL NVIDIA
+OR ITS SUPPLIERS BE LIABLE FOR ANY SPECIAL, INCIDENTAL, INDIRECT, OR
+CONSEQUENTIAL DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR
+LOSS OF BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION,
+OR ANY OTHER PECUNIARY LOSS) ARISING OUT OF THE USE OF OR INABILITY TO USE
+THIS SOFTWARE, EVEN IF NVIDIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+------------------------------------------------------------------------------
+                           INTEGRATION CHECKLIST
+------------------------------------------------------------------------------
+(1.)
+In the shader source, setup defines for the desired configuration.
+When providing multiple shaders (for different presets),
+simply setup the defines differently in multiple files.
+Example,
+
+  #define FXAA_PC 1
+  #define FXAA_HLSL_5 1
+  #define FXAA_QUALITY__PRESET 12
+
+Or,
+
+  #define FXAA_360 1
+  
+Or,
+
+  #define FXAA_PS3 1
+  
+Etc.
+
+(2.)
+Then include this file,
+
+  #include "Fxaa3_11.h"
+
+(3.)
+Then call the FXAA pixel shader from within your desired shader.
+Look at the FXAA Quality FxaaPixelShader() for docs on inputs.
+As for FXAA 3.11 all inputs for all shaders are the same 
+to enable easy porting between platforms.
+
+  return FxaaPixelShader(...);
+
+(4.)
+Insure pass prior to FXAA outputs RGBL (see next section).
+Or use,
+
+  #define FXAA_GREEN_AS_LUMA 1
+
+(5.)
+Setup engine to provide the following constants
+which are used in the FxaaPixelShader() inputs,
+
+  FxaaFloat2 fxaaQualityRcpFrame,
+  FxaaFloat4 fxaaConsoleRcpFrameOpt,
+  FxaaFloat4 fxaaConsoleRcpFrameOpt2,
+  FxaaFloat4 fxaaConsole360RcpFrameOpt2,
+  FxaaFloat fxaaQualitySubpix,
+  FxaaFloat fxaaQualityEdgeThreshold,
+  FxaaFloat fxaaQualityEdgeThresholdMin,
+  FxaaFloat fxaaConsoleEdgeSharpness,
+  FxaaFloat fxaaConsoleEdgeThreshold,
+  FxaaFloat fxaaConsoleEdgeThresholdMin,
+  FxaaFloat4 fxaaConsole360ConstDir
+
+Look at the FXAA Quality FxaaPixelShader() for docs on inputs.
+
+(6.)
+Have FXAA vertex shader run as a full screen triangle,
+and output "pos" and "fxaaConsolePosPos" 
+such that inputs in the pixel shader provide,
+
+  // {xy} = center of pixel
+  FxaaFloat2 pos,
+
+  // {xy__} = upper left of pixel
+  // {__zw} = lower right of pixel
+  FxaaFloat4 fxaaConsolePosPos,
+
+(7.)
+Insure the texture sampler(s) used by FXAA are set to bilinear filtering.
+
+
+------------------------------------------------------------------------------
+                    INTEGRATION - RGBL AND COLORSPACE
+------------------------------------------------------------------------------
+FXAA3 requires RGBL as input unless the following is set, 
+
+  #define FXAA_GREEN_AS_LUMA 1
+
+In which case the engine uses green in place of luma,
+and requires RGB input is in a non-linear colorspace.
+
+RGB should be LDR (low dynamic range).
+Specifically do FXAA after tonemapping.
+
+RGB data as returned by a texture fetch can be non-linear,
+or linear when FXAA_GREEN_AS_LUMA is not set.
+Note an "sRGB format" texture counts as linear,
+because the result of a texture fetch is linear data.
+Regular "RGBA8" textures in the sRGB colorspace are non-linear.
+
+If FXAA_GREEN_AS_LUMA is not set,
+luma must be stored in the alpha channel prior to running FXAA.
+This luma should be in a perceptual space (could be gamma 2.0).
+Example pass before FXAA where output is gamma 2.0 encoded,
+
+  color.rgb = ToneMap(color.rgb); // linear color output
+  color.rgb = sqrt(color.rgb);    // gamma 2.0 color output
+  return color;
+
+To use FXAA,
+
+  color.rgb = ToneMap(color.rgb);  // linear color output
+  color.rgb = sqrt(color.rgb);     // gamma 2.0 color output
+  color.a = dot(color.rgb, FxaaFloat3(0.299, 0.587, 0.114)); // compute luma
+  return color;
+
+Another example where output is linear encoded,
+say for instance writing to an sRGB formatted render target,
+where the render target does the conversion back to sRGB after blending,
+
+  color.rgb = ToneMap(color.rgb); // linear color output
+  return color;
+
+To use FXAA,
+
+  color.rgb = ToneMap(color.rgb); // linear color output
+  color.a = sqrt(dot(color.rgb, FxaaFloat3(0.299, 0.587, 0.114))); // compute luma
+  return color;
+
+Getting luma correct is required for the algorithm to work correctly.
+
+
+------------------------------------------------------------------------------
+                          BEING LINEARLY CORRECT?
+------------------------------------------------------------------------------
+Applying FXAA to a framebuffer with linear RGB color will look worse.
+This is very counter intuitive, but happens to be true in this case.
+The reason is because dithering artifacts will be more visible 
+in a linear colorspace.
+
+
+------------------------------------------------------------------------------
+                             COMPLEX INTEGRATION
+------------------------------------------------------------------------------
+Q. What if the engine is blending into RGB before wanting to run FXAA?
+
+A. In the last opaque pass prior to FXAA,
+   have the pass write out luma into alpha.
+   Then blend into RGB only.
+   FXAA should be able to run ok
+   assuming the blending pass did not add any aliasing.
+   This should be the common case for particles and common blending passes.
+
+A. Or use FXAA_GREEN_AS_LUMA.
+
+============================================================================*/
+
+#version 430 core
+
+layout(local_size_x = 16, local_size_y = 16) in; // 16x16 invocations per workgroup
+layout(rgba8, binding = 0) uniform image2D imgOutput; // rgba8 output image bound at image unit 0
+
+uniform sampler2D inputTexture; // NOTE(review): presumably the frame to anti-alias; its use is outside this chunk
+layout(location=0) uniform vec2 invResolution; // NOTE(review): by name, 1.0 / resolution in pixels; confirm against the host code
+
+#define FXAA_QUALITY__PRESET 12
+#define FXAA_GREEN_AS_LUMA 1
+#define FXAA_PC 1
+#define FXAA_GLSL_130 1
+
+
+/*============================================================================
+
+                             INTEGRATION KNOBS
+
+============================================================================*/
+#ifndef FXAA_PC
+    //
+    // FXAA Quality
+    // The high quality PC algorithm.
+    //
+    #define FXAA_PC 0
+#endif
+/*--------------------------------------------------------------------------*/
+#ifndef FXAA_GLSL_120
+    #define FXAA_GLSL_120 0
+#endif
+/*--------------------------------------------------------------------------*/
+#ifndef FXAA_GLSL_130
+    #define FXAA_GLSL_130 0
+#endif
+/*==========================================================================*/
+#ifndef FXAA_GREEN_AS_LUMA
+    //
+    // For those using non-linear color,
+    // and either not able to get luma in alpha, or not wanting to,
+    // this enables FXAA to run using green as a proxy for luma.
+    // So with this enabled, no need to pack luma in alpha.
+    //
+    // This will turn off AA on anything which lacks some amount of green.
+    // Pure red and blue or combination of only R and B, will get no AA.
+    //
+    // Might want to lower the settings for both,
+    //    fxaaConsoleEdgeThresholdMin
+    //    fxaaQualityEdgeThresholdMin
+    // In order to insure AA does not get turned off on colors 
+    // which contain a minor amount of green.
+    //
+    // 1 = On.
+    // 0 = Off.
+    //
+    #define FXAA_GREEN_AS_LUMA 0
+#endif
+/*--------------------------------------------------------------------------*/
+#ifndef FXAA_EARLY_EXIT
+    //
+    // Controls algorithm's early exit path.
+    // On PS3 turning this ON adds 2 cycles to the shader.
+    // On 360 turning this OFF adds 10ths of a millisecond to the shader.
+    // Turning this off on console will result in a more blurry image.
+    // So this defaults to on.
+    //
+    // 1 = On.
+    // 0 = Off.
+    //
+    #define FXAA_EARLY_EXIT 1
+#endif
+/*--------------------------------------------------------------------------*/
+#ifndef FXAA_DISCARD
+    //
+    // Only valid for PC OpenGL currently.
+    // Probably will not work when FXAA_GREEN_AS_LUMA = 1.
+    //
+    // 1 = Use discard on pixels which don't need AA.
+    //     For APIs which enable concurrent TEX+ROP from same surface.
+    // 0 = Return unchanged color on pixels which don't need AA.
+    //
+    #define FXAA_DISCARD 0
+#endif
+/*--------------------------------------------------------------------------*/
+#ifndef FXAA_FAST_PIXEL_OFFSET
+    //
+    // Used for GLSL 120 only.
+    //
+    // 1 = GL API supports fast pixel offsets
+    // 0 = do not use fast pixel offsets
+    //
+    #ifdef GL_EXT_gpu_shader4
+        #define FXAA_FAST_PIXEL_OFFSET 1
+    #endif
+    #ifdef GL_NV_gpu_shader5
+        #define FXAA_FAST_PIXEL_OFFSET 1
+    #endif
+    #ifdef GL_ARB_gpu_shader5
+        #define FXAA_FAST_PIXEL_OFFSET 1
+    #endif
+    #ifndef FXAA_FAST_PIXEL_OFFSET
+        #define FXAA_FAST_PIXEL_OFFSET 0
+    #endif
+#endif
+/*--------------------------------------------------------------------------*/
+#ifndef FXAA_GATHER4_ALPHA
+    // 1 = API supports gather4 on alpha channel. 0 = API does not support gather4 on alpha channel.
+    // FXAA_HLSL_5 is tested behind #ifdef: GLSL does not allow undefined macro names inside #if expressions.
+    #ifdef FXAA_HLSL_5
+        #if (FXAA_HLSL_5 == 1)
+            #define FXAA_GATHER4_ALPHA 1
+        #endif
+    #endif
+    #ifdef GL_ARB_gpu_shader5
+        #define FXAA_GATHER4_ALPHA 1
+    #endif
+    #ifdef GL_NV_gpu_shader5
+        #define FXAA_GATHER4_ALPHA 1
+    #endif
+    #ifndef FXAA_GATHER4_ALPHA
+        #define FXAA_GATHER4_ALPHA 0
+    #endif
+#endif
+
+/*============================================================================
+                        FXAA QUALITY - TUNING KNOBS
+------------------------------------------------------------------------------
+NOTE the other tuning knobs are now in the shader function inputs!
+============================================================================*/
+#ifndef FXAA_QUALITY__PRESET
+    //
+    // Choose the quality preset.
+    // This needs to be compiled into the shader as it effects code.
+    // Best option to include multiple presets is to 
+    // in each shader define the preset, then include this file.
+    // 
+    // OPTIONS
+    // -----------------------------------------------------------------------
+    // 10 to 15 - default medium dither (10=fastest, 15=highest quality)
+    // 20 to 29 - less dither, more expensive (20=fastest, 29=highest quality)
+    // 39       - no dither, very expensive 
+    //
+    // NOTES
+    // -----------------------------------------------------------------------
+    // 12 = slightly faster then FXAA 3.9 and higher edge quality (default)
+    // 13 = about same speed as FXAA 3.9 and better than 12
+    // 23 = closest to FXAA 3.9 visually and performance wise
+    //  _ = the lowest digit is directly related to performance
+    // _  = the highest digit is directly related to style
+    // 
+    #define FXAA_QUALITY__PRESET 12
+#endif
+
+
+/*============================================================================
+
+                           FXAA QUALITY - PRESETS
+
+============================================================================*/
+
+/*============================================================================
+                     FXAA QUALITY - MEDIUM DITHER PRESETS
+============================================================================*/
+#if (FXAA_QUALITY__PRESET == 10)
+    #define FXAA_QUALITY__PS 3
+    #define FXAA_QUALITY__P0 1.5
+    #define FXAA_QUALITY__P1 3.0
+    #define FXAA_QUALITY__P2 12.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY__PRESET == 11)
+    #define FXAA_QUALITY__PS 4
+    #define FXAA_QUALITY__P0 1.0
+    #define FXAA_QUALITY__P1 1.5
+    #define FXAA_QUALITY__P2 3.0
+    #define FXAA_QUALITY__P3 12.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY__PRESET == 12)
+    #define FXAA_QUALITY__PS 5
+    #define FXAA_QUALITY__P0 1.0
+    #define FXAA_QUALITY__P1 1.5
+    #define FXAA_QUALITY__P2 2.0
+    #define FXAA_QUALITY__P3 4.0
+    #define FXAA_QUALITY__P4 12.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY__PRESET == 13)
+    #define FXAA_QUALITY__PS 6
+    #define FXAA_QUALITY__P0 1.0
+    #define FXAA_QUALITY__P1 1.5
+    #define FXAA_QUALITY__P2 2.0
+    #define FXAA_QUALITY__P3 2.0
+    #define FXAA_QUALITY__P4 4.0
+    #define FXAA_QUALITY__P5 12.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY__PRESET == 14)
+    #define FXAA_QUALITY__PS 7
+    #define FXAA_QUALITY__P0 1.0
+    #define FXAA_QUALITY__P1 1.5
+    #define FXAA_QUALITY__P2 2.0
+    #define FXAA_QUALITY__P3 2.0
+    #define FXAA_QUALITY__P4 2.0
+    #define FXAA_QUALITY__P5 4.0
+    #define FXAA_QUALITY__P6 12.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY__PRESET == 15)
+    #define FXAA_QUALITY__PS 8
+    #define FXAA_QUALITY__P0 1.0
+    #define FXAA_QUALITY__P1 1.5
+    #define FXAA_QUALITY__P2 2.0
+    #define FXAA_QUALITY__P3 2.0
+    #define FXAA_QUALITY__P4 2.0
+    #define FXAA_QUALITY__P5 2.0
+    #define FXAA_QUALITY__P6 4.0
+    #define FXAA_QUALITY__P7 12.0
+#endif
+
+/*============================================================================
+                     FXAA QUALITY - LOW DITHER PRESETS
+============================================================================*/
+#if (FXAA_QUALITY__PRESET == 20)
+    #define FXAA_QUALITY__PS 3
+    #define FXAA_QUALITY__P0 1.5
+    #define FXAA_QUALITY__P1 2.0
+    #define FXAA_QUALITY__P2 8.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY__PRESET == 21)
+    #define FXAA_QUALITY__PS 4
+    #define FXAA_QUALITY__P0 1.0
+    #define FXAA_QUALITY__P1 1.5
+    #define FXAA_QUALITY__P2 2.0
+    #define FXAA_QUALITY__P3 8.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY__PRESET == 22)
+    #define FXAA_QUALITY__PS 5
+    #define FXAA_QUALITY__P0 1.0
+    #define FXAA_QUALITY__P1 1.5
+    #define FXAA_QUALITY__P2 2.0
+    #define FXAA_QUALITY__P3 2.0
+    #define FXAA_QUALITY__P4 8.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY__PRESET == 23)
+    #define FXAA_QUALITY__PS 6
+    #define FXAA_QUALITY__P0 1.0
+    #define FXAA_QUALITY__P1 1.5
+    #define FXAA_QUALITY__P2 2.0
+    #define FXAA_QUALITY__P3 2.0
+    #define FXAA_QUALITY__P4 2.0
+    #define FXAA_QUALITY__P5 8.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY__PRESET == 24)
+    #define FXAA_QUALITY__PS 7
+    #define FXAA_QUALITY__P0 1.0
+    #define FXAA_QUALITY__P1 1.5
+    #define FXAA_QUALITY__P2 2.0
+    #define FXAA_QUALITY__P3 2.0
+    #define FXAA_QUALITY__P4 2.0
+    #define FXAA_QUALITY__P5 3.0
+    #define FXAA_QUALITY__P6 8.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY__PRESET == 25)
+    #define FXAA_QUALITY__PS 8
+    #define FXAA_QUALITY__P0 1.0
+    #define FXAA_QUALITY__P1 1.5
+    #define FXAA_QUALITY__P2 2.0
+    #define FXAA_QUALITY__P3 2.0
+    #define FXAA_QUALITY__P4 2.0
+    #define FXAA_QUALITY__P5 2.0
+    #define FXAA_QUALITY__P6 4.0
+    #define FXAA_QUALITY__P7 8.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY__PRESET == 26)
+    #define FXAA_QUALITY__PS 9
+    #define FXAA_QUALITY__P0 1.0
+    #define FXAA_QUALITY__P1 1.5
+    #define FXAA_QUALITY__P2 2.0
+    #define FXAA_QUALITY__P3 2.0
+    #define FXAA_QUALITY__P4 2.0
+    #define FXAA_QUALITY__P5 2.0
+    #define FXAA_QUALITY__P6 2.0
+    #define FXAA_QUALITY__P7 4.0
+    #define FXAA_QUALITY__P8 8.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY__PRESET == 27)
+    #define FXAA_QUALITY__PS 10
+    #define FXAA_QUALITY__P0 1.0
+    #define FXAA_QUALITY__P1 1.5
+    #define FXAA_QUALITY__P2 2.0
+    #define FXAA_QUALITY__P3 2.0
+    #define FXAA_QUALITY__P4 2.0
+    #define FXAA_QUALITY__P5 2.0
+    #define FXAA_QUALITY__P6 2.0
+    #define FXAA_QUALITY__P7 2.0
+    #define FXAA_QUALITY__P8 4.0
+    #define FXAA_QUALITY__P9 8.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY__PRESET == 28)
+    #define FXAA_QUALITY__PS 11
+    #define FXAA_QUALITY__P0 1.0
+    #define FXAA_QUALITY__P1 1.5
+    #define FXAA_QUALITY__P2 2.0
+    #define FXAA_QUALITY__P3 2.0
+    #define FXAA_QUALITY__P4 2.0
+    #define FXAA_QUALITY__P5 2.0
+    #define FXAA_QUALITY__P6 2.0
+    #define FXAA_QUALITY__P7 2.0
+    #define FXAA_QUALITY__P8 2.0
+    #define FXAA_QUALITY__P9 4.0
+    #define FXAA_QUALITY__P10 8.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY__PRESET == 29)
+    #define FXAA_QUALITY__PS 12
+    #define FXAA_QUALITY__P0 1.0
+    #define FXAA_QUALITY__P1 1.5
+    #define FXAA_QUALITY__P2 2.0
+    #define FXAA_QUALITY__P3 2.0
+    #define FXAA_QUALITY__P4 2.0
+    #define FXAA_QUALITY__P5 2.0
+    #define FXAA_QUALITY__P6 2.0
+    #define FXAA_QUALITY__P7 2.0
+    #define FXAA_QUALITY__P8 2.0
+    #define FXAA_QUALITY__P9 2.0
+    #define FXAA_QUALITY__P10 4.0
+    #define FXAA_QUALITY__P11 8.0
+#endif
+
+/*============================================================================
+                     FXAA QUALITY - EXTREME QUALITY
+============================================================================*/
+#if (FXAA_QUALITY__PRESET == 39)
+    #define FXAA_QUALITY__PS 12
+    #define FXAA_QUALITY__P0 1.0
+    #define FXAA_QUALITY__P1 1.0
+    #define FXAA_QUALITY__P2 1.0
+    #define FXAA_QUALITY__P3 1.0
+    #define FXAA_QUALITY__P4 1.0
+    #define FXAA_QUALITY__P5 1.5
+    #define FXAA_QUALITY__P6 2.0
+    #define FXAA_QUALITY__P7 2.0
+    #define FXAA_QUALITY__P8 2.0
+    #define FXAA_QUALITY__P9 2.0
+    #define FXAA_QUALITY__P10 4.0
+    #define FXAA_QUALITY__P11 8.0
+#endif
+
+
+
+/*============================================================================
+
+                                API PORTING
+
+============================================================================*/
+#if (FXAA_GLSL_120 == 1) || (FXAA_GLSL_130 == 1)
+    #define FxaaBool bool
+    #define FxaaDiscard discard
+    #define FxaaFloat float
+    #define FxaaFloat2 vec2
+    #define FxaaFloat3 vec3
+    #define FxaaFloat4 vec4
+    #define FxaaHalf float
+    #define FxaaHalf2 vec2
+    #define FxaaHalf3 vec3
+    #define FxaaHalf4 vec4
+    #define FxaaInt2 ivec2
+    #define FxaaSat(x) clamp(x, 0.0, 1.0)
+    #define FxaaTex sampler2D
+#else
+    #define FxaaBool bool
+    #define FxaaDiscard clip(-1)
+    #define FxaaFloat float
+    #define FxaaFloat2 float2
+    #define FxaaFloat3 float3
+    #define FxaaFloat4 float4
+    #define FxaaHalf half
+    #define FxaaHalf2 half2
+    #define FxaaHalf3 half3
+    #define FxaaHalf4 half4
+    #define FxaaSat(x) saturate(x)
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_GLSL_120 == 1)
+    // Requires,
+    //  #version 120
+    // And at least,
+    //  #extension GL_EXT_gpu_shader4 : enable
+    //  (or set FXAA_FAST_PIXEL_OFFSET 1 to work like DX9)
+    #define FxaaTexTop(t, p) texture2DLod(t, p, 0.0)
+    #if (FXAA_FAST_PIXEL_OFFSET == 1)
+        #define FxaaTexOff(t, p, o, r) texture2DLodOffset(t, p, 0.0, o)
+    #else
+        #define FxaaTexOff(t, p, o, r) texture2DLod(t, p + (o * r), 0.0)
+    #endif
+    #if (FXAA_GATHER4_ALPHA == 1)
+        // use #extension GL_ARB_gpu_shader5 : enable
+        #define FxaaTexAlpha4(t, p) textureGather(t, p, 3)
+        #define FxaaTexOffAlpha4(t, p, o) textureGatherOffset(t, p, o, 3)
+        #define FxaaTexGreen4(t, p) textureGather(t, p, 1)
+        #define FxaaTexOffGreen4(t, p, o) textureGatherOffset(t, p, o, 1)
+    #endif
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_GLSL_130 == 1)
+    // Requires "#version 130" or better
+    #define FxaaTexTop(t, p) textureLod(t, p, 0.0)
+    #define FxaaTexOff(t, p, o, r) textureLodOffset(t, p, 0.0, o)
+    #if (FXAA_GATHER4_ALPHA == 1)
+        // use #extension GL_ARB_gpu_shader5 : enable
+        #define FxaaTexAlpha4(t, p) textureGather(t, p, 3)
+        #define FxaaTexOffAlpha4(t, p, o) textureGatherOffset(t, p, o, 3)
+        #define FxaaTexGreen4(t, p) textureGather(t, p, 1)
+        #define FxaaTexOffGreen4(t, p, o) textureGatherOffset(t, p, o, 1)
+    #endif
+#endif
+
+
+/*============================================================================
+                   GREEN AS LUMA OPTION SUPPORT FUNCTION
+============================================================================*/
+// Luma of a fetched texel: the alpha channel when FXAA_GREEN_AS_LUMA is 0, otherwise the green channel.
+FxaaFloat FxaaLuma(FxaaFloat4 rgba) {
+    return (FXAA_GREEN_AS_LUMA == 0) ? rgba.w : rgba.y; // macro is a compile-time constant, so one branch is selected
+}
+
+
+
+
+
+/*============================================================================
+
+                             FXAA3 QUALITY - PC
+
+============================================================================*/
+#if (FXAA_PC == 1)
+/*--------------------------------------------------------------------------*/
+FxaaFloat4 FxaaPixelShader(
+    //
+    // Use noperspective interpolation here (turn off perspective interpolation).
+    // {xy} = center of pixel
+    FxaaFloat2 pos,
+    //
+    // Used only for FXAA Console, and not used on the 360 version.
+    // Use noperspective interpolation here (turn off perspective interpolation).
+    // {xy__} = upper left of pixel
+    // {__zw} = lower right of pixel
+    FxaaFloat4 fxaaConsolePosPos,
+    //
+    // Input color texture.
+    // {rgb_} = color in linear or perceptual color space
+    // if (FXAA_GREEN_AS_LUMA == 0)
+    //     {___a} = luma in perceptual color space (not linear)
+    FxaaTex tex,
+    //
+    // Only used on the optimized 360 version of FXAA Console.
+    // For everything but 360, just use the same input here as for "tex".
+    // For 360, same texture, just alias with a 2nd sampler.
+    // This sampler needs to have an exponent bias of -1.
+    FxaaTex fxaaConsole360TexExpBiasNegOne,
+    //
+    // Only used on the optimized 360 version of FXAA Console.
+    // For everything but 360, just use the same input here as for "tex".
+    // For 360, same texture, just alias with a 3nd sampler.
+    // This sampler needs to have an exponent bias of -2.
+    FxaaTex fxaaConsole360TexExpBiasNegTwo,
+    //
+    // Only used on FXAA Quality.
+    // This must be from a constant/uniform.
+    // {x_} = 1.0/screenWidthInPixels
+    // {_y} = 1.0/screenHeightInPixels
+    FxaaFloat2 fxaaQualityRcpFrame,
+    //
+    // Only used on FXAA Console.
+    // This must be from a constant/uniform.
+    // This effects sub-pixel AA quality and inversely sharpness.
+    //   Where N ranges between,
+    //     N = 0.50 (default)
+    //     N = 0.33 (sharper)
+    // {x___} = -N/screenWidthInPixels  
+    // {_y__} = -N/screenHeightInPixels
+    // {__z_} =  N/screenWidthInPixels  
+    // {___w} =  N/screenHeightInPixels 
+    FxaaFloat4 fxaaConsoleRcpFrameOpt,
+    //
+    // Only used on FXAA Console.
+    // Not used on 360, but used on PS3 and PC.
+    // This must be from a constant/uniform.
+    // {x___} = -2.0/screenWidthInPixels  
+    // {_y__} = -2.0/screenHeightInPixels
+    // {__z_} =  2.0/screenWidthInPixels  
+    // {___w} =  2.0/screenHeightInPixels 
+    FxaaFloat4 fxaaConsoleRcpFrameOpt2,
+    //
+    // Only used on FXAA Console.
+    // Only used on 360 in place of fxaaConsoleRcpFrameOpt2.
+    // This must be from a constant/uniform.
+    // {x___} =  8.0/screenWidthInPixels  
+    // {_y__} =  8.0/screenHeightInPixels
+    // {__z_} = -4.0/screenWidthInPixels  
+    // {___w} = -4.0/screenHeightInPixels 
+    FxaaFloat4 fxaaConsole360RcpFrameOpt2,
+    //
+    // Only used on FXAA Quality.
+    // This used to be the FXAA_QUALITY__SUBPIX define.
+    // It is here now to allow easier tuning.
+    // Choose the amount of sub-pixel aliasing removal.
+    // This can effect sharpness.
+    //   1.00 - upper limit (softer)
+    //   0.75 - default amount of filtering
+    //   0.50 - lower limit (sharper, less sub-pixel aliasing removal)
+    //   0.25 - almost off
+    //   0.00 - completely off
+    FxaaFloat fxaaQualitySubpix,
+    //
+    // Only used on FXAA Quality.
+    // This used to be the FXAA_QUALITY__EDGE_THRESHOLD define.
+    // It is here now to allow easier tuning.
+    // The minimum amount of local contrast required to apply algorithm.
+    //   0.333 - too little (faster)
+    //   0.250 - low quality
+    //   0.166 - default
+    //   0.125 - high quality 
+    //   0.063 - overkill (slower)
+    FxaaFloat fxaaQualityEdgeThreshold,
+    //
+    // Only used on FXAA Quality.
+    // This used to be the FXAA_QUALITY__EDGE_THRESHOLD_MIN define.
+    // It is here now to allow easier tuning.
+    // Trims the algorithm from processing darks.
+    //   0.0833 - upper limit (default, the start of visible unfiltered edges)
+    //   0.0625 - high quality (faster)
+    //   0.0312 - visible limit (slower)
+    // Special notes when using FXAA_GREEN_AS_LUMA,
+    //   Likely want to set this to zero.
+    //   As colors that are mostly not-green
+    //   will appear very dark in the green channel!
+    //   Tune by looking at mostly non-green content,
+    //   then start at zero and increase until aliasing is a problem.
+    FxaaFloat fxaaQualityEdgeThresholdMin,
+    // 
+    // Only used on FXAA Console.
+    // This used to be the FXAA_CONSOLE__EDGE_SHARPNESS define.
+    // It is here now to allow easier tuning.
+    // This does not effect PS3, as this needs to be compiled in.
+    //   Use FXAA_CONSOLE__PS3_EDGE_SHARPNESS for PS3.
+    //   Due to the PS3 being ALU bound,
+    //   there are only three safe values here: 2 and 4 and 8.
+    //   These options use the shaders ability to a free *|/ by 2|4|8.
+    // For all other platforms can be a non-power of two.
+    //   8.0 is sharper (default!!!)
+    //   4.0 is softer
+    //   2.0 is really soft (good only for vector graphics inputs)
+    FxaaFloat fxaaConsoleEdgeSharpness,
+    //
+    // Only used on FXAA Console.
+    // This used to be the FXAA_CONSOLE__EDGE_THRESHOLD define.
+    // It is here now to allow easier tuning.
+    // This does not effect PS3, as this needs to be compiled in.
+    //   Use FXAA_CONSOLE__PS3_EDGE_THRESHOLD for PS3.
+    //   Due to the PS3 being ALU bound,
+    //   there are only two safe values here: 1/4 and 1/8.
+    //   These options use the shaders ability to a free *|/ by 2|4|8.
+    // The console setting has a different mapping than the quality setting.
+    // Other platforms can use other values.
+    //   0.125 leaves less aliasing, but is softer (default!!!)
+    //   0.25 leaves more aliasing, and is sharper
+    FxaaFloat fxaaConsoleEdgeThreshold,
+    //
+    // Only used on FXAA Console.
+    // This used to be the FXAA_CONSOLE__EDGE_THRESHOLD_MIN define.
+    // It is here now to allow easier tuning.
+    // Trims the algorithm from processing darks.
+    // The console setting has a different mapping than the quality setting.
+    // This only applies when FXAA_EARLY_EXIT is 1.
+    // This does not apply to PS3, 
+    // PS3 was simplified to avoid more shader instructions.
+    //   0.06 - faster but more aliasing in darks
+    //   0.05 - default
+    //   0.04 - slower and less aliasing in darks
+    // Special notes when using FXAA_GREEN_AS_LUMA,
+    //   Likely want to set this to zero.
+    //   As colors that are mostly not-green
+    //   will appear very dark in the green channel!
+    //   Tune by looking at mostly non-green content,
+    //   then start at zero and increase until aliasing is a problem.
+    FxaaFloat fxaaConsoleEdgeThresholdMin,
+    //    
+    // Extra constants for 360 FXAA Console only.
+    // Use zeros or anything else for other platforms.
+    // These must be in physical constant registers and NOT immedates.
+    // Immedates will result in compiler un-optimizing.
+    // {xyzw} = float4(1.0, -1.0, 0.25, -0.25)
+    FxaaFloat4 fxaaConsole360ConstDir
+) {
+/*--------------------------------------------------------------------------*/
+    FxaaFloat2 posM;
+    posM.x = pos.x;
+    posM.y = pos.y;
+    #if (FXAA_GATHER4_ALPHA == 1)
+        #if (FXAA_DISCARD == 0)
+            FxaaFloat4 rgbyM = FxaaTexTop(tex, posM);
+            #if (FXAA_GREEN_AS_LUMA == 0)
+                #define lumaM rgbyM.w
+            #else
+                #define lumaM rgbyM.y
+            #endif
+        #endif
+        #if (FXAA_GREEN_AS_LUMA == 0)
+            FxaaFloat4 luma4A = FxaaTexAlpha4(tex, posM);
+            FxaaFloat4 luma4B = FxaaTexOffAlpha4(tex, posM, FxaaInt2(-1, -1));
+        #else
+            FxaaFloat4 luma4A = FxaaTexGreen4(tex, posM);
+            FxaaFloat4 luma4B = FxaaTexOffGreen4(tex, posM, FxaaInt2(-1, -1));
+        #endif
+        #if (FXAA_DISCARD == 1)
+            #define lumaM luma4A.w
+        #endif
+        #define lumaE luma4A.z
+        #define lumaS luma4A.x
+        #define lumaSE luma4A.y
+        #define lumaNW luma4B.w
+        #define lumaN luma4B.z
+        #define lumaW luma4B.x
+    #else
+        FxaaFloat4 rgbyM = FxaaTexTop(tex, posM);
+        #if (FXAA_GREEN_AS_LUMA == 0)
+            #define lumaM rgbyM.w
+        #else
+            #define lumaM rgbyM.y
+        #endif
+        FxaaFloat lumaS = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 0, 1), fxaaQualityRcpFrame.xy));
+        FxaaFloat lumaE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 1, 0), fxaaQualityRcpFrame.xy));
+        FxaaFloat lumaN = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 0,-1), fxaaQualityRcpFrame.xy));
+        FxaaFloat lumaW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1, 0), fxaaQualityRcpFrame.xy));
+    #endif
+/*--------------------------------------------------------------------------*/
+    FxaaFloat maxSM = max(lumaS, lumaM);
+    FxaaFloat minSM = min(lumaS, lumaM);
+    FxaaFloat maxESM = max(lumaE, maxSM);
+    FxaaFloat minESM = min(lumaE, minSM);
+    FxaaFloat maxWN = max(lumaN, lumaW);
+    FxaaFloat minWN = min(lumaN, lumaW);
+    FxaaFloat rangeMax = max(maxWN, maxESM);
+    FxaaFloat rangeMin = min(minWN, minESM);
+    FxaaFloat rangeMaxScaled = rangeMax * fxaaQualityEdgeThreshold;
+    FxaaFloat range = rangeMax - rangeMin;
+    FxaaFloat rangeMaxClamped = max(fxaaQualityEdgeThresholdMin, rangeMaxScaled);
+    FxaaBool earlyExit = range < rangeMaxClamped;
+/*--------------------------------------------------------------------------*/
+    if(earlyExit)
+        #if (FXAA_DISCARD == 1)
+            FxaaDiscard;
+        #else
+            return rgbyM;
+        #endif
+/*--------------------------------------------------------------------------*/
+    #if (FXAA_GATHER4_ALPHA == 0)
+        FxaaFloat lumaNW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1,-1), fxaaQualityRcpFrame.xy));
+        FxaaFloat lumaSE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 1, 1), fxaaQualityRcpFrame.xy));
+        FxaaFloat lumaNE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 1,-1), fxaaQualityRcpFrame.xy));
+        FxaaFloat lumaSW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1, 1), fxaaQualityRcpFrame.xy));
+    #else
+        FxaaFloat lumaNE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(1, -1), fxaaQualityRcpFrame.xy));
+        FxaaFloat lumaSW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1, 1), fxaaQualityRcpFrame.xy));
+    #endif
+/*--------------------------------------------------------------------------*/
+    FxaaFloat lumaNS = lumaN + lumaS;
+    FxaaFloat lumaWE = lumaW + lumaE;
+    FxaaFloat subpixRcpRange = 1.0/range;
+    FxaaFloat subpixNSWE = lumaNS + lumaWE;
+    FxaaFloat edgeHorz1 = (-2.0 * lumaM) + lumaNS;
+    FxaaFloat edgeVert1 = (-2.0 * lumaM) + lumaWE;
+/*--------------------------------------------------------------------------*/
+    FxaaFloat lumaNESE = lumaNE + lumaSE;
+    FxaaFloat lumaNWNE = lumaNW + lumaNE;
+    FxaaFloat edgeHorz2 = (-2.0 * lumaE) + lumaNESE;
+    FxaaFloat edgeVert2 = (-2.0 * lumaN) + lumaNWNE;
+/*--------------------------------------------------------------------------*/
+    FxaaFloat lumaNWSW = lumaNW + lumaSW;
+    FxaaFloat lumaSWSE = lumaSW + lumaSE;
+    FxaaFloat edgeHorz4 = (abs(edgeHorz1) * 2.0) + abs(edgeHorz2);
+    FxaaFloat edgeVert4 = (abs(edgeVert1) * 2.0) + abs(edgeVert2);
+    FxaaFloat edgeHorz3 = (-2.0 * lumaW) + lumaNWSW;
+    FxaaFloat edgeVert3 = (-2.0 * lumaS) + lumaSWSE;
+    FxaaFloat edgeHorz = abs(edgeHorz3) + edgeHorz4;
+    FxaaFloat edgeVert = abs(edgeVert3) + edgeVert4;
+/*--------------------------------------------------------------------------*/
+    FxaaFloat subpixNWSWNESE = lumaNWSW + lumaNESE;
+    FxaaFloat lengthSign = fxaaQualityRcpFrame.x;
+    FxaaBool horzSpan = edgeHorz >= edgeVert;
+    FxaaFloat subpixA = subpixNSWE * 2.0 + subpixNWSWNESE;
+/*--------------------------------------------------------------------------*/
+    if(!horzSpan) lumaN = lumaW;
+    if(!horzSpan) lumaS = lumaE;
+    if(horzSpan) lengthSign = fxaaQualityRcpFrame.y;
+    FxaaFloat subpixB = (subpixA * (1.0/12.0)) - lumaM;
+/*--------------------------------------------------------------------------*/
+    FxaaFloat gradientN = lumaN - lumaM;
+    FxaaFloat gradientS = lumaS - lumaM;
+    FxaaFloat lumaNN = lumaN + lumaM;
+    FxaaFloat lumaSS = lumaS + lumaM;
+    FxaaBool pairN = abs(gradientN) >= abs(gradientS);
+    FxaaFloat gradient = max(abs(gradientN), abs(gradientS));
+    if(pairN) lengthSign = -lengthSign;
+    FxaaFloat subpixC = FxaaSat(abs(subpixB) * subpixRcpRange);
+/*--------------------------------------------------------------------------*/
+    FxaaFloat2 posB;
+    posB.x = posM.x;
+    posB.y = posM.y;
+    FxaaFloat2 offNP;
+    offNP.x = (!horzSpan) ? 0.0 : fxaaQualityRcpFrame.x;
+    offNP.y = ( horzSpan) ? 0.0 : fxaaQualityRcpFrame.y;
+    if(!horzSpan) posB.x += lengthSign * 0.5;
+    if( horzSpan) posB.y += lengthSign * 0.5;
+/*--------------------------------------------------------------------------*/
+    FxaaFloat2 posN;
+    posN.x = posB.x - offNP.x * FXAA_QUALITY__P0;
+    posN.y = posB.y - offNP.y * FXAA_QUALITY__P0;
+    FxaaFloat2 posP;
+    posP.x = posB.x + offNP.x * FXAA_QUALITY__P0;
+    posP.y = posB.y + offNP.y * FXAA_QUALITY__P0;
+    FxaaFloat subpixD = ((-2.0)*subpixC) + 3.0;
+    FxaaFloat lumaEndN = FxaaLuma(FxaaTexTop(tex, posN));
+    FxaaFloat subpixE = subpixC * subpixC;
+    FxaaFloat lumaEndP = FxaaLuma(FxaaTexTop(tex, posP));
+/*--------------------------------------------------------------------------*/
+    if(!pairN) lumaNN = lumaSS;
+    FxaaFloat gradientScaled = gradient * 1.0/4.0;
+    FxaaFloat lumaMM = lumaM - lumaNN * 0.5;
+    FxaaFloat subpixF = subpixD * subpixE;
+    FxaaBool lumaMLTZero = lumaMM < 0.0;
+/*--------------------------------------------------------------------------*/
+    lumaEndN -= lumaNN * 0.5;
+    lumaEndP -= lumaNN * 0.5;
+    FxaaBool doneN = abs(lumaEndN) >= gradientScaled;
+    FxaaBool doneP = abs(lumaEndP) >= gradientScaled;
+    if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P1;
+    if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P1;
+    FxaaBool doneNP = (!doneN) || (!doneP);
+    if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P1;
+    if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P1;
+/*--------------------------------------------------------------------------*/
+    if(doneNP) {
+        if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+        if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+        if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+        if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+        doneN = abs(lumaEndN) >= gradientScaled;
+        doneP = abs(lumaEndP) >= gradientScaled;
+        if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P2;
+        if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P2;
+        doneNP = (!doneN) || (!doneP);
+        if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P2;
+        if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P2;
+/*--------------------------------------------------------------------------*/
+        #if (FXAA_QUALITY__PS > 3)
+        if(doneNP) {
+            if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+            if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+            if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+            if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+            doneN = abs(lumaEndN) >= gradientScaled;
+            doneP = abs(lumaEndP) >= gradientScaled;
+            if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P3;
+            if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P3;
+            doneNP = (!doneN) || (!doneP);
+            if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P3;
+            if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P3;
+/*--------------------------------------------------------------------------*/
+            #if (FXAA_QUALITY__PS > 4)
+            if(doneNP) {
+                if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+                if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+                if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+                if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+                doneN = abs(lumaEndN) >= gradientScaled;
+                doneP = abs(lumaEndP) >= gradientScaled;
+                if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P4;
+                if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P4;
+                doneNP = (!doneN) || (!doneP);
+                if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P4;
+                if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P4;
+/*--------------------------------------------------------------------------*/
+                #if (FXAA_QUALITY__PS > 5)
+                if(doneNP) {
+                    if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+                    if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+                    if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+                    if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+                    doneN = abs(lumaEndN) >= gradientScaled;
+                    doneP = abs(lumaEndP) >= gradientScaled;
+                    if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P5;
+                    if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P5;
+                    doneNP = (!doneN) || (!doneP);
+                    if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P5;
+                    if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P5;
+/*--------------------------------------------------------------------------*/
+                    #if (FXAA_QUALITY__PS > 6)
+                    if(doneNP) {
+                        if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+                        if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+                        if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+                        if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+                        doneN = abs(lumaEndN) >= gradientScaled;
+                        doneP = abs(lumaEndP) >= gradientScaled;
+                        if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P6;
+                        if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P6;
+                        doneNP = (!doneN) || (!doneP);
+                        if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P6;
+                        if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P6;
+/*--------------------------------------------------------------------------*/
+                        #if (FXAA_QUALITY__PS > 7)
+                        if(doneNP) {
+                            if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+                            if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+                            if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+                            if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+                            doneN = abs(lumaEndN) >= gradientScaled;
+                            doneP = abs(lumaEndP) >= gradientScaled;
+                            if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P7;
+                            if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P7;
+                            doneNP = (!doneN) || (!doneP);
+                            if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P7;
+                            if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P7;
+/*--------------------------------------------------------------------------*/
+    #if (FXAA_QUALITY__PS > 8)
+    if(doneNP) {
+        if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+        if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+        if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+        if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+        doneN = abs(lumaEndN) >= gradientScaled;
+        doneP = abs(lumaEndP) >= gradientScaled;
+        if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P8;
+        if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P8;
+        doneNP = (!doneN) || (!doneP);
+        if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P8;
+        if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P8;
+/*--------------------------------------------------------------------------*/
+        #if (FXAA_QUALITY__PS > 9)
+        if(doneNP) {
+            if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+            if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+            if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+            if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+            doneN = abs(lumaEndN) >= gradientScaled;
+            doneP = abs(lumaEndP) >= gradientScaled;
+            if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P9;
+            if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P9;
+            doneNP = (!doneN) || (!doneP);
+            if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P9;
+            if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P9;
+/*--------------------------------------------------------------------------*/
+            #if (FXAA_QUALITY__PS > 10)
+            if(doneNP) {
+                if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+                if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+                if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+                if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+                doneN = abs(lumaEndN) >= gradientScaled;
+                doneP = abs(lumaEndP) >= gradientScaled;
+                if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P10;
+                if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P10;
+                doneNP = (!doneN) || (!doneP);
+                if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P10;
+                if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P10;
+/*--------------------------------------------------------------------------*/
+                #if (FXAA_QUALITY__PS > 11)
+                if(doneNP) {
+                    if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+                    if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+                    if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+                    if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+                    doneN = abs(lumaEndN) >= gradientScaled;
+                    doneP = abs(lumaEndP) >= gradientScaled;
+                    if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P11;
+                    if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P11;
+                    doneNP = (!doneN) || (!doneP);
+                    if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P11;
+                    if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P11;
+/*--------------------------------------------------------------------------*/
+                    #if (FXAA_QUALITY__PS > 12)
+                    if(doneNP) {
+                        if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+                        if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+                        if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+                        if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+                        doneN = abs(lumaEndN) >= gradientScaled;
+                        doneP = abs(lumaEndP) >= gradientScaled;
+                        if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P12;
+                        if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P12;
+                        doneNP = (!doneN) || (!doneP);
+                        if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P12;
+                        if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P12;
+/*--------------------------------------------------------------------------*/
+                    }
+                    #endif
+/*--------------------------------------------------------------------------*/
+                }
+                #endif
+/*--------------------------------------------------------------------------*/
+            }
+            #endif
+/*--------------------------------------------------------------------------*/
+        }
+        #endif
+/*--------------------------------------------------------------------------*/
+    }
+    #endif
+/*--------------------------------------------------------------------------*/
+                        }
+                        #endif
+/*--------------------------------------------------------------------------*/
+                    }
+                    #endif
+/*--------------------------------------------------------------------------*/
+                }
+                #endif
+/*--------------------------------------------------------------------------*/
+            }
+            #endif
+/*--------------------------------------------------------------------------*/
+        }
+        #endif
+/*--------------------------------------------------------------------------*/
+    }
+/*--------------------------------------------------------------------------*/
+    FxaaFloat dstN = posM.x - posN.x;
+    FxaaFloat dstP = posP.x - posM.x;
+    if(!horzSpan) dstN = posM.y - posN.y;
+    if(!horzSpan) dstP = posP.y - posM.y;
+/*--------------------------------------------------------------------------*/
+    FxaaBool goodSpanN = (lumaEndN < 0.0) != lumaMLTZero;
+    FxaaFloat spanLength = (dstP + dstN);
+    FxaaBool goodSpanP = (lumaEndP < 0.0) != lumaMLTZero;
+    FxaaFloat spanLengthRcp = 1.0/spanLength;
+/*--------------------------------------------------------------------------*/
+    FxaaBool directionN = dstN < dstP;
+    FxaaFloat dst = min(dstN, dstP);
+    FxaaBool goodSpan = directionN ? goodSpanN : goodSpanP;
+    FxaaFloat subpixG = subpixF * subpixF;
+    FxaaFloat pixelOffset = (dst * (-spanLengthRcp)) + 0.5;
+    FxaaFloat subpixH = subpixG * fxaaQualitySubpix;
+/*--------------------------------------------------------------------------*/
+    FxaaFloat pixelOffsetGood = goodSpan ? pixelOffset : 0.0;
+    FxaaFloat pixelOffsetSubpix = max(pixelOffsetGood, subpixH);
+    if(!horzSpan) posM.x += pixelOffsetSubpix * lengthSign;
+    if( horzSpan) posM.y += pixelOffsetSubpix * lengthSign;
+    #if (FXAA_DISCARD == 1)
+        return FxaaTexTop(tex, posM);
+    #else
+        return FxaaFloat4(FxaaTexTop(tex, posM).xyz, lumaM);
+    #endif
+}
+/*==========================================================================*/
+#endif
+
+vec4 mainImage(vec2 fragCoord)
+{
+    // NOTE(review): invResolution is used as a divisor here, so despite its name it
+    // presumably holds the frame size in pixels - confirm against the host code.
+    vec2 texelSize = vec2(1.0) / invResolution.xy;
+    vec2 uv = fragCoord / invResolution.xy;
+
+    // FXAA tuning values handed to FxaaPixelShader.
+    float subpixAmount = 0.75;      // [0..1], default 0.75
+    float edgeThreshold = 0.166;    // [0.125..0.33], default 0.166
+    float edgeThresholdMin = 0.02;  // original note: "0.0625 ?"
+
+    // Zero placeholders for the remaining FxaaPixelShader arguments.
+    vec4 unusedVec4 = vec4(0.0);
+    float unusedFloat = 0.0;
+    vec4 filtered = FxaaPixelShader(uv, unusedVec4, inputTexture, inputTexture, inputTexture,
+                                    texelSize, unusedVec4, unusedVec4, unusedVec4,
+                                    subpixAmount, edgeThreshold, edgeThresholdMin,
+                                    unusedFloat, unusedFloat, unusedFloat, unusedVec4);
+    // Keep the filtered RGB and set alpha to 1.0.
+    return vec4(filtered.xyz, 1.0);
+}
+
+void main()
+{
+    // Each invocation filters one 4x4 texel tile anchored at 4 * its global ID.
+    ivec2 tileOrigin = ivec2(gl_GlobalInvocationID.xy) * 4;
+    for (int dx = 0; dx < 4; dx++)
+    {
+        for (int dy = 0; dy < 4; dy++)
+        {
+            ivec2 texel = tileOrigin + ivec2(dx, dy);
+            imageStore(imgOutput, texel, mainImage(vec2(texel) + vec2(0.5))); // +0.5 offsets to the texel centre
+        }
+    }
+}
diff --git a/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa.hlsl b/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa.hlsl
new file mode 100644
index 0000000000..2201f78c1a
--- /dev/null
+++ b/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa.hlsl
@@ -0,0 +1,1361 @@
+/**
+ * Copyright (C) 2013 Jorge Jimenez (jorge@iryoku.com)
+ * Copyright (C) 2013 Jose I. Echevarria (joseignacioechevarria@gmail.com)
+ * Copyright (C) 2013 Belen Masia (bmasia@unizar.es)
+ * Copyright (C) 2013 Fernando Navarro (fernandn@microsoft.com)
+ * Copyright (C) 2013 Diego Gutierrez (diegog@unizar.es)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to
+ * do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software. As clarification, there
+ * is no requirement that the copyright notice and permission be included in
+ * binary distributions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+/**
+ *                  _______  ___  ___       ___           ___
+ *                 /       ||   \/   |     /   \         /   \
+ *                |   (---- |  \  /  |    /  ^  \       /  ^  \
+ *                 \   \    |  |\/|  |   /  /_\  \     /  /_\  \
+ *              ----)   |   |  |  |  |  /  _____  \   /  _____  \
+ *             |_______/    |__|  |__| /__/     \__\ /__/     \__\
+ * 
+ *                               E N H A N C E D
+ *       S U B P I X E L   M O R P H O L O G I C A L   A N T I A L I A S I N G
+ *
+ *                         http://www.iryoku.com/smaa/
+ *
+ * Hi, welcome aboard!
+ * 
+ * Here you'll find instructions to get the shader up and running as fast as
+ * possible.
+ *
+ * IMPORTANT NOTICE: when updating, remember to update both this file and the
+ * precomputed textures! They may change from version to version.
+ *
+ * The shader has three passes, chained together as follows:
+ *
+ *                           |input|------------------+
+ *                              v                     |
+ *                    [ SMAA*EdgeDetection ]          |
+ *                              v                     |
+ *                          |edgesTex|                |
+ *                              v                     |
+ *              [ SMAABlendingWeightCalculation ]     |
+ *                              v                     |
+ *                          |blendTex|                |
+ *                              v                     |
+ *                [ SMAANeighborhoodBlending ] <------+
+ *                              v
+ *                           |output|
+ *
+ * Note that each [pass] has its own vertex and pixel shader. Remember to use
+ * oversized triangles instead of quads to avoid overshading along the
+ * diagonal.
+ *
+ * You've three edge detection methods to choose from: luma, color or depth.
+ * They represent different quality/performance and anti-aliasing/sharpness
+ * tradeoffs, so our recommendation is for you to choose the one that best
+ * suits your particular scenario:
+ *
+ * - Depth edge detection is usually the fastest but it may miss some edges.
+ *
+ * - Luma edge detection is usually more expensive than depth edge detection,
+ *   but catches visible edges that depth edge detection can miss.
+ *
+ * - Color edge detection is usually the most expensive one but catches
+ *   chroma-only edges.
+ *
+ * For quickstarters: just use luma edge detection.
+ *
+ * The general advice is to not rush the integration process and ensure each
+ * step is done correctly (don't try to integrate SMAA T2x with predicated edge
+ * detection from the start!). Ok then, let's go!
+ *
+ *  1. The first step is to create two RGBA temporal render targets for holding
+ *     |edgesTex| and |blendTex|.
+ *
+ *     In DX10 or DX11, you can use a RG render target for the edges texture.
+ *     In the case of NVIDIA GPUs, using RG render targets seems to actually be
+ *     slower.
+ *
+ *     On the Xbox 360, you can use the same render target for resolving both
+ *     |edgesTex| and |blendTex|, as they aren't needed simultaneously.
+ *
+ *  2. Both temporal render targets |edgesTex| and |blendTex| must be cleared
+ *     each frame. Do not forget to clear the alpha channel!
+ *
+ *  3. The next step is loading the two supporting precalculated textures,
+ *     'areaTex' and 'searchTex'. You'll find them in the 'Textures' folder as
+ *     C++ headers, and also as regular DDS files. They'll be needed for the
+ *     'SMAABlendingWeightCalculation' pass.
+ *
+ *     If you use the C++ headers, be sure to load them in the format specified
+ *     inside of them.
+ *
+ *     You can also compress 'areaTex' and 'searchTex' using BC5 and BC4
+ *     respectively, if you have that option in your content processor pipeline.
+ *     When compressing them, you get a non-perceptible quality decrease, and a
+ *     marginal performance increase.
+ *
+ *  4. All samplers must be set to linear filtering and clamp.
+ *
+ *     After you get the technique working, remember that 64-bit inputs have
+ *     half-rate linear filtering on GCN.
+ *
+ *     If SMAA is applied to 64-bit color buffers, switching to point filtering
+ *     when accessing them will increase the performance. Search for
+ *     'SMAASamplePoint' to see which textures may benefit from point
+ *     filtering, and where (which is basically the color input in the edge
+ *     detection and resolve passes).
+ *
+ *  5. All texture reads and buffer writes must be non-sRGB, with the exception
+ *     of the input read and the output write in
+ *     'SMAANeighborhoodBlending' (and only in this pass!). If sRGB reads in
+ *     this last pass are not possible, the technique will work anyway, but
+ *     will perform antialiasing in gamma space.
+ *
+ *     IMPORTANT: for best results the input read for the color/luma edge 
+ *     detection should *NOT* be sRGB.
+ *
+ *  6. Before including SMAA.h you'll have to setup the render target metrics,
+ *     the target and any optional configuration defines. Optionally you can
+ *     use a preset.
+ *
+ *     You have the following targets available: 
+ *         SMAA_HLSL_3
+ *         SMAA_HLSL_4
+ *         SMAA_HLSL_4_1
+ *         SMAA_GLSL_3 *
+ *         SMAA_GLSL_4 *
+ *
+ *         * (See SMAA_INCLUDE_VS and SMAA_INCLUDE_PS below).
+ *
+ *     And four presets:
+ *         SMAA_PRESET_LOW          (%60 of the quality)
+ *         SMAA_PRESET_MEDIUM       (%80 of the quality)
+ *         SMAA_PRESET_HIGH         (%95 of the quality)
+ *         SMAA_PRESET_ULTRA        (%99 of the quality)
+ *
+ *     For example:
+ *         #define SMAA_RT_METRICS float4(1.0 / 1280.0, 1.0 / 720.0, 1280.0, 720.0)
+ *         #define SMAA_HLSL_4
+ *         #define SMAA_PRESET_HIGH
+ *         #include "SMAA.h"
+ *
+ *     Note that SMAA_RT_METRICS doesn't need to be a macro, it can be a
+ *     uniform variable. The code is designed to minimize the impact of not
+ *     using a constant value, but it is still better to hardcode it.
+ *
+ *     Depending on how you encoded 'areaTex' and 'searchTex', you may have to
+ *     add (and customize) the following defines before including SMAA.h:
+ *          #define SMAA_AREATEX_SELECT(sample) sample.rg
+ *          #define SMAA_SEARCHTEX_SELECT(sample) sample.r
+ *
+ *     If your engine is already using porting macros, you can define
+ *     SMAA_CUSTOM_SL, and define the porting functions by yourself.
+ *
+ *  7. Then, you'll have to setup the passes as indicated in the scheme above.
+ *     You can take a look into SMAA.fx, to see how we did it for our demo.
+ *     Checkout the function wrappers, you may want to copy-paste them!
+ *
+ *  8. It's recommended to validate the produced |edgesTex| and |blendTex|.
+ *     You can use a screenshot from your engine to compare the |edgesTex|
+ *     and |blendTex| produced inside of the engine with the results obtained
+ *     with the reference demo.
+ *
+ *  9. After you get the last pass to work, it's time to optimize. You'll have
+ *     to initialize a stencil buffer in the first pass (discard is already in
+ *     the code), then mask execution by using it in the second pass. The last
+ *     pass should be executed in all pixels.
+ *
+ *
+ * After this point you can choose to enable predicated thresholding,
+ * temporal supersampling and motion blur integration:
+ *
+ * a) If you want to use predicated thresholding, take a look into
+ *    SMAA_PREDICATION; you'll need to pass an extra texture in the edge
+ *    detection pass.
+ *
+ * b) If you want to enable temporal supersampling (SMAA T2x):
+ *
+ * 1. The first step is to render using subpixel jitters. I won't go into
+ *    detail, but it's as simple as moving each vertex position in the
+ *    vertex shader, you can check how we do it in our DX10 demo.
+ *
+ * 2. Then, you must setup the temporal resolve. You may want to take a look
+ *    into SMAAResolve for resolving 2x modes. After you get it working, you'll
+ *    probably see ghosting everywhere. But fear not, you can enable the
+ *    CryENGINE temporal reprojection by setting the SMAA_REPROJECTION macro.
+ *    Check out SMAA_DECODE_VELOCITY if your velocity buffer is encoded.
+ *
+ * 3. The next step is to apply SMAA to each subpixel jittered frame, just as
+ *    done for 1x.
+ *
+ * 4. At this point you should already have something usable, but for best
+ *    results the proper area textures must be set depending on current jitter.
+ *    For this, the parameter 'subsampleIndices' of
+ *    'SMAABlendingWeightCalculationPS' must be set as follows, for our T2x
+ *    mode:
+ *
+ *    @SUBSAMPLE_INDICES
+ *
+ *    | S# |  Camera Jitter   |  subsampleIndices    |
+ *    +----+------------------+---------------------+
+ *    |  0 |  ( 0.25, -0.25)  |  float4(1, 1, 1, 0)  |
+ *    |  1 |  (-0.25,  0.25)  |  float4(2, 2, 2, 0)  |
+ *
+ *    These jitter positions assume a bottom-to-top y axis. S# stands for the
+ *    sample number.
+ *
+ * More information about temporal supersampling here:
+ *    http://iryoku.com/aacourse/downloads/13-Anti-Aliasing-Methods-in-CryENGINE-3.pdf
+ *
+ * c) If you want to enable spatial multisampling (SMAA S2x):
+ *
+ * 1. The scene must be rendered using MSAA 2x. The MSAA 2x buffer must be
+ *    created with:
+ *      - DX10:     see below (*)
+ *      - DX10.1:   D3D10_STANDARD_MULTISAMPLE_PATTERN or
+ *      - DX11:     D3D11_STANDARD_MULTISAMPLE_PATTERN
+ *
+ *    This allows to ensure that the subsample order matches the table in
+ *    @SUBSAMPLE_INDICES.
+ *
+ *    (*) In the case of DX10, we refer the reader to:
+ *      - SMAA::detectMSAAOrder and
+ *      - SMAA::msaaReorder
+ *
+ *    These functions allow to match the standard multisample patterns by
+ *    detecting the subsample order for a specific GPU, and reordering
+ *    them appropriately.
+ *
+ * 2. A shader must be run to output each subsample into a separate buffer
+ *    (DX10 is required). You can use SMAASeparate for this purpose, or just do
+ *    it in an existing pass (for example, in the tone mapping pass, which has
+ *    the advantage of feeding tone mapped subsamples to SMAA, which will yield
+ *    better results).
+ *
+ * 3. The full SMAA 1x pipeline must be run for each separated buffer, storing
+ *    the results in the final buffer. The second run should alpha blend with
+ *    the existing final buffer using a blending factor of 0.5.
+ *    'subsampleIndices' must be adjusted as in the SMAA T2x case (see point
+ *    b).
+ *
+ * d) If you want to enable temporal supersampling on top of SMAA S2x
+ *    (which actually is SMAA 4x):
+ *
+ * 1. SMAA 4x consists of temporally jittering SMAA S2x, so the first step is
+ *    to calculate SMAA S2x for current frame. In this case, 'subsampleIndices'
+ *    must be set as follows:
+ *
+ *    | F# | S# |   Camera Jitter    |    Net Jitter     |   subsampleIndices   |
+ *    +----+----+--------------------+-------------------+----------------------+
+ *    |  0 |  0 |  ( 0.125,  0.125)  |  ( 0.375, -0.125) |  float4(5, 3, 1, 3)  |
+ *    |  0 |  1 |  ( 0.125,  0.125)  |  (-0.125,  0.375) |  float4(4, 6, 2, 3)  |
+ *    +----+----+--------------------+-------------------+----------------------+
+ *    |  1 |  2 |  (-0.125, -0.125)  |  ( 0.125, -0.375) |  float4(3, 5, 1, 4)  |
+ *    |  1 |  3 |  (-0.125, -0.125)  |  (-0.375,  0.125) |  float4(6, 4, 2, 4)  |
+ *
+ *    These jitter positions assume a bottom-to-top y axis. F# stands for the
+ *    frame number. S# stands for the sample number.
+ *
+ * 2. After calculating SMAA S2x for current frame (with the new subsample
+ *    indices), previous frame must be reprojected as in SMAA T2x mode (see
+ *    point b).
+ *
+ * e) If motion blur is used, you may want to do the edge detection pass
+ *    together with motion blur. This has two advantages:
+ *
+ * 1. Pixels under heavy motion can be omitted from the edge detection process.
+ *    For these pixels we can just store "no edge", as motion blur will take
+ *    care of them.
+ * 2. The center pixel tap is reused.
+ *
+ * Note that in this case depth testing should be used instead of stenciling,
+ * as we have to write all the pixels in the motion blur pass.
+ *
+ * That's it!
+ */
+
+//-----------------------------------------------------------------------------
+// SMAA Presets
+
+/**
+ * Note that if you use one of these presets, the following configuration
+ * macros will be ignored if set in the "Configurable Defines" section.
+ */
+
+#if defined(SMAA_PRESET_LOW)
+#define SMAA_THRESHOLD 0.15
+#define SMAA_MAX_SEARCH_STEPS 4
+#define SMAA_DISABLE_DIAG_DETECTION
+#define SMAA_DISABLE_CORNER_DETECTION
+#elif defined(SMAA_PRESET_MEDIUM)
+#define SMAA_THRESHOLD 0.1
+#define SMAA_MAX_SEARCH_STEPS 8
+#define SMAA_DISABLE_DIAG_DETECTION
+#define SMAA_DISABLE_CORNER_DETECTION
+#elif defined(SMAA_PRESET_HIGH)
+#define SMAA_THRESHOLD 0.1
+#define SMAA_MAX_SEARCH_STEPS 16
+#define SMAA_MAX_SEARCH_STEPS_DIAG 8
+#define SMAA_CORNER_ROUNDING 25
+#elif defined(SMAA_PRESET_ULTRA)
+#define SMAA_THRESHOLD 0.05
+#define SMAA_MAX_SEARCH_STEPS 32
+#define SMAA_MAX_SEARCH_STEPS_DIAG 16
+#define SMAA_CORNER_ROUNDING 25
+#endif
+
+//-----------------------------------------------------------------------------
+// Configurable Defines
+
+/**
+ * SMAA_THRESHOLD specifies the threshold or sensitivity to edges.
+ * Lowering this value you will be able to detect more edges at the expense of
+ * performance. 
+ *
+ * Range: [0, 0.5]
+ *   0.1 is a reasonable value, and allows to catch most visible edges.
+ *   0.05 is a rather overkill value, that allows to catch 'em all.
+ *
+ *   If temporal supersampling is used, 0.2 could be a reasonable value, as low
+ *   contrast edges are properly filtered by just 2x.
+ */
+#ifndef SMAA_THRESHOLD
+#define SMAA_THRESHOLD 0.1
+#endif
+
+/**
+ * SMAA_DEPTH_THRESHOLD specifies the threshold for depth edge detection.
+ * 
+ * Range: depends on the depth range of the scene.
+ */
+#ifndef SMAA_DEPTH_THRESHOLD
+#define SMAA_DEPTH_THRESHOLD (0.1 * SMAA_THRESHOLD)
+#endif
+
+/**
+ * SMAA_MAX_SEARCH_STEPS specifies the maximum steps performed in the
+ * horizontal/vertical pattern searches, at each side of the pixel.
+ *
+ * Each step covers two pixels, so the reach in pixels is double the step
+ * count per side: with 16 steps, for example, lines up to 64 pixels long are
+ * handled perfectly (longer lines are still antialiased, just not as well).
+ *
+ * Range: [0, 112]
+ */
+#ifndef SMAA_MAX_SEARCH_STEPS
+#define SMAA_MAX_SEARCH_STEPS 16
+#endif
+
+/**
+ * SMAA_MAX_SEARCH_STEPS_DIAG specifies the maximum steps performed in the
+ * diagonal pattern searches, at each side of the pixel. In this case we jump
+ * one pixel at time, instead of two.
+ *
+ * Range: [0, 20]
+ *
+ * On high-end machines it is cheap (between a 0.8x and 0.9x slower for 16 
+ * steps), but it can have a significant impact on older machines.
+ *
+ * Define SMAA_DISABLE_DIAG_DETECTION to disable diagonal processing.
+ */
+#ifndef SMAA_MAX_SEARCH_STEPS_DIAG
+#define SMAA_MAX_SEARCH_STEPS_DIAG 8
+#endif
+
+/**
+ * SMAA_CORNER_ROUNDING specifies how much sharp corners will be rounded.
+ *
+ * Range: [0, 100]
+ *
+ * Define SMAA_DISABLE_CORNER_DETECTION to disable corner processing.
+ */
+#ifndef SMAA_CORNER_ROUNDING
+#define SMAA_CORNER_ROUNDING 25
+#endif
+
+/**
+ * If there is a neighbor edge that has SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR
+ * times bigger contrast than current edge, current edge will be discarded.
+ *
+ * This eliminates spurious crossing edges, and is based on the fact that
+ * strong contrast in one direction perceptually hides the contrast of the
+ * other neighbors.
+ */
+#ifndef SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR
+#define SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR 2.0
+#endif
+
+/**
+ * Predicated thresholding allows to better preserve texture details and to
+ * improve performance, by decreasing the number of detected edges using an
+ * additional buffer like the light accumulation buffer, object ids or even the
+ * depth buffer (the depth buffer usage may be limited to indoor or short range
+ * scenes).
+ *
+ * It locally decreases the luma or color threshold if an edge is found in an
+ * additional buffer (so the global threshold can be higher).
+ *
+ * This method was developed by Playstation EDGE MLAA team, and used in 
+ * Killzone 3, by using the light accumulation buffer. More information here:
+ *     http://iryoku.com/aacourse/downloads/06-MLAA-on-PS3.pptx 
+ */
+#ifndef SMAA_PREDICATION
+#define SMAA_PREDICATION 0
+#endif
+
+/**
+ * Threshold to be used in the additional predication buffer. 
+ *
+ * Range: depends on the input, so you'll have to find the magic number that
+ * works for you.
+ */
+#ifndef SMAA_PREDICATION_THRESHOLD
+#define SMAA_PREDICATION_THRESHOLD 0.01
+#endif
+
+/**
+ * How much to scale the global threshold used for luma or color edge
+ * detection when using predication.
+ *
+ * Range: [1, 5]
+ */
+#ifndef SMAA_PREDICATION_SCALE
+#define SMAA_PREDICATION_SCALE 2.0
+#endif
+
+/**
+ * How much to locally decrease the threshold.
+ *
+ * Range: [0, 1]
+ */
+#ifndef SMAA_PREDICATION_STRENGTH
+#define SMAA_PREDICATION_STRENGTH 0.4
+#endif
+
+/**
+ * Temporal reprojection allows to remove ghosting artifacts when using
+ * temporal supersampling. We use the CryEngine 3 method which also introduces
+ * velocity weighting. This feature is of extreme importance for totally
+ * removing ghosting. More information here:
+ *    http://iryoku.com/aacourse/downloads/13-Anti-Aliasing-Methods-in-CryENGINE-3.pdf
+ *
+ * Note that you'll need to setup a velocity buffer for enabling reprojection.
+ * For static geometry, saving the previous depth buffer is a viable
+ * alternative.
+ */
+#ifndef SMAA_REPROJECTION
+#define SMAA_REPROJECTION 0
+#endif
+
+/**
+ * SMAA_REPROJECTION_WEIGHT_SCALE controls the velocity weighting. It allows to
+ * remove ghosting trails behind the moving object, which are not removed by
+ * just using reprojection. Using low values will exhibit ghosting, while using
+ * high values will disable temporal supersampling under motion.
+ *
+ * Behind the scenes, velocity weighting removes temporal supersampling when
+ * the velocity of the subsamples differs (meaning they are different objects).
+ *
+ * Range: [0, 80]
+ */
+#ifndef SMAA_REPROJECTION_WEIGHT_SCALE
+#define SMAA_REPROJECTION_WEIGHT_SCALE 30.0
+#endif
+
+/**
+ * On some compilers, discard cannot be used in vertex shaders. Thus, they need
+ * to be compiled separately.
+ */
+#ifndef SMAA_INCLUDE_VS
+#define SMAA_INCLUDE_VS 1
+#endif
+#ifndef SMAA_INCLUDE_PS
+#define SMAA_INCLUDE_PS 1
+#endif
+
+//-----------------------------------------------------------------------------
+// Texture Access Defines
+
+#ifndef SMAA_AREATEX_SELECT
+#if defined(SMAA_HLSL_3)
+#define SMAA_AREATEX_SELECT(sample) sample.ra
+#else
+#define SMAA_AREATEX_SELECT(sample) sample.rg
+#endif
+#endif
+
+#ifndef SMAA_SEARCHTEX_SELECT
+#define SMAA_SEARCHTEX_SELECT(sample) sample.r
+#endif
+
+#ifndef SMAA_DECODE_VELOCITY
+#define SMAA_DECODE_VELOCITY(sample) sample.rg
+#endif
+
+//-----------------------------------------------------------------------------
+// Non-Configurable Defines
+
+#define SMAA_AREATEX_MAX_DISTANCE 16
+#define SMAA_AREATEX_MAX_DISTANCE_DIAG 20
+#define SMAA_AREATEX_PIXEL_SIZE (1.0 / float2(160.0, 560.0))
+#define SMAA_AREATEX_SUBTEX_SIZE (1.0 / 7.0)
+#define SMAA_SEARCHTEX_SIZE float2(66.0, 33.0)
+#define SMAA_SEARCHTEX_PACKED_SIZE float2(64.0, 16.0)
+#define SMAA_CORNER_ROUNDING_NORM (float(SMAA_CORNER_ROUNDING) / 100.0)
+
+//-----------------------------------------------------------------------------
+// Porting Functions
+
+#if defined(SMAA_HLSL_3)
+#define SMAATexture2D(tex) sampler2D tex
+#define SMAATexturePass2D(tex) tex
+#define SMAASampleLevelZero(tex, coord) tex2Dlod(tex, float4(coord, 0.0, 0.0))
+#define SMAASampleLevelZeroPoint(tex, coord) tex2Dlod(tex, float4(coord, 0.0, 0.0))
+#define SMAASampleLevelZeroOffset(tex, coord, offset) tex2Dlod(tex, float4(coord + offset * SMAA_RT_METRICS.xy, 0.0, 0.0))
+#define SMAASample(tex, coord) tex2D(tex, coord)
+#define SMAASamplePoint(tex, coord) tex2D(tex, coord)
+#define SMAASampleOffset(tex, coord, offset) tex2D(tex, coord + offset * SMAA_RT_METRICS.xy)
+#define SMAA_FLATTEN [flatten]
+#define SMAA_BRANCH [branch]
+#endif
+#if defined(SMAA_HLSL_4) || defined(SMAA_HLSL_4_1)
+SamplerState LinearSampler { Filter = MIN_MAG_LINEAR_MIP_POINT; AddressU = Clamp; AddressV = Clamp; };
+SamplerState PointSampler { Filter = MIN_MAG_MIP_POINT; AddressU = Clamp; AddressV = Clamp; };
+#define SMAATexture2D(tex) Texture2D tex
+#define SMAATexturePass2D(tex) tex
+#define SMAASampleLevelZero(tex, coord) tex.SampleLevel(LinearSampler, coord, 0)
+#define SMAASampleLevelZeroPoint(tex, coord) tex.SampleLevel(PointSampler, coord, 0)
+#define SMAASampleLevelZeroOffset(tex, coord, offset) tex.SampleLevel(LinearSampler, coord, 0, offset)
+#define SMAASample(tex, coord) tex.Sample(LinearSampler, coord)
+#define SMAASamplePoint(tex, coord) tex.Sample(PointSampler, coord)
+#define SMAASampleOffset(tex, coord, offset) tex.Sample(LinearSampler, coord, offset)
+#define SMAA_FLATTEN [flatten]
+#define SMAA_BRANCH [branch]
+#define SMAATexture2DMS2(tex) Texture2DMS<float4, 2> tex
+#define SMAALoad(tex, pos, sample) tex.Load(pos, sample)
+#if defined(SMAA_HLSL_4_1)
+#define SMAAGather(tex, coord) tex.Gather(LinearSampler, coord, 0)
+#endif
+#endif
+#if defined(SMAA_GLSL_3) || defined(SMAA_GLSL_4)
+#define SMAATexture2D(tex) sampler2D tex
+#define SMAATexturePass2D(tex) tex
+#define SMAASampleLevelZero(tex, coord) textureLod(tex, coord, 0.0)
+#define SMAASampleLevelZeroPoint(tex, coord) textureLod(tex, coord, 0.0)
+#define SMAASampleLevelZeroOffset(tex, coord, offset) textureLodOffset(tex, coord, 0.0, offset)
+#define SMAASample(tex, coord) texture(tex, coord)
+#define SMAASamplePoint(tex, coord) texture(tex, coord)
+#define SMAASampleOffset(tex, coord, offset) textureOffset(tex, coord, offset) // texture()'s third argument is an LOD bias, not a texel offset
+#define SMAA_FLATTEN
+#define SMAA_BRANCH
+#define lerp(a, b, t) mix(a, b, t)
+#define saturate(a) clamp(a, 0.0, 1.0)
+#if defined(SMAA_GLSL_4)
+#define mad(a, b, c) fma(a, b, c)
+#define SMAAGather(tex, coord) textureGather(tex, coord)
+#else
+#define mad(a, b, c) ((a) * (b) + (c)) // parenthesized so expression arguments group the same way as in the fma() path
+#endif
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define int2 ivec2
+#define int3 ivec3
+#define int4 ivec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#endif
+
+#if !defined(SMAA_HLSL_3) && !defined(SMAA_HLSL_4) && !defined(SMAA_HLSL_4_1) && !defined(SMAA_GLSL_3) && !defined(SMAA_GLSL_4) && !defined(SMAA_CUSTOM_SL)
+#error you must define the shading language: SMAA_HLSL_*, SMAA_GLSL_* or SMAA_CUSTOM_SL
+#endif
+
+//-----------------------------------------------------------------------------
+// Misc functions
+
+/**
+ * Returns the red channel of the current pixel and its left and top neighbours.
+ */
+float3 SMAAGatherNeighbours(float2 texcoord,
+                            float4 offset[3],
+                            SMAATexture2D(tex)) {
+    #ifdef SMAAGather
+    return SMAAGather(tex, texcoord + SMAA_RT_METRICS.xy * float2(-0.5, -0.5)).grb;
+    #else
+    // No gather available: take three point samples (current, left, top).
+    return float3(SMAASamplePoint(tex, texcoord).r,
+                  SMAASamplePoint(tex, offset[0].xy).r,
+                  SMAASamplePoint(tex, offset[0].zw).r);
+    #endif
+}
+
+/**
+ * Derives the per-axis edge threshold from the predication texture.
+ */
+float2 SMAACalculatePredicatedThreshold(float2 texcoord,
+                                        float4 offset[3],
+                                        SMAATexture2D(predicationTex)) {
+    float3 taps = SMAAGatherNeighbours(texcoord, offset, SMAATexturePass2D(predicationTex));
+    float2 contrast = abs(taps.xx - taps.yz);
+    float2 predicated = step(SMAA_PREDICATION_THRESHOLD, contrast);
+    return SMAA_PREDICATION_SCALE * SMAA_THRESHOLD * (1.0 - SMAA_PREDICATION_STRENGTH * predicated);
+}
+
+/**
+ * Conditional move: per component, overwrites 'variable' with 'value' where 'cond' is true.
+ */
+void SMAAMovc(bool2 cond, inout float2 variable, float2 value) {
+    SMAA_FLATTEN if (cond.x) variable.x = value.x;
+    SMAA_FLATTEN if (cond.y) variable.y = value.y;
+}
+
+void SMAAMovc(bool4 cond, inout float4 variable, float4 value) { // float4 overload of the conditional move
+    SMAAMovc(cond.xy, variable.xy, value.xy); // lower half via the float2 overload
+    SMAAMovc(cond.zw, variable.zw, value.zw); // upper half via the float2 overload
+}
+
+
+#if SMAA_INCLUDE_VS
+//-----------------------------------------------------------------------------
+// Vertex Shaders
+
+/**
+ * Edge Detection Vertex Shader: precomputes the neighbour texcoords read by the edge detection passes.
+ */
+void SMAAEdgeDetectionVS(float2 texcoord,
+                         out float4 offset[3]) {
+    offset[0] = mad(SMAA_RT_METRICS.xyxy, float4(-1.0, 0.0, 0.0, -1.0), texcoord.xyxy); // xy: left, zw: top
+    offset[1] = mad(SMAA_RT_METRICS.xyxy, float4( 1.0, 0.0, 0.0,  1.0), texcoord.xyxy); // xy: right, zw: bottom
+    offset[2] = mad(SMAA_RT_METRICS.xyxy, float4(-2.0, 0.0, 0.0, -2.0), texcoord.xyxy); // xy: left-left, zw: top-top
+}
+
+/**
+ * Blend Weight Calculation Vertex Shader: outputs 'pixcoord' plus the search start and end coordinates.
+ */
+void SMAABlendingWeightCalculationVS(float2 texcoord,
+                                     out float2 pixcoord,
+                                     out float4 offset[3]) {
+    pixcoord = texcoord * SMAA_RT_METRICS.zw; // presumably pixel coordinates (assumes .zw holds the render target size -- confirm)
+
+    // Start coordinates for the searches later on (see @PSEUDO_GATHER4):
+    offset[0] = mad(SMAA_RT_METRICS.xyxy, float4(-0.25, -0.125,  1.25, -0.125), texcoord.xyxy);
+    offset[1] = mad(SMAA_RT_METRICS.xyxy, float4(-0.125, -0.25, -0.125,  1.25), texcoord.xyxy);
+
+    // Loop ends for the searches: the x of offset[0] and the y of offset[1], moved by 2 * SMAA_MAX_SEARCH_STEPS steps each way:
+    offset[2] = mad(SMAA_RT_METRICS.xxyy,
+                    float4(-2.0, 2.0, -2.0, 2.0) * float(SMAA_MAX_SEARCH_STEPS),
+                    float4(offset[0].xz, offset[1].yw));
+}
+
+/**
+ * Neighborhood Blending Vertex Shader: outputs the texcoords one SMAA_RT_METRICS step along +x and +y.
+ */
+void SMAANeighborhoodBlendingVS(float2 texcoord,
+                                out float4 offset) {
+    offset = mad(SMAA_RT_METRICS.xyxy, float4( 1.0, 0.0, 0.0,  1.0), texcoord.xyxy); // xy: one step along +x, zw: one step along +y
+}
+#endif // SMAA_INCLUDE_VS
+
+#if SMAA_INCLUDE_PS
+//-----------------------------------------------------------------------------
+// Edge Detection Pixel Shaders (First Pass)
+
+/**
+ * Luma Edge Detection: returns the (left, top) edge flags, or (-2, -2) when
+ * neither edge passes the threshold.
+ *
+ * IMPORTANT: needs gamma-corrected colors, so 'colorTex' should be non-sRGB.
+ */
+float2 SMAALumaEdgeDetectionPS(float2 texcoord,
+                               float4 offset[3],
+                               SMAATexture2D(colorTex)
+                               #if SMAA_PREDICATION
+                               , SMAATexture2D(predicationTex)
+                               #endif
+                               ) {
+    // Calculate the threshold:
+    #if SMAA_PREDICATION
+    float2 threshold = SMAACalculatePredicatedThreshold(texcoord, offset, SMAATexturePass2D(predicationTex));
+    #else
+    float2 threshold = float2(SMAA_THRESHOLD, SMAA_THRESHOLD);
+    #endif
+
+    // Calculate lumas (weighted sum of the RGB channels):
+    float3 weights = float3(0.2126, 0.7152, 0.0722);
+    float L = dot(SMAASamplePoint(colorTex, texcoord).rgb, weights);
+
+    float Lleft = dot(SMAASamplePoint(colorTex, offset[0].xy).rgb, weights);
+    float Ltop  = dot(SMAASamplePoint(colorTex, offset[0].zw).rgb, weights);
+
+    // We do the usual threshold (x: left edge, y: top edge):
+    float4 delta;
+    delta.xy = abs(L - float2(Lleft, Ltop));
+    float2 edges = step(threshold, delta.xy);
+
+    // No edge on either side: bail out early with the (-2, -2) sentinel:
+    if (dot(edges, float2(1.0, 1.0)) == 0.0)
+        return float2(-2.0, -2.0);
+
+    // Calculate right and bottom deltas:
+    float Lright = dot(SMAASamplePoint(colorTex, offset[1].xy).rgb, weights);
+    float Lbottom  = dot(SMAASamplePoint(colorTex, offset[1].zw).rgb, weights);
+    delta.zw = abs(L - float2(Lright, Lbottom));
+
+    // Calculate the maximum delta in the direct neighborhood:
+    float2 maxDelta = max(delta.xy, delta.zw);
+
+    // Calculate left-left and top-top deltas (delta.zw is reused as scratch):
+    float Lleftleft = dot(SMAASamplePoint(colorTex, offset[2].xy).rgb, weights);
+    float Ltoptop = dot(SMAASamplePoint(colorTex, offset[2].zw).rgb, weights);
+    delta.zw = abs(float2(Lleft, Ltop) - float2(Lleftleft, Ltoptop));
+
+    // Calculate the final maximum delta:
+    maxDelta = max(maxDelta.xy, delta.zw);
+    float finalDelta = max(maxDelta.x, maxDelta.y);
+
+    // Local contrast adaptation: keep an edge only if FACTOR * its delta reaches the largest delta:
+    edges.xy *= step(finalDelta, SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR * delta.xy);
+
+    return edges;
+}
+
+/**
+ * Color Edge Detection: returns the (left, top) edge flags, or (-2, -2) when
+ * neither edge passes the threshold.
+ *
+ * IMPORTANT: needs gamma-corrected colors, so 'colorTex' should be non-sRGB.
+ */
+float2 SMAAColorEdgeDetectionPS(float2 texcoord,
+                                float4 offset[3],
+                                SMAATexture2D(colorTex)
+                                #if SMAA_PREDICATION
+                                , SMAATexture2D(predicationTex)
+                                #endif
+                                ) {
+    // Calculate the threshold (texture forwarded through SMAATexturePass2D, as in the luma variant):
+    #if SMAA_PREDICATION
+    float2 threshold = SMAACalculatePredicatedThreshold(texcoord, offset, SMAATexturePass2D(predicationTex));
+    #else
+    float2 threshold = float2(SMAA_THRESHOLD, SMAA_THRESHOLD);
+    #endif
+
+    // Calculate color deltas (largest per-channel difference):
+    float4 delta;
+    float3 C = SMAASamplePoint(colorTex, texcoord).rgb;
+
+    float3 Cleft = SMAASamplePoint(colorTex, offset[0].xy).rgb;
+    float3 t = abs(C - Cleft);
+    delta.x = max(max(t.r, t.g), t.b);
+
+    float3 Ctop  = SMAASamplePoint(colorTex, offset[0].zw).rgb;
+    t = abs(C - Ctop);
+    delta.y = max(max(t.r, t.g), t.b);
+
+    // We do the usual threshold (x: left edge, y: top edge):
+    float2 edges = step(threshold, delta.xy);
+
+    // No edge on either side: bail out early with the (-2, -2) sentinel:
+    if (dot(edges, float2(1.0, 1.0)) == 0.0)
+        return float2(-2.0, -2.0);
+
+    // Calculate right and bottom deltas:
+    float3 Cright = SMAASamplePoint(colorTex, offset[1].xy).rgb;
+    t = abs(C - Cright);
+    delta.z = max(max(t.r, t.g), t.b);
+
+    float3 Cbottom  = SMAASamplePoint(colorTex, offset[1].zw).rgb;
+    t = abs(C - Cbottom);
+    delta.w = max(max(t.r, t.g), t.b);
+
+    // Calculate the maximum delta in the direct neighborhood:
+    float2 maxDelta = max(delta.xy, delta.zw);
+
+    // Calculate left-left and top-top deltas, measured from the left/top pixels like the luma variant:
+    float3 Cleftleft  = SMAASamplePoint(colorTex, offset[2].xy).rgb;
+    t = abs(Cleft - Cleftleft);
+    delta.z = max(max(t.r, t.g), t.b);
+
+    float3 Ctoptop = SMAASamplePoint(colorTex, offset[2].zw).rgb;
+    t = abs(Ctop - Ctoptop);
+    delta.w = max(max(t.r, t.g), t.b);
+
+    // Calculate the final maximum delta:
+    maxDelta = max(maxDelta.xy, delta.zw);
+    float finalDelta = max(maxDelta.x, maxDelta.y);
+
+    // Local contrast adaptation: keep an edge only if FACTOR * its delta reaches the largest delta:
+    edges.xy *= step(finalDelta, SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR * delta.xy);
+
+    return edges;
+}
+
+/**
+ * Depth Edge Detection: flags the left/top edges whose depth step reaches SMAA_DEPTH_THRESHOLD.
+ */
+float2 SMAADepthEdgeDetectionPS(float2 texcoord,
+                                float4 offset[3],
+                                SMAATexture2D(depthTex)) {
+    float3 depths = SMAAGatherNeighbours(texcoord, offset, SMAATexturePass2D(depthTex));
+    float2 depthStep = abs(depths.xx - depths.yz);
+    float2 flags = step(SMAA_DEPTH_THRESHOLD, depthStep);
+
+    // Neither edge is set: hand back the (-2, -2) sentinel instead.
+    float2 noEdge = float2(-2.0, -2.0);
+    bool isEmpty = dot(flags, float2(1.0, 1.0)) == 0.0;
+    return isEmpty ? noEdge : flags;
+}
+
+//-----------------------------------------------------------------------------
+// Diagonal Search Functions
+
+#if !defined(SMAA_DISABLE_DIAG_DETECTION)
+
+/**
+ * Decodes two binary edge values from a single bilinear-filtered fetch.
+ */
+float2 SMAADecodeDiagBilinearAccess(float2 e) {
+    // The bilinear fetch of 'e' has a 0.25 offset, and we are
+    // interested in the R and G edges:
+    //
+    // +---G---+-------+
+    // |   x o R   x   |
+    // +-------+-------+
+    //
+    // Then, if one of these edges is enabled:
+    //   Red:   (0.75 * X + 0.25 * 1) => 0.25 or 1.0
+    //   Green: (0.75 * 1 + 0.25 * X) => 0.75 or 1.0
+    //
+    // This function unpacks the values (mad + mul + round):
+    // wolframalpha.com: round(x * abs(5 * x - 5 * 0.75)) plot 0 to 1
+    e.r = e.r * abs(5.0 * e.r - 5.0 * 0.75);
+    return round(e);
+}
+
+float4 SMAADecodeDiagBilinearAccess(float4 e) { // float4 overload: decodes two fetches at once
+    e.rb = e.rb * abs(5.0 * e.rb - 5.0 * 0.75); // same remap as the float2 overload, applied to .r and .b
+    return round(e);
+}
+
+/**
+ * Diagonal pattern search along 'dir'; returns (step counter, last weight) and leaves the last fetched edges in 'e'.
+ */
+float2 SMAASearchDiag1(SMAATexture2D(edgesTex), float2 texcoord, float2 dir, out float2 e) {
+    float4 coord = float4(texcoord, -1.0, 1.0); // xy: sample position, z: step counter, w: continue weight
+    float3 t = float3(SMAA_RT_METRICS.xy, 1.0);
+    while (coord.z < float(SMAA_MAX_SEARCH_STEPS_DIAG - 1) &&
+           coord.w > 0.9) {
+        coord.xyz = mad(t, float3(dir, 1.0), coord.xyz); // move one step along 'dir' and bump the counter
+        e = SMAASampleLevelZero(edgesTex, coord.xy).rg; // NOTE: 'e' is only written inside the loop
+        coord.w = dot(e, float2(0.5, 0.5)); // average of both edge channels; the loop continues while it is > 0.9
+    }
+    return coord.zw;
+}
+
+float2 SMAASearchDiag2(SMAATexture2D(edgesTex), float2 texcoord, float2 dir, out float2 e) { // same contract as SMAASearchDiag1
+    float4 coord = float4(texcoord, -1.0, 1.0); // xy: sample position, z: step counter, w: continue weight
+    coord.x += 0.25 * SMAA_RT_METRICS.x; // See @SearchDiag2Optimization
+    float3 t = float3(SMAA_RT_METRICS.xy, 1.0);
+    while (coord.z < float(SMAA_MAX_SEARCH_STEPS_DIAG - 1) &&
+           coord.w > 0.9) {
+        coord.xyz = mad(t, float3(dir, 1.0), coord.xyz);
+
+        // @SearchDiag2Optimization
+        // Fetch both edges at once using bilinear filtering (hence the 0.25 shift above), then decode them:
+        e = SMAASampleLevelZero(edgesTex, coord.xy).rg;
+        e = SMAADecodeDiagBilinearAccess(e);
+
+        // Non-optimized version:
+        // e.g = SMAASampleLevelZero(edgesTex, coord.xy).g;
+        // e.r = SMAASampleLevelZeroOffset(edgesTex, coord.xy, int2(1, 0)).r;
+
+        coord.w = dot(e, float2(0.5, 0.5));
+    }
+    return coord.zw;
+}
+
+/**
+ * Diagonal counterpart of SMAAArea: looks up the area for a diagonal
+ * distance 'dist' and crossing edges 'e'.
+ */
+float2 SMAAAreaDiag(SMAATexture2D(areaTex), float2 dist, float2 e, float offset) {
+    float2 uv = mad(float2(SMAA_AREATEX_MAX_DISTANCE_DIAG, SMAA_AREATEX_MAX_DISTANCE_DIAG), e, dist);
+
+    // Scale and bias to map into texel space:
+    uv = mad(SMAA_AREATEX_PIXEL_SIZE, uv, 0.5 * SMAA_AREATEX_PIXEL_SIZE);
+
+    // The diagonal areas live in the second half of the texture:
+    uv.x += 0.5;
+
+    // Pick the sub-texture that matches the subpixel offset:
+    uv.y += SMAA_AREATEX_SUBTEX_SIZE * offset;
+
+    // Fetch the area:
+    return SMAA_AREATEX_SELECT(SMAASampleLevelZero(areaTex, uv));
+}
+
+/**
+ * Searches for diagonal patterns in both diagonal directions and returns the accumulated weights.
+ */
+float2 SMAACalculateDiagWeights(SMAATexture2D(edgesTex), SMAATexture2D(areaTex), float2 texcoord, float2 e, float4 subsampleIndices) {
+    float2 weights = float2(0.0, 0.0);
+
+    // Search for the line ends (d.xy: search distances, d.zw: final search weights):
+    float4 d;
+    float2 end;
+    if (e.r > 0.0) {
+        d.xz = SMAASearchDiag1(SMAATexturePass2D(edgesTex), texcoord, float2(-1.0,  1.0), end);
+        d.x += float(end.y > 0.9); // one more step if the last fetched .g edge is set
+    } else
+        d.xz = float2(0.0, 0.0);
+    d.yw = SMAASearchDiag1(SMAATexturePass2D(edgesTex), texcoord, float2(1.0, -1.0), end);
+
+    SMAA_BRANCH
+    if (d.x + d.y > 2.0) { // d.x + d.y + 1 > 3
+        // Fetch the crossing edges:
+        float4 coords = mad(float4(-d.x + 0.25, d.x, d.y, -d.y - 0.25), SMAA_RT_METRICS.xyxy, texcoord.xyxy);
+        float4 c;
+        c.xy = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1,  0)).rg;
+        c.zw = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1,  0)).rg;
+        c.yxwz = SMAADecodeDiagBilinearAccess(c.xyzw);
+
+        // Non-optimized version:
+        // float4 coords = mad(float4(-d.x, d.x, d.y, -d.y), SMAA_RT_METRICS.xyxy, texcoord.xyxy);
+        // float4 c;
+        // c.x = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1,  0)).g;
+        // c.y = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2( 0,  0)).r;
+        // c.z = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1,  0)).g;
+        // c.w = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, -1)).r;
+
+        // Merge crossing edges at each side into a single value:
+        float2 cc = mad(float2(2.0, 2.0), c.xz, c.yw);
+
+        // Remove the crossing edge if we didn't find the end of the line:
+        SMAAMovc(bool2(step(0.9, d.zw)), cc, float2(0.0, 0.0));
+
+        // Fetch the areas for this line:
+        weights += SMAAAreaDiag(SMAATexturePass2D(areaTex), d.xy, cc, subsampleIndices.z);
+    }
+
+    // Search for the line ends in the other diagonal direction:
+    d.xz = SMAASearchDiag2(SMAATexturePass2D(edgesTex), texcoord, float2(-1.0, -1.0), end);
+    if (SMAASampleLevelZeroOffset(edgesTex, texcoord, int2(1, 0)).r > 0.0) {
+        d.yw = SMAASearchDiag2(SMAATexturePass2D(edgesTex), texcoord, float2(1.0, 1.0), end);
+        d.y += float(end.y > 0.9); // one more step if the last fetched .g edge is set
+    } else
+        d.yw = float2(0.0, 0.0);
+
+    SMAA_BRANCH
+    if (d.x + d.y > 2.0) { // d.x + d.y + 1 > 3
+        // Fetch the crossing edges:
+        float4 coords = mad(float4(-d.x, -d.x, d.y, d.y), SMAA_RT_METRICS.xyxy, texcoord.xyxy);
+        float4 c;
+        c.x  = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1,  0)).g;
+        c.y  = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2( 0, -1)).r;
+        c.zw = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1,  0)).gr;
+        float2 cc = mad(float2(2.0, 2.0), c.xz, c.yw);
+
+        // Remove the crossing edge if we didn't find the end of the line:
+        SMAAMovc(bool2(step(0.9, d.zw)), cc, float2(0.0, 0.0));
+
+        // Fetch the areas for this line (components swapped with .gr for this direction):
+        weights += SMAAAreaDiag(SMAATexturePass2D(areaTex), d.xy, cc, subsampleIndices.w).gr;
+    }
+
+    return weights;
+}
+#endif
+
+//-----------------------------------------------------------------------------
+// Horizontal/Vertical Search Functions
+
+/**
+ * Determines how much length to add in the last step of the searches. Takes
+ * the bilinearly interpolated edge 'e' (see @PSEUDO_GATHER4) and yields
+ * 0, 1 or 2, depending on which edges and crossing edges are
+ * active.
+ */
+float SMAASearchLength(SMAATexture2D(searchTex), float2 e, float offset) {
+    // The texture is stored flipped vertically; the left and right cases each
+    // occupy half of it horizontally ('offset' selects the half):
+    float2 uvScale = SMAA_SEARCHTEX_SIZE * float2(0.5, -1.0);
+    float2 uvBias = SMAA_SEARCHTEX_SIZE * float2(offset, 1.0);
+
+    // Adjust both so that lookups land on texel centers:
+    uvScale += float2(-1.0,  1.0);
+    uvBias  += float2( 0.5, -0.5);
+
+    // Go from pixel coordinates to texcoords; the texture is cropped, hence
+    // SMAA_SEARCHTEX_PACKED_SIZE rather than SMAA_SEARCHTEX_SIZE:
+    float2 toTexcoord = 1.0 / SMAA_SEARCHTEX_PACKED_SIZE;
+    uvScale *= toTexcoord;
+    uvBias *= toTexcoord;
+
+    return SMAA_SEARCHTEX_SELECT(SMAASampleLevelZero(searchTex, mad(uvScale, e, uvBias)));
+}
+
+/**
+ * Horizontal/vertical search functions for the 2nd pass.
+ */
+float SMAASearchXLeft(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) {
+    /**
+     * @PSEUDO_GATHER4
+     * This texcoord has been offset by (-0.25, -0.125) in the vertex shader to
+     * sample between edges, thus fetching four edges in a row.
+     * Sampling with different offsets in each direction makes it possible to
+     * disambiguate which of the four fetched edges are active.
+     */
+    float2 e = float2(0.0, 1.0);
+    while (texcoord.x > end && 
+           e.g > 0.8281 && // Is there some edge not activated?
+           e.r == 0.0) { // Or is there a crossing edge that breaks the line?
+        e = SMAASampleLevelZero(edgesTex, texcoord).rg;
+        texcoord = mad(-float2(2.0, 0.0), SMAA_RT_METRICS.xy, texcoord);
+    }
+
+    float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.0), 3.25);
+    return mad(SMAA_RT_METRICS.x, offset, texcoord.x);
+
+    // Non-optimized version (the 3.25 above is the 0.25 + 1.0 + 2.0 of the steps below):
+    // We correct the previous (-0.25, -0.125) offset we applied:
+    // texcoord.x += 0.25 * SMAA_RT_METRICS.x;
+
+    // The searches are biased by 1, so adjust the coords accordingly:
+    // texcoord.x += SMAA_RT_METRICS.x;
+
+    // Disambiguate the length added by the last step:
+    // texcoord.x += 2.0 * SMAA_RT_METRICS.x; // Undo last step
+    // texcoord.x -= SMAA_RT_METRICS.x * (255.0 / 127.0) * SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.0);
+    // return texcoord.x;
+}
+
+float SMAASearchXRight(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { // mirror of SMAASearchXLeft, walking toward +x
+    float2 e = float2(0.0, 1.0); // seeded so that the loop condition passes on entry
+    while (texcoord.x < end && 
+           e.g > 0.8281 && // Is there some edge not activated?
+           e.r == 0.0) { // Or is there a crossing edge that breaks the line?
+        e = SMAASampleLevelZero(edgesTex, texcoord).rg;
+        texcoord = mad(float2(2.0, 0.0), SMAA_RT_METRICS.xy, texcoord); // advance two steps along +x
+    }
+    float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.5), 3.25); // 0.5 selects the other half of the search texture
+    return mad(-SMAA_RT_METRICS.x, offset, texcoord.x); // correction applied with the opposite sign to the left search
+}
+
+float SMAASearchYUp(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { // vertical counterpart of SMAASearchXLeft, walking toward -y
+    float2 e = float2(1.0, 0.0); // seeded so that the loop condition passes on entry (roles of .r and .g swapped)
+    while (texcoord.y > end && 
+           e.r > 0.8281 && // Is there some edge not activated?
+           e.g == 0.0) { // Or is there a crossing edge that breaks the line?
+        e = SMAASampleLevelZero(edgesTex, texcoord).rg;
+        texcoord = mad(-float2(0.0, 2.0), SMAA_RT_METRICS.xy, texcoord); // advance two steps along -y
+    }
+    float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e.gr, 0.0), 3.25); // .gr gives SMAASearchLength the same layout as the horizontal searches
+    return mad(SMAA_RT_METRICS.y, offset, texcoord.y);
+}
+
+float SMAASearchYDown(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { // mirror of SMAASearchYUp, walking toward +y
+    float2 e = float2(1.0, 0.0); // seeded so that the loop condition passes on entry
+    while (texcoord.y < end && 
+           e.r > 0.8281 && // Is there some edge not activated?
+           e.g == 0.0) { // Or is there a crossing edge that breaks the line?
+        e = SMAASampleLevelZero(edgesTex, texcoord).rg;
+        texcoord = mad(float2(0.0, 2.0), SMAA_RT_METRICS.xy, texcoord); // advance two steps along +y
+    }
+    float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e.gr, 0.5), 3.25); // 0.5 selects the other half of the search texture
+    return mad(-SMAA_RT_METRICS.y, offset, texcoord.y); // correction applied with the opposite sign to the up search
+}
+
+/**
+ * Looks up the areas at each side of the current edge, given the distance
+ * 'dist' and the two crossing edges 'e1' and 'e2'.
+ */
+float2 SMAAArea(SMAATexture2D(areaTex), float2 dist, float e1, float e2, float offset) {
+    // Rounding prevents precision errors of bilinear filtering:
+    float2 crossings = round(4.0 * float2(e1, e2));
+    float2 uv = mad(float2(SMAA_AREATEX_MAX_DISTANCE, SMAA_AREATEX_MAX_DISTANCE), crossings, dist);
+    // Scale and bias to map into texel space:
+    uv = mad(SMAA_AREATEX_PIXEL_SIZE, uv, 0.5 * SMAA_AREATEX_PIXEL_SIZE);
+
+    // Select the sub-texture for this subpixel offset:
+    uv.y = mad(SMAA_AREATEX_SUBTEX_SIZE, offset, uv.y);
+
+    // Sample the area texture:
+    return SMAA_AREATEX_SELECT(SMAASampleLevelZero(areaTex, uv));
+}
+
+//-----------------------------------------------------------------------------
+// Corner Detection Functions
+
+void SMAADetectHorizontalCornerPattern(SMAATexture2D(edgesTex), inout float2 weights, float4 texcoord, float2 d) {
+    #if !defined(SMAA_DISABLE_CORNER_DETECTION)
+    // Flag the nearer line end: x for the left one, y for the right one (both on a tie).
+    float2 nearest = step(d.xy, d.yx);
+    // Dividing by the flag count reduces blending for pixels in the center of a line:
+    float2 rounding = (1.0 - SMAA_CORNER_ROUNDING_NORM) * nearest / (nearest.x + nearest.y);
+
+    float2 attenuation = float2(1.0, 1.0);
+    attenuation.x -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(0,  1)).r;
+    attenuation.x -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(1,  1)).r;
+    attenuation.y -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(0, -2)).r;
+    attenuation.y -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(1, -2)).r;
+
+    weights *= saturate(attenuation);
+    #endif
+}
+
+void SMAADetectVerticalCornerPattern(SMAATexture2D(edgesTex), inout float2 weights, float4 texcoord, float2 d) {
+    #if !defined(SMAA_DISABLE_CORNER_DETECTION)
+    // Flag the nearer line end: x for the top one, y for the bottom one (both on a tie).
+    float2 nearest = step(d.xy, d.yx);
+    // Dividing by the flag count halves the rounding when both ends are flagged:
+    float2 rounding = (1.0 - SMAA_CORNER_ROUNDING_NORM) * nearest / (nearest.x + nearest.y);
+
+    float2 attenuation = float2(1.0, 1.0);
+    attenuation.x -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2( 1, 0)).g;
+    attenuation.x -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2( 1, 1)).g;
+    attenuation.y -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(-2, 0)).g;
+    attenuation.y -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(-2, 1)).g;
+
+    weights *= saturate(attenuation);
+    #endif
+}
+
+//-----------------------------------------------------------------------------
+// Blending Weight Calculation Pixel Shader (Second Pass)
+
+float4 SMAABlendingWeightCalculationPS(float2 texcoord, // Coordinates used to fetch the edges of the current pixel.
+                                       float2 pixcoord, // Subtracted from the scaled search results to get pixel distances.
+                                       float4 offset[3], // [0] and [1]: search start coordinates; [2]: search end limits.
+                                       SMAATexture2D(edgesTex),
+                                       SMAATexture2D(areaTex),
+                                       SMAATexture2D(searchTex),
+                                       float4 subsampleIndices) { // Just pass zero for SMAA 1x, see @SUBSAMPLE_INDICES.
+    float4 weights = float4(0.0, 0.0, 0.0, 0.0); // rg is written by the north-edge branch, ba by the west-edge branch.
+
+    float2 e = SMAASample(edgesTex, texcoord).rg; // r: west edge, g: north edge (see the tests below).
+
+    SMAA_BRANCH
+    if (e.g > 0.0) { // Edge at north
+        #if !defined(SMAA_DISABLE_DIAG_DETECTION)
+        // Diagonals have both north and west edges, so searching for them in
+        // one of the boundaries is enough.
+        weights.rg = SMAACalculateDiagWeights(SMAATexturePass2D(edgesTex), SMAATexturePass2D(areaTex), texcoord, e, subsampleIndices);
+
+        // We give priority to diagonals, so if we find a diagonal we skip
+        // horizontal/vertical processing.
+        SMAA_BRANCH
+        if (weights.r == -weights.g) { // weights.r + weights.g == 0.0
+        #endif
+
+        float2 d;
+
+        // Find the distance to the left:
+        float3 coords;
+        coords.x = SMAASearchXLeft(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[0].xy, offset[2].x);
+        coords.y = offset[1].y; // offset[1].y = texcoord.y - 0.25 * SMAA_RT_METRICS.y (@CROSSING_OFFSET)
+        d.x = coords.x;
+
+        // Now fetch the left crossing edges, two at a time using bilinear
+        // filtering. Sampling at -0.25 (see @CROSSING_OFFSET) enables to
+        // discern what value each edge has:
+        float e1 = SMAASampleLevelZero(edgesTex, coords.xy).r;
+
+        // Find the distance to the right:
+        coords.z = SMAASearchXRight(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[0].zw, offset[2].y);
+        d.y = coords.z;
+
+        // We want the distances to be in pixel units (doing this here allow to
+        // better interleave arithmetic and memory accesses):
+        d = abs(round(mad(SMAA_RT_METRICS.zz, d, -pixcoord.xx)));
+
+        // SMAAArea below needs a sqrt, as the areas texture is compressed
+        // quadratically:
+        float2 sqrt_d = sqrt(d);
+
+        // Fetch the right crossing edges:
+        float e2 = SMAASampleLevelZeroOffset(edgesTex, coords.zy, int2(1, 0)).r;
+
+        // Ok, we know how this pattern looks like, now it is time for getting
+        // the actual area:
+        weights.rg = SMAAArea(SMAATexturePass2D(areaTex), sqrt_d, e1, e2, subsampleIndices.y);
+
+        // Fix corners:
+        coords.y = texcoord.y;
+        SMAADetectHorizontalCornerPattern(SMAATexturePass2D(edgesTex), weights.rg, coords.xyzy, d);
+
+        #if !defined(SMAA_DISABLE_DIAG_DETECTION)
+        } else
+            e.r = 0.0; // Skip vertical processing.
+        #endif
+    }
+
+    SMAA_BRANCH
+    if (e.r > 0.0) { // Edge at west
+        float2 d;
+
+        // Find the distance to the top:
+        float3 coords;
+        coords.y = SMAASearchYUp(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[1].xy, offset[2].z);
+        coords.x = offset[0].x; // offset[0].x = texcoord.x - 0.25 * SMAA_RT_METRICS.x (@CROSSING_OFFSET)
+        d.x = coords.y;
+
+        // Fetch the top crossing edges:
+        float e1 = SMAASampleLevelZero(edgesTex, coords.xy).g;
+
+        // Find the distance to the bottom:
+        coords.z = SMAASearchYDown(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[1].zw, offset[2].w);
+        d.y = coords.z;
+
+        // We want the distances to be in pixel units:
+        d = abs(round(mad(SMAA_RT_METRICS.ww, d, -pixcoord.yy)));
+
+        // SMAAArea below needs a sqrt, as the areas texture is compressed
+        // quadratically:
+        float2 sqrt_d = sqrt(d);
+
+        // Fetch the bottom crossing edges:
+        float e2 = SMAASampleLevelZeroOffset(edgesTex, coords.xz, int2(0, 1)).g;
+
+        // Get the area for this direction:
+        weights.ba = SMAAArea(SMAATexturePass2D(areaTex), sqrt_d, e1, e2, subsampleIndices.x);
+
+        // Fix corners:
+        coords.x = texcoord.x;
+        SMAADetectVerticalCornerPattern(SMAATexturePass2D(edgesTex), weights.ba, coords.xyxz, d);
+    }
+
+    return weights;
+}
+
+//-----------------------------------------------------------------------------
+// Neighborhood Blending Pixel Shader (Third Pass)
+
+float4 SMAANeighborhoodBlendingPS(float2 texcoord,
+                                  float4 offset,
+                                  SMAATexture2D(colorTex),
+                                  SMAATexture2D(blendTex)
+                                  #if SMAA_REPROJECTION
+                                  , SMAATexture2D(velocityTex)
+                                  #endif
+                                  ) {
+    // Gather the blending weights that affect the current pixel:
+    float4 weights;
+    weights.x = SMAASample(blendTex, offset.xy).a; // Right
+    weights.y = SMAASample(blendTex, offset.zw).g; // Top
+    weights.wz = SMAASample(blendTex, texcoord).xz; // Bottom / Left
+
+    // When the weights add up to (almost) nothing the pixel is returned unblended:
+    SMAA_BRANCH
+    if (dot(weights, float4(1.0, 1.0, 1.0, 1.0)) < 1e-5) {
+        float4 color = SMAASampleLevelZero(colorTex, texcoord);
+
+        #if SMAA_REPROJECTION
+        float2 velocity = SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, texcoord));
+
+        // Pack the scaled velocity length into the alpha channel:
+        color.a = sqrt(5.0 * length(velocity));
+        #endif
+
+        return color;
+    } else {
+        // True when the largest horizontal weight exceeds the largest vertical one:
+        bool horizontal = max(weights.x, weights.z) > max(weights.y, weights.w);
+        // Calculate the blending offsets and weights for the chosen direction:
+        float4 blendingOffset = float4(0.0, weights.y, 0.0, weights.w);
+        float2 blendingWeight = weights.yw;
+        SMAAMovc(bool4(horizontal, horizontal, horizontal, horizontal), blendingOffset, float4(weights.x, 0.0, weights.z, 0.0));
+        SMAAMovc(bool2(horizontal, horizontal), blendingWeight, weights.xz);
+        blendingWeight /= dot(blendingWeight, float2(1.0, 1.0));
+
+        // Texture coordinates of the two samples to mix:
+        float4 blendingCoord = mad(blendingOffset, float4(SMAA_RT_METRICS.xy, -SMAA_RT_METRICS.xy), texcoord.xyxy);
+
+        // Bilinear filtering at these coordinates mixes the current pixel with
+        // the chosen neighbor:
+        float4 blended = blendingWeight.x * SMAASampleLevelZero(colorTex, blendingCoord.xy);
+        blended += blendingWeight.y * SMAASampleLevelZero(colorTex, blendingCoord.zw);
+
+        #if SMAA_REPROJECTION
+        // Antialias velocity for proper reprojection in a later stage:
+        float2 velocity = blendingWeight.x * SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, blendingCoord.xy));
+        velocity += blendingWeight.y * SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, blendingCoord.zw));
+
+        // Pack the scaled velocity length into the alpha channel:
+        blended.a = sqrt(5.0 * length(velocity));
+        #endif
+
+        return blended;
+    }
+}
+
+//-----------------------------------------------------------------------------
+// Temporal Resolve Pixel Shader (Optional Pass)
+
+float4 SMAAResolvePS(float2 texcoord,
+                     SMAATexture2D(currentColorTex),
+                     SMAATexture2D(previousColorTex)
+                     #if SMAA_REPROJECTION
+                     , SMAATexture2D(velocityTex)
+                     #endif
+                     ) {
+    #if SMAA_REPROJECTION
+    // The velocity is assumed to have been computed for motion blur, so it is
+    // negated for reprojection:
+    float2 velocity = -SMAA_DECODE_VELOCITY(SMAASamplePoint(velocityTex, texcoord).rg);
+
+    // Color of the current frame:
+    float4 thisFrame = SMAASamplePoint(currentColorTex, texcoord);
+
+    // Color of the previous frame at the reprojected coordinates:
+    float4 lastFrame = SMAASamplePoint(previousColorTex, texcoord + velocity);
+
+    // The more the squared alpha values differ, the less the previous frame counts:
+    float alphaDelta = abs(thisFrame.a * thisFrame.a - lastFrame.a * lastFrame.a) / 5.0;
+    float lastFrameWeight = 0.5 * saturate(1.0 - sqrt(alphaDelta) * SMAA_REPROJECTION_WEIGHT_SCALE);
+
+    // Blend both frames with the resulting weight:
+    return lerp(thisFrame, lastFrame, lastFrameWeight);
+    #else
+    // Without reprojection both frames are blended in equal parts:
+    float4 thisFrame = SMAASamplePoint(currentColorTex, texcoord);
+    float4 lastFrame = SMAASamplePoint(previousColorTex, texcoord);
+    return lerp(thisFrame, lastFrame, 0.5);
+    #endif
+}
+
+//-----------------------------------------------------------------------------
+// Separate Multisamples Pixel Shader (Optional Pass)
+
+#ifdef SMAALoad
+void SMAASeparatePS(float4 position,
+                    float2 texcoord,
+                    out float4 target0,
+                    out float4 target1,
+                    SMAATexture2DMS2(colorTexMS)) {
+    int2 texel = int2(position.xy); // Integer coordinates taken from the position; texcoord is not used.
+    target0 = SMAALoad(colorTexMS, texel, 0); // Sample index 0
+    target1 = SMAALoad(colorTexMS, texel, 1); // Sample index 1
+}
+#endif
+
+//-----------------------------------------------------------------------------
+#endif // SMAA_INCLUDE_PS
diff --git a/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_blend.glsl b/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_blend.glsl
new file mode 100644
index 0000000000..c875ce127d
--- /dev/null
+++ b/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_blend.glsl
@@ -0,0 +1,26 @@
+layout(rgba8, binding = 0) uniform image2D imgOutput;
+
+uniform sampler2D inputTexture;
+layout( location=0 ) uniform vec2 invResolution;
+uniform sampler2D samplerArea;
+uniform sampler2D samplerSearch;
+
+// Blending weight calculation pass; every invocation handles a 4x4 block of texels.
+void main() {
+    ivec2 blockOrigin = ivec2(gl_GlobalInvocationID.x * 4, gl_GlobalInvocationID.y * 4);
+    for(int dx = 0; dx < 4; dx++)
+    {
+        for(int dy = 0; dy < 4; dy++)
+        {
+            ivec2 texelCoord = ivec2(blockOrigin.x + dx, blockOrigin.y + dy);
+            // invResolution is used as a divisor here, i.e. as the size in texels:
+            vec2 coord = (texelCoord + vec2(0.5)) / invResolution;
+            vec2 pixCoord;
+            vec4 offset[3];
+            SMAABlendingWeightCalculationVS(coord, pixCoord, offset);
+
+            vec4 weights = SMAABlendingWeightCalculationPS(coord, pixCoord, offset, inputTexture, samplerArea, samplerSearch, ivec4(0));
+            imageStore(imgOutput, texelCoord, weights);
+        }
+    }
+}
diff --git a/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_edge.glsl b/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_edge.glsl
new file mode 100644
index 0000000000..fd5d971542
--- /dev/null
+++ b/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_edge.glsl
@@ -0,0 +1,24 @@
+layout(rgba8, binding = 0) uniform image2D imgOutput;
+
+uniform sampler2D inputTexture;
+layout( location=0 ) uniform vec2 invResolution;
+
+void main() // Edge detection pass; every invocation handles a 4x4 block of texels.
+{
+    ivec2 loc = ivec2(gl_GlobalInvocationID.x * 4, gl_GlobalInvocationID.y * 4); // Kept integer: texel indices need no float round trip.
+    for(int i = 0; i < 4; i++)
+    {
+        for(int j = 0; j < 4; j++)
+        {
+            ivec2 texelCoord = ivec2(loc.x + i, loc.y + j);
+            vec2 coord = (texelCoord + vec2(0.5)) / invResolution; // invResolution is used as a divisor, i.e. as the size in texels.
+            vec4 offset[3];
+            SMAAEdgeDetectionVS(coord, offset);
+            vec2 oColor = SMAAColorEdgeDetectionPS(coord, offset, inputTexture);
+            if (oColor != float2(-2.0, -2.0)) // NOTE(review): (-2, -2) looks like the marker returned in place of discard - confirm.
+            {
+                imageStore(imgOutput, texelCoord, vec4(oColor, 0.0, 1.0));
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_neighbour.glsl b/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_neighbour.glsl
new file mode 100644
index 0000000000..2e9432ae65
--- /dev/null
+++ b/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_neighbour.glsl
@@ -0,0 +1,26 @@
+layout(rgba8, binding = 0) uniform image2D imgOutput;
+
+uniform sampler2D inputTexture;
+layout( location=0 ) uniform vec2 invResolution;
+uniform sampler2D samplerBlend;
+
+// Neighborhood blending pass; every invocation handles a 4x4 block of texels.
+void main() {
+    ivec2 loc = ivec2(gl_GlobalInvocationID.x * 4, gl_GlobalInvocationID.y * 4);
+    for(int i = 0; i < 4; i++)
+    {
+        for(int j = 0; j < 4; j++)
+        {
+            ivec2 texelCoord = ivec2(loc.x + i, loc.y + j);
+            // invResolution is used as a divisor here, i.e. as the size in texels:
+            vec2 coord = (texelCoord + vec2(0.5)) / invResolution;
+            vec4 offset;
+
+            SMAANeighborhoodBlendingVS(coord, offset);
+
+            vec4 oColor = SMAANeighborhoodBlendingPS(coord, offset, inputTexture, samplerBlend);
+
+            imageStore(imgOutput, texelCoord, oColor);
+        }
+    }
+}
diff --git a/Ryujinx.Graphics.OpenGL/Effects/SmaaPostProcessingEffect.cs b/Ryujinx.Graphics.OpenGL/Effects/SmaaPostProcessingEffect.cs
new file mode 100644
index 0000000000..1ad300c888
--- /dev/null
+++ b/Ryujinx.Graphics.OpenGL/Effects/SmaaPostProcessingEffect.cs
@@ -0,0 +1,261 @@
+using OpenTK.Graphics.OpenGL;
+using Ryujinx.Common;
+using Ryujinx.Graphics.GAL;
+using Ryujinx.Graphics.OpenGL.Image;
+using System;
+
+namespace Ryujinx.Graphics.OpenGL.Effects.Smaa
+{
+    /// <summary>
+    /// SMAA anti-aliasing implemented as three compute passes: edge detection,
+    /// blending weight calculation and neighborhood blending.
+    /// </summary>
+    internal partial class SmaaPostProcessingEffect : IPostProcessingEffect
+    {
+        public const int AreaWidth = 160;
+        public const int AreaHeight = 560;
+        public const int SearchWidth = 64;
+        public const int SearchHeight = 16;
+
+        private readonly OpenGLRenderer _renderer;
+
+        // Preprocessor preset defined for each quality level; Quality indexes this table.
+        private readonly string[] _qualities = new string[] { "SMAA_PRESET_LOW", "SMAA_PRESET_MEDIUM", "SMAA_PRESET_HIGH", "SMAA_PRESET_ULTRA" };
+
+        private TextureStorage _outputTexture;
+        private TextureStorage _searchTexture;
+        private TextureStorage _areaTexture;
+        private TextureStorage _edgeOutputTexture;
+        private TextureStorage _blendOutputTexture;
+
+        // One compute program per quality preset for each of the three passes.
+        private int[] _edgeShaderPrograms = Array.Empty<int>();
+        private int[] _blendShaderPrograms = Array.Empty<int>();
+        private int[] _neighbourShaderPrograms = Array.Empty<int>();
+
+        private int _quality = 1;
+
+        /// <summary>
+        /// Index of the active quality preset. Assigned values are clamped to the valid preset range.
+        /// </summary>
+        public int Quality
+        {
+            get => _quality;
+            set => _quality = Math.Clamp(value, 0, _qualities.Length - 1);
+        }
+
+        /// <summary>
+        /// Creates the effect with the given quality preset index and uploads the area and search lookup textures.
+        /// </summary>
+        public SmaaPostProcessingEffect(OpenGLRenderer renderer, int quality)
+        {
+            _renderer = renderer;
+            Quality = quality;
+
+            Initialize();
+        }
+
+        /// <summary>
+        /// Releases every texture and compute program owned by the effect.
+        /// </summary>
+        public void Dispose()
+        {
+            _searchTexture?.Dispose();
+            _areaTexture?.Dispose();
+            _outputTexture?.Dispose();
+            _edgeOutputTexture?.Dispose();
+            _blendOutputTexture?.Dispose();
+
+            DeleteShaders();
+        }
+
+        /// <summary>
+        /// Deletes all compiled programs and forgets their handles so they are never deleted twice.
+        /// </summary>
+        private void DeleteShaders()
+        {
+            for (int i = 0; i < _edgeShaderPrograms.Length; i++)
+            {
+                GL.DeleteProgram(_edgeShaderPrograms[i]);
+                GL.DeleteProgram(_blendShaderPrograms[i]);
+                GL.DeleteProgram(_neighbourShaderPrograms[i]);
+            }
+
+            _edgeShaderPrograms = Array.Empty<int>();
+            _blendShaderPrograms = Array.Empty<int>();
+            _neighbourShaderPrograms = Array.Empty<int>();
+        }
+
+        /// <summary>
+        /// Compiles the three pass programs for every quality preset, baking the given size into SMAA_RT_METRICS.
+        /// </summary>
+        private void RecreateShaders(int width, int height)
+        {
+            // The sources do not depend on the preset, so they are read once outside the loop.
+            string baseShader = EmbeddedResources.ReadAllText("Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa.hlsl");
+            string edgeShaderData = EmbeddedResources.ReadAllText("Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_edge.glsl");
+            string blendShaderData = EmbeddedResources.ReadAllText("Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_blend.glsl");
+            string neighbourShaderData = EmbeddedResources.ReadAllText("Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_neighbour.glsl");
+            var pixelSizeDefine = $"#define SMAA_RT_METRICS float4(1.0 / {width}.0, 1.0 / {height}.0, {width}, {height}) \n";
+
+            _edgeShaderPrograms = new int[_qualities.Length];
+            _blendShaderPrograms = new int[_qualities.Length];
+            _neighbourShaderPrograms = new int[_qualities.Length];
+
+            for (int i = 0; i < _qualities.Length; i++)
+            {
+                var presets = $"#version 430 core \n#define {_qualities[i]} 1 \n{pixelSizeDefine}#define SMAA_GLSL_4 1 \nlayout (local_size_x = 16, local_size_y = 16) in;\n{baseShader}";
+
+                _edgeShaderPrograms[i] = CompilePass(presets, edgeShaderData, width, height);
+                _blendShaderPrograms[i] = CompilePass(presets, blendShaderData, width, height);
+                _neighbourShaderPrograms[i] = CompilePass(presets, neighbourShaderData, width, height);
+            }
+        }
+
+        /// <summary>
+        /// Compiles one pass (shared preamble plus the pass source) and assigns its uniforms.
+        /// </summary>
+        private static int CompilePass(string presets, string passShader, int width, int height)
+        {
+            int program = ShaderHelper.CompileProgram(new string[] { presets, passShader }, ShaderType.ComputeShader);
+
+            // Uniform locations are assigned per program, so they are resolved on the program
+            // they are used with rather than shared between passes. A name that a pass does
+            // not declare resolves to -1, which GL silently ignores. The values match the
+            // texture and image units that Run binds.
+            GL.ProgramUniform1(program, GL.GetUniformLocation(program, "inputTexture"), 0);
+            GL.ProgramUniform1(program, GL.GetUniformLocation(program, "imgOutput"), 0);
+            GL.ProgramUniform1(program, GL.GetUniformLocation(program, "samplerArea"), 1);
+            GL.ProgramUniform1(program, GL.GetUniformLocation(program, "samplerSearch"), 2);
+            GL.ProgramUniform1(program, GL.GetUniformLocation(program, "samplerBlend"), 1);
+            GL.ProgramUniform2(program, GL.GetUniformLocation(program, "invResolution"), (float)width, (float)height);
+
+            return program;
+        }
+
+        /// <summary>
+        /// Creates the area and search lookup textures and fills them from the embedded resources.
+        /// </summary>
+        private void Initialize()
+        {
+            _areaTexture = CreateLookupTexture(AreaWidth, AreaHeight, Format.R8G8Unorm, "Ryujinx.Graphics.OpenGL/Effects/Textures/SmaaAreaTexture.bin");
+            _searchTexture = CreateLookupTexture(SearchWidth, SearchHeight, Format.R8Unorm, "Ryujinx.Graphics.OpenGL/Effects/Textures/SmaaSearchTexture.bin");
+        }
+
+        /// <summary>
+        /// Creates a 2D texture of the given size and format and uploads an embedded resource to it.
+        /// </summary>
+        private TextureStorage CreateLookupTexture(int width, int height, Format format, string resourcePath)
+        {
+            var info = new TextureCreateInfo(width, height, 1, 1, 1, 1, 1, 1, format, DepthStencilMode.Depth, Target.Texture2D,
+                SwizzleComponent.Red, SwizzleComponent.Green, SwizzleComponent.Blue, SwizzleComponent.Alpha);
+
+            var storage = new TextureStorage(_renderer, info, 1);
+            var data = EmbeddedResources.Read(resourcePath);
+            var view = storage.CreateDefaultView();
+            view.SetData(data);
+
+            return storage;
+        }
+
+        /// <summary>
+        /// Applies SMAA to <paramref name="view"/> and returns a view of the anti-aliased result.
+        /// </summary>
+        /// <param name="view">Texture to anti-alias; the intermediate textures are created from its info</param>
+        /// <param name="width">Not used by this effect</param>
+        /// <param name="height">Not used by this effect</param>
+        /// <returns>View of the output texture owned by this effect</returns>
+        public TextureView Run(TextureView view, int width, int height)
+        {
+            if (_outputTexture == null || _outputTexture.Info.Width != view.Width || _outputTexture.Info.Height != view.Height)
+            {
+                // All intermediate textures are created from the input info, so replace them together.
+                _outputTexture?.Dispose();
+                _edgeOutputTexture?.Dispose();
+                _blendOutputTexture?.Dispose();
+
+                _outputTexture = new TextureStorage(_renderer, view.Info, view.ScaleFactor);
+                _outputTexture.CreateDefaultView();
+                _edgeOutputTexture = new TextureStorage(_renderer, view.Info, view.ScaleFactor);
+                _edgeOutputTexture.CreateDefaultView();
+                _blendOutputTexture = new TextureStorage(_renderer, view.Info, view.ScaleFactor);
+                _blendOutputTexture.CreateDefaultView();
+
+                DeleteShaders();
+                RecreateShaders(view.Width, view.Height);
+            }
+
+            var textureView = _outputTexture.CreateView(view.Info, 0, 0) as TextureView;
+            var edgeOutput = _edgeOutputTexture.DefaultView as TextureView;
+            var blendOutput = _blendOutputTexture.DefaultView as TextureView;
+            var areaTexture = _areaTexture.DefaultView as TextureView;
+            var searchTexture = _searchTexture.DefaultView as TextureView;
+
+            var previousFramebuffer = GL.GetInteger(GetPName.FramebufferBinding);
+            int previousUnit = GL.GetInteger(GetPName.ActiveTexture);
+            GL.ActiveTexture(TextureUnit.Texture0);
+            int previousTextureBinding0 = GL.GetInteger(GetPName.TextureBinding2D);
+            GL.ActiveTexture(TextureUnit.Texture1);
+            int previousTextureBinding1 = GL.GetInteger(GetPName.TextureBinding2D);
+            GL.ActiveTexture(TextureUnit.Texture2);
+            int previousTextureBinding2 = GL.GetInteger(GetPName.TextureBinding2D);
+
+            // Clear both intermediate targets. The clear color has to be set before
+            // GL.Clear, otherwise whatever color was set previously would be used.
+            var framebuffer = new Framebuffer();
+            framebuffer.Bind();
+            GL.ClearColor(0, 0, 0, 0);
+            framebuffer.AttachColor(0, edgeOutput);
+            GL.Clear(ClearBufferMask.ColorBufferBit);
+            framebuffer.AttachColor(0, blendOutput);
+            GL.Clear(ClearBufferMask.ColorBufferBit);
+            GL.BindFramebuffer(FramebufferTarget.Framebuffer, previousFramebuffer);
+            framebuffer.Dispose();
+
+            var dispatchX = BitUtils.DivRoundUp(view.Width, IPostProcessingEffect.LocalGroupSize);
+            var dispatchY = BitUtils.DivRoundUp(view.Height, IPostProcessingEffect.LocalGroupSize);
+
+            // The images written by one pass are read through samplers by the next one, which
+            // needs the texture fetch barrier in addition to the image access barrier.
+            const MemoryBarrierFlags PassBarrier = MemoryBarrierFlags.ShaderImageAccessBarrierBit | MemoryBarrierFlags.TextureFetchBarrierBit;
+            int previousProgram = GL.GetInteger(GetPName.CurrentProgram);
+
+            // Pass 1: edge detection, input -> edge texture.
+            GL.BindImageTexture(0, edgeOutput.Handle, 0, false, 0, TextureAccess.ReadWrite, SizedInternalFormat.Rgba8);
+            GL.UseProgram(_edgeShaderPrograms[Quality]);
+            view.Bind(0);
+            GL.DispatchCompute(dispatchX, dispatchY, 1);
+            GL.MemoryBarrier(PassBarrier);
+
+            // Pass 2: blending weights, edge texture + lookup textures -> blend texture.
+            GL.BindImageTexture(0, blendOutput.Handle, 0, false, 0, TextureAccess.ReadWrite, SizedInternalFormat.Rgba8);
+            GL.UseProgram(_blendShaderPrograms[Quality]);
+            edgeOutput.Bind(0);
+            areaTexture.Bind(1);
+            searchTexture.Bind(2);
+            GL.DispatchCompute(dispatchX, dispatchY, 1);
+            GL.MemoryBarrier(PassBarrier);
+
+            // Pass 3: neighborhood blending, input + blend texture -> output texture.
+            GL.BindImageTexture(0, textureView.Handle, 0, false, 0, TextureAccess.ReadWrite, SizedInternalFormat.Rgba8);
+            GL.UseProgram(_neighbourShaderPrograms[Quality]);
+            view.Bind(0);
+            blendOutput.Bind(1);
+            GL.DispatchCompute(dispatchX, dispatchY, 1);
+            GL.MemoryBarrier(PassBarrier);
+
+            (_renderer.Pipeline as Pipeline).RestoreImages1And2();
+            GL.UseProgram(previousProgram);
+
+            GL.ActiveTexture(TextureUnit.Texture0);
+            GL.BindTexture(TextureTarget.Texture2D, previousTextureBinding0);
+            GL.ActiveTexture(TextureUnit.Texture1);
+            GL.BindTexture(TextureTarget.Texture2D, previousTextureBinding1);
+            GL.ActiveTexture(TextureUnit.Texture2);
+            GL.BindTexture(TextureTarget.Texture2D, previousTextureBinding2);
+            GL.ActiveTexture((TextureUnit)previousUnit);
+
+            return textureView;
+        }
+    }
+}
diff --git a/Ryujinx.Graphics.OpenGL/Effects/Textures/SmaaAreaTexture.bin b/Ryujinx.Graphics.OpenGL/Effects/Textures/SmaaAreaTexture.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f4a7a1b417766c12bbac4e4bdc56796f18538bd6
GIT binary patch
literal 179200
zcmdSChkqN_mHs{GL?MVqfW3FHfnW!V2!g%$UL;B)B~rcj*s?5HvMgIJaxaPFIB^oE
z$4Q**W;dH`Nwy@L&2IMd`(NJY+?hck0nA9^T7EGfTO<ZE!~4Z^&bf2v-g7uk;*~A2
zUt;%T9?tD)?P&G0|NLS9($A5<2QGCi`6oL@`~&_ze{V;RzuVu1Ex@+Uj!wU@sjua=
zx?8JTy{&bv4Xur>P1u^T`PkOdDsFqO$-AyZ^Es}aGe~|W&IcazaIQlpzkNAFWxdq_
zZ=fzv-`U`A>}>Kk`J1u%XzTF#TlkGzGW9t-GduG36t$POm$z58RkeBAYTCTmYTN3t
z)weZZlY8AbP3LzV1VNSLr(%BKF%Or`?}%~6Ig;6**IU$6+U*Wh1gZkn0Z*W&)7x3w
zS?jOstoPUZ8~hF2hITS>z|oT$$n4DN$nPlHQ@p3N-3^iz?Un6S?bYqojOVWLJIO6_
z6`Ud_Ac-B1c{s=0F4<C~y>9>gq;@57+A)$kl-Zxtm)}#^UEE#TRn}Dwl9hp~&T69B
z37Xsy^@4uPJZSHAb*F)0wm;8b&{5b?yr*PO>7KG6Nx9dRXg<fC=MHndT%P3TVtn8+
z50|aKGphaidGn-w%r%@gnAw-zm)A=ai@Hm?z%k$s@FXj_<H}Xtv~e_P(AMYdN$F1S
z$_!+8<_Z)^jXcL))89qz1UJVubLlYvNxc1-hs)-7PPw67G0rAU+QyPcQU=opGW)ap
za(e}eUBz7jNA9F@zh)s}(lnYhWb1eIx_VN1hJhS^kRsH$Yy2*AXSg-4pR4Aa5~!l?
ze$2yV^E<EH)a*-GFwGD_$FOTCl`+if%jpHh?t*U8VsTe7cUo~!y{wxtOeBt2hHL|l
zK4))AcUl+eFe{M5l$Z~T-bH?7g42j1C7eAfqPTBfW)9$y50}mFg5rq!fNt3^pE%73
zjyQ)?2Ga&ehgp5u#1QtJ-(AR^Q5;h3)y^kO8^;BL{Z6LBw5|-sFq=`_HGWsP3*1p|
znH%65xO~nM7gm&=k9j!wg~tssj_}bV)Gze6UQ`@Yt!q~kmW*?W)0PSAm}4}VsgN=3
z1w&qm+*!q@YE84Go7GR6#?8>6b<p0Q+~?{|VU{d1lzZJMO^$r)3U`q^$?fMRIX`ST
zn=?o5mip~650}mFlH!E&kY-J{qF*r0nx{Z;+zx`I!qmaE{)~alKE{wc$89MOs8^uC
z8N;M$EJP4?OqL992!>s|#_uY3i95|5;ufGlALm9KNt9w1zW*@~m(LG@VN<<dyEkFU
zFmIX;5`-PQhEfL7uo1&d@H?tJsNSnt)XnLqjT4DuNh6kFXmG&MpWG{`5Da&b-(~JB
zcZA!^O>hCOjw|L;Ia4^H_&;Ce9_w(~{4R5+lv}EUP~ftD(Ku_G5eYg*V8??@g~Sm2
zjwv@_!OKj6Qy@4RMv$TjRG8YmtNd_H?f`l*gIpVII3JPN7@t~bFY$Yf!ykg*Y2{JX
zhGswcK*PLoHZiC{c)}sFW5y8S>^LIGe)X!BS0KD#h#)+npu%0`cLmox!EGRhjB{OF
z6IagVaLID-im!5yb+~MPR}ff_DG#X+X!ns1G|rpC2-=}RiY8!4ZBZOnkp(Y7fwOv2
z;JA4-X_#3t#S%mlXYa1^yTF~|Ho1M=3>3JBtK~|$Ec9jcA(h4ciTFLn;Sa&@tm3%x
zuxeeiMhXPMInzwylwidph$UbM72=OMq1b{4+^1P#3Y;+z!7(CeB|pe}!v0<5cM*N)
zqp;uwZj9?j6shKl5JzlWLTqm09^&^Hhd&g*6Yzj!!F$OE!iGW641(5i>zHlKK1!Z2
z8T?Kvj*th01uy9qpulP4q$!M`jS-a1FB?7MgXp20M>8*DyMpa%*mi}R<&MLJT|gX;
zwqFDl<KJwF^GofQ+m0*TImJok5!E5}K`nYQ2^2+O!$dG?(mG}xX9P!y-zo4rq&ff#
zUJfd7GLa&&pg|%i_qu4d0R$Iuk8uJjJCE~U3?oJK0^Ah04*i{sx<3Xt@okL%#XY~&
ze!1;{mj%D0%z_a^-~$&8<OOG_FO!7cAWzWF_#Fj5-hxF1GB1cY!V?^_4a&VPYo`h$
zRK19=YoJJ+PNT<iPGET)^l}h)N4fc^`$bSO{$=BTan3KbU+#U>Rq#6new(o11F+x~
z){kK}j7U5Q4e|uR?=*Ta6hYQBt5Dzq`hgThreMPqi3NgkuPZcLff1@^j1o*E%I-&R
z`3QO}ClEExfa5yoCF1Tk`u-T)#J7d<znJHj+Ap^q;D@1sW8?wV2Q>SLpnj3!F!co~
zjtB%9zvGHc<vRK>dl5r;1yU3dy&xkf_qt*`@ozffLObp<CJ`t0B3f)9<{riMjzW7S
z7(tD`KkNk2eheG`i+a4&{@f+ghW3d1gz}{FwBn57tl}KLQL+v4G@<RHe~)X*w3={G
zb69;ubxe6ec}j6gvHb`->XN)o?sX&D>;$I<+~*G9USkeQ+y{!6V9)bx{7->?^UnPd
zC)oK*#Q0y7<E8ey`<xSr3;I>ve$Bf2kZM!41tUGCJgzu_?Iak15-jq}jI*LV-Dm4c
z95PNM%;*+0%bLCFHT412x^hE#NV%!pQXEknRUDH?kGmxIy4zZ8$K6FWdME+(^v2QO
zTR_YWAOG9Af7=PR{~CEXdX3x8FSXy(nK|GXGfx{A5|(v)wQHIK>UH&o>X7O%NFKq)
z(>%V{<V!Cp$*gfUTRY4>rUAo<enK~;oz={17B$Q2RW)d;_A3v{<tO*L+uMr~s%(rB
z)WPR<f#DeVMUMY%-@oGok#B52eEa#O_Io?>x>E=3qe+vd8N+<Sl5R!2SF@(suRe&4
zI1)*yYhBl6tH~@X%BoDMbu?SslLDq5<A7mEKc*W8#~BSHsa}%HPwsUiTg!ek!_yT*
zg-y8E=w`PMk>h`n`=gv7{H-YCe-Xz^?XTNY9LVf-4cbPNCK9KOv-<gjMcuM?6%_Yr
zV4sZRx@u!nGnD9aR%R6zX1mio&U$;Zr7g*y*k$T5^cx2CBRb+ZshyU~PwsWY+Zy~i
z5o0X*xX-PHB}W^;3*R3+DEbf44o5T(o?mKzLwjjwZg*OrbI3Mo8BZh?&M}5d+La*1
z{ptfp)f0veYi)`<yPzPuG`-U0O>VF?TUwL$BzBs*OudZaux>;yKe^Y9-fE0cInnDY
zg$kn&;Dz?bIDz<f^ut5vm)hUhUfz)($m~h&cMRD^EaT=$)3h<j5K1I_h8B;jW)phM
z%??jmDG272WK^VjoV5;~VY}IH3YfZ$(4t&^a<414bH~3%jIm_lJ~!3?p16O<fst>-
zIvgWl?EF&uo7yWoiaK+;(|cV5jNq7gB5}${mOQ6lNLbJ<F@~!rlnc5+Q@gd^S(#p%
zQ;?rilIc#Za@9ENY{al7somUR@|(Kk@{@br$ks;uW<az|XZ)fM;PLw-4%zujobf+)
zywv`twyHfP{=6<Iu+IsC))C7XJmI8i3Mv#C!XutiE^9^%fuv@;H>Dz@I43_pr#Q1L
zts<q`>2=iE8X3d3qz>8qWL|e?i^Ja0V~i!<03Pj^c*Q#$BVabE{mpHjJ*EEqKsFSZ
z(w{tNCxW9%<G9}^6$TleR_xPEB=nfttPRPYRCgvLSd>|oUY=UzgeBXFp)biNm!I70
zN;E%#@rylxCpm&ed}1HYB}naWZmVf`cNBK!c4c;_^}2}QFi#M6Oe!Qxh7Qjt_N%9L
z14e&Rv#r)ym0Ffr1P$gEWtOJ9Q!Amun&dhg7|Q1-@46EFIL7ZW2Jnbayu+pT``WyS
zAjST?Kz3IKY&eV{c|sz{Ecq-Ri_dC@3|)zBmIg<St0D~qbMx{*uq4BsTAo_%WR@(K
zpWN$8@_WJ`pmtQi0~Q2wm;zJC3z7zhVa4PL$&R7IbBcA<yk<<_YuW<^);X(F%F|0l
zg2kEcG^o&3?eshhKRMTZ!gwSv?`x}VuiR5g797YT1(FvO2wG{hi98`xcuujQT+)o|
z`VF0lt(Hcc7dGrpFA)f4!4rZYqKSNd@~$h{fG7NcmbSX~sy$^LMX+E}AfgC)!2vv{
zpyw4L!HIL+Vdb)VN;hce;uYxdI4e`#8O51JdAWJHg&;_t&@Gpr-0MDJJd&5Ec2u{!
z_Z0gJcm*Pgz=kP~gb2bDp651|tLho;2rSrdZh-=8VZ#-O#6?+!B0+e<hv6sZx{?ie
z!XId9s|UaGJtZB5EQUaVybW(7c%D0=+^e3|jKP8frgrjyjv8lms+%Hlh+wgNe)6vS
zgz-pTz6EoUJ%}Ksu;6@HFnk~>kZiagab(aYSTXTiQ_pKAV8Pv{j--~PMv5Y2!)b^k
z0>SJe`TXQvSF!<5_yfeR1`(tTeHgOfZ1RCUf(=s~8R7|^=Z-24s1`Mox?z2<vD4I+
z<g?Wyid4H`!xTq&f&~x5PtJ9pFdoUv6F)CzcDn@&<`tOUOK})pkm3j<c!4{nIH+7w
zPiseD!Cg?Gj}(X~Qk{aZL2)Fz0Dkac_{q7hWCNb?2ljDW?13OXfaV{qh8{y-!Qig=
zsQM}HY3@4r6#k=)eh%Nn_pjrTe*h23_Tr%!J={6Y9+F}B9}gVqk)!x%<vc#MhAYz}
z-Dp>)$K?E#pD-TD%MTi7v@5DLJjB>!4=_%E<|%sM!CHKlJxHQQNl(k<$L(>s%1u69
zmu5&kp_)-HDwh>|6>Exvigm>X9z1S>+7XcL$Qd%uYL_wOPgmYj98-i}nW)mksPmG4
z|C9bePx46Ow0=&zqS?nD2C+v$N<0Ws&?BVd3ZCX^L35(cV(X8aGL@&N7u#wQn~fa_
zJ-R{7h-N}Pt)5lQE0>hZ%2g%N+^^^;>v4`GPU;ufmG|Q@RQQ$Y+WeKDG#{z!cV+fF
zM$MDPISkulz!?ud@JLOyp*&2F)Ue_45hxy4@HQ)h-)e%-R+5=n<gB#Tnww4Sh5#rI
zXooaonsN1%dPY5~o>wg@{ncG~a5Z9{Vpm?qAZ_@SH~9xwxUSUS|D^W^@_JK;f{%}g
zA^-S@Q4DfCqLj_=pr$La##xk=RhZ&-c&znF&89Y^-_WJ+(e>*FwIjq)GpX+I(v=78
zW9-4zT-bvvy7Ic}knjyYX+Bcd?=0%c>~{^>&<^(aXo0n4Rf~s5f+C<sw1$rsGW2&?
zJ!EXQR;CnYXBVWEIxFp7YeQ0VVykJ7p%WB)iK1>;+fheX?sE;ZD?^1C?52l*{FT{*
zWwL|ESlK4k^4;0rS=yD;mo|_*BD7=HFdu9QJ%Zbh2f_zb2UW899Z`)ZbeL-$?z93B
zOfPX&BztVN7J(s@7%+50iyifyrQJEbX@l&_?9pG?l~;pT-knzd-<_q?-Id>)*_Sfp
z071)ydCD|noI^_%NeyJpyb?9C`5jZv=z2`eR*$nZy&xwiKcgho?W}Ut*y=2eNzLXK
zQ@g1H6gwK|%Fuq)D=+Kt0F_-?VtO8zogIIApcFZ&>4Q?|@2m(Ec4zmd54eULqd|hR
zMyL=B;Sc$iXl3)mgC@;@p*^YIUYSywkq?5I#i?bk3TL&=%NY8=u+7xbNLTL4?qgRT
zv5Mave`T5s!CzTw`XASBFvOc4D8)~5`k=)5{Z)bD?mXxqb<jC%AF+;;1{0@EvrL6x
zsH2vw2>ir;i}){3HPo-Z&SCZ>9yBEcOwAT=az$!MCL>ssR+dr@6~dBjbxDo5cj;)N
zD|hAfvMb}ei@5R}=63N{ruN5~p2uNsYP(7mdwQT0KgsEX66g0<2THs0yGaKrgUN`*
z!FIro$*T)3;Yo?y#DAmm+p8Ye^%&cd8f+e?JGEFKm|hAyu5?!0Vac|7;)g2-@_U3U
z4@X>ihSxtB^6if~J&!|oAM1Y=idfSFrT9rsACx%1za~)DRoI;i9i$Do29rbWfG316
zoF}gihO+sc#)GCQZNI^v*lekFRJqF1iZb(abMrEb(%}iA!s=wQ<c?;#GJJ<{<)Mfx
zlm7WDhuR-?dS2N2-x&KtCy4r!<n+Pd1i<L$_t$im2Z~@vqyw_yWXz@KHHcU;P0@t4
zL^i)O7^a!k4C%WtMAKlaaaN?1r59!9F@hPz7%yZhbPzwfayPp&^B!SW#xxS)%HfTU
zGChyO!?WP}->Cb;PY~s2$?1a<=l6Tz$4d|ib4Y>xDIn+w5u8k<c0db8+5FBb)|K;`
zQC*K=4-{But9F*BmIeuC!V{*t5lxUWA?(WNK}B475<Zx}a%7WtOwZ#Y=l@07A9;cu
zzeUdPi*kI#2X^lF*LGII1CkwO^`%1xE?AOdlr(4^PvT<<wL~_*bKHh<k$=!+YJ~#5
zjw*P;k_^&dUT#()2*MMlfM3{^5&t8ujNUwpVlz8dvhA4Ow{3#nj{Q4`MZU4^@a@z8
z==+J7U%37JXhjwL5bUTY8~s4ifeUMi>^#9xJ061HA$riH9fSq9n@NFq(1amE3=n3C
z1mOu8zpBo%F5$}XzF}9!dI*7FWNYcSi0OG8n?4XR|8Lv=ohOKRXXpIBZO8BUK;(X8
z&ZtKAg5s_MQebZe)&P<v5kcE1t?Qt6z!UQP#Q(>MzvuV|O_~v1ufcDkVg6bN*|0mc
zD6=p~Fa!K><xb(sJz0I*u1xBuD@Qa|_$6$59`W0;{x@QOv=fBA7v=Eq>3@>v_t%48
zIXqw?d2*(})WH<kFnMzj6x$(^i`1d`u6SRJ{vWOXjd;)m1>!*yY?vZ(kRbTs%7`HB
z%IL+A_Jdc(y^wI_u%?Fn7&g5x>IA*8{b5H%`!VX_q3M4#KP1Zf;?Vh#wZ(%7QVKgl
z3<>IhdBG8Ts2!2q;^+?#niTsmRbgB=sP8s*Fa=T+sdTwn9N`IOceI3G8F#{>0@;;G
z{XD^FZKn5v)B9pf&<pO5bpql27>CO~KXTsGfL|FhXykY2hIGJem?E*z4pC{59L0CT
z^hXbx_A3|EliFd#U_U)*N~%K?sdnN)6N@8-+3<tl7jb3up9BTcm2p2JTse00h3S1U
zC+NwV{s(!*TK_BW{BrYO?Uk9oN!=<l|CP+VQL*`PG(C@>e-&f>uZ-z`;1_%SublJC
zT?e!!H-FO?n*R#Z`{GW}lbrq+|8QCWD{+3g{AA~E`l9n+Nlnm;Ct&Q8JY3=fa`RBh
z&foL}=f9E~pOe6c@k5FA#S$lwt3TQKo4)A$R}$lM63qv^`0IbAj+e_%cK)U>I{%f#
z_?%SxrPlXK9WR%k?EFn%bp9)e@j0pXORevfI$kb6+4-Bk==@g_<8xB&4;~(CeX+y|
z<gTOV^Ea{FCGqn&ebM=^B*y0?njb!V$UVyXVcGoT=WqIA^Iu7f&q?u%IGo$L{#WXF
zx%?g?f72J6|4L$fPOAOe4(E0b0LDLqe10<XH+|9huO!Cj;<rA^@ZpXD#`tHD%kLrb
zH+`}Buc8d#J?{8l<N)Aq@spXq>5I>Q6?c3te(UM}Ez03s_yA!1GsxH9S!7U@mA~nW
z&VLm(f#2hf|AhtscZ;9w{7t)?|0@2UPXdZDfcLoLf5;m}>x(5`^7Z!+`I~k(|5f}J
z(D=yz(fA)fz!?AXa`{E(Z$dvhcK)VQirvnC6|V`zFXjT?#~uHZ&VVQP{_UaiH|>7@
ztJnl3#^+)m$nSZ~!;$}w6v(SE^zY96#aHnqpGF47EhL0N7F5bYDP#c@a<B;*u;^pR
z;Q4=%|0+6)r08$UeYZ;P2V;Cr?(>g>*M>Iw7JH_;#=3?BgPr}IeVx6XJ^mhlH#VvS
zBGdwzxmGz_JJitE)YIJM3-|&powNnZ6VTViwl}T~&GrrT^mX-gbp-;Qfll`F^REtn
zaEtM0hu+TSG@)M}{-+i0@xB1L`un=`bpF25xr&ME(VF4f!MgtXzJ}h$-o~D$ZrYl=
z*cR~RR~6QkHB>fvn!UbSUtLQBwnm|j3SUWuucdOXc*QeaH{LkXG}t`g>+|)t^tSY1
z>u&9C?P6P?l{QHtUgUFnj9L0`a4V1X1<2+13&l(N^VUt*zKq43+5D-(@shE!5%+NU
zP~~9NKy|;T&(l}a>+SL873P-}RhD_my_L1q^`3^B25)0+V_j2yQ+;y-w#Md0UlX>A
zX&VJAW%HF&o(b<*?MU5F{b0jD<3MA7Qy;e8<{oU_zHXl+zb38{<0g;%`Zjk5YoU~n
ze7xM}AHU_kro64YXgX%yNZyySoW77bn=_p^nLklDRy0~NTsl-X<Q^&?EYHi&D=sW6
zE-$NeS69?jdaG+awKa9#dT)JgLv2G{V|^n?V!Px#l6|0HrDVZ9TQOBN=^3vX^^VjI
z*A3MT)(_P8H}p4R14+q-wIG*R0cVx`T#OI=kb4j7p}dbQVY&MIJ@+a1y6UF(lHrtj
z%ern~bFQQ;rp;%}W=-cz<xb{}7mO8-7L63=<>eI?6c?41xJxVCRpr$co~oK^Z?%_E
z<T=*Y)z>v#PCk*ok+W8?QoL9?S3W}wt0z2THKR2nBFDM`c!?Oz4!zyVH6jyMChl1u
z>Gf0YTigvK!Yg^CqdD2}_usgma$n`{t8Qp7>(3aEnGahJ+Sig-T+69MaVB#*dn%ha
z;#H7eR9I3}3WDYCit?(8>Wb<rPc@@h<E^RnGK!ZSr&71F*7HEHXt89zY?c^SPE<`)
zk9)>EBSevLlx$WTawmGYLM{b&T#xYjIrly8RjhOD<#He4X!-p9$o&%E)LV)dRL^KG
zC!95$Fl{9rvL3XpIaZy^uBFt4w7K+|jOi@!%g=`b3rmUxg6@h6o?$i5(BrM~dc9Zd
zXHt%49LheBx36Fo1Q$x@%4T?mL=g-}JtN>K*{C*-EZB`G;*ggyJMxUb;K+iXLKJD`
zG9q`$`c1C={)ziL?)y;SYs%+U*ELsk=k%wH#}W@GZSV|}SDec!i>dQzj9)=PVPSDm
zDI@51S3rf8#IUlus=8WG;uZT@*YWhttc{!ld3!-{33dz>y2BWfJ$ob@LHruHDkv}=
zUQqgmW7`Qo=YGO{#Jzw?z~lHDQn&4u{#EY%+kbNZfGd5>z0KWM-c~&gg6H+8NrmP^
zV0gf`W?yxHp$jQ>@?pV+1x1C$#k>aHZrCwY7-9&DS8V5;CsL0<fd_N;=L-Z&7RqMJ
zX25W|a<X!QHtFtd8-&xu)o^7nz!Y@mgc<wNE$2V+Irka&3HJcQ5c^@n4<E|q^7}LQ
zJMQP)cc8%6759|4K=7*We8OqNN#jw|CK#>@48e~qm{(wkAXHdZQ69z+{E|<m98KTM
zgaUK-=dbb_q-at$6UI=ILp%62Ba>nU6qt#ebBUZ*@|ds>{F3___g(H??m0FMy#px)
zAFc?HTz>z>{S!Rk54dk|?{KdwUQpg7g4zoSXJE(2O<ReF%^Qqih@YUqk`nmA5_cI?
zSVk%o7+$fRPd=4$Jncy4p{#WfTniDTXcA-?*3{@fwj&d&kE=%}#Zu&mPDAre4@2;m
zP{sGTuW>J780;VtAvfb|kWAVZ_Ls=z_ZRM8xWDCo46T2adlNQ%7X+Uog1WP0$D~4H
zXc76rg5d+14TlkAcFY)FwOvR)<2sRcB>iwEq6oa;zWmkv)k65e;>8j{h0tNxpBnu~
zo?injxE$FOvth$lL}J<Zbid>%f_#H}9le;7+!EIZFPMjiNOE8D`TZUdgtYz<)c!#6
zqVkUNhMEXo)SXQ@tv_izX4*0xHV65U1ydA(4HuQ*g&rX^$VU?p<oTUUJ({+ad5CP7
z;s_(SOa%F80u@TOgZQ<e4_J#BLJG{l-G~KapE3#l3jBVIp2a)dU5w;!AQ^loq?|4L
zu2?R=|K|P`5#$%J;BRpsaBp$1zzg11J;MlI(w!qaW-8?Q@g5KaDT<I63@6B<$yM7$
z$64pelw-UCvkrnF;z&3_M3XtW{F)#e4|*|0Tn=p5f#&NuwIq9d;8z?);P0SKFC$06
zNp6)JfkUl;D~&pJea9)~^ZPgMxA1^J#<fU+Z*ceF1)mEM)PW#W$oTOQL{MN9f>0sj
zhX?`-<`qabOmQTfAXK>H58nO;>5n1^S#SkzBl6Mo6nMczc?5~yPjPScby)Con2m54
ziOu^E&`Z%Bl5<NVpWnYD0{;!7{iobF@ol}$y#^b;i%3iwyrzW)$rI{B{3r(V3WN<8
zGaC+BvHPm^qWx_0=@hbH^kVoZ!pD&af>2U6Zcm$FqpcvtZ|#_NCT_EMe-K+Q^14Uc
zzn%BC-+e}%3!bwc-vC`<$2EF%Nl8$(7ak(&C*<<`AMQU8fq#u?^#ktP__p3f6nPmo
z{5%L==Lu@h!xIMiQ6FZf0)qsvS})npA%ZXqW(s6cM2sW(tA)5T#)jr<K(G?_$+sY~
zm83n&r0wTA@DsIg`~Gdbw*PvJD^@CP2)b3m6?~EBxP~uEqk&f~gMXmEp$XAmyb5T`
z=l4hMpSizh-uAobM}Gj1`#Q7XJBY+Q!OI}HonJ7Dh+dEpEQS7v-&yb@3uX%Bqlg$s
z1cI!+R2iTEDxnrMpasongV>5e{7wr8tH3zg{_ViF{r(;bVdQaoBhHZzMpaN6U4uB$
zRa(Bz5kd55^em?~eWC`js$Td+v)uc)|KW(=KfnY26nEU8a3A8X@h$WOUxF9Bt$a2}
zP~=AuB&a|>ibN2+#`8l2IhhK6=tq+cQxriQ;Uh76geyhC=33#`3K1d1zh7+QAhcJ4
z);VMB51t_U51%4#%;MI?oS-zrZ{{E~JYB=XID*m-;Hi#)U?v50qMVJtq7<IC8XUn|
zF2BEm-@hP&{0e;-vS40;Z@`Ah3qGg3ArKV!Q3Mezm{%Z+BE`j}EE1zfNc_$xpG6;<
zw_sj@f(?st1O&rcorw0)UtcEt^|eeXy|COWw2OngkG4PT1krx{F_dr-g;{bqn^tE+
z&xg)YfN!K6dIlpJ-@1iD8SQwJjr1E4-O~Y_D7v{yZ(d1WVO{}lTJ!17OKd)tO0tdK
zVcR>J&BQTVS8}VXF{Li8CcQenDx(5hd8RwFEUPT56k7>4@j$76SM1%crZg{|CDYB0
zFUc-uTTymVPEk&wutl^v{BJL3Dyf0`TG3K|{4d)6h!gDm<!9(m+(W<54}uPj(V$L*
zv8E!kc|M{>DOXN+CvSl80tlAkYr+795_dRu_L)4cu%V=)uq3}Izc9Z*peT?enqjZc
zxfc|t)T;?o#=*p%B)_%I=Ce0B8j|aswN9_ACdHFdomz!WikrCqn040LZ|_L<IU8KH
zDK)9pX_aXe>E-F=8Sad-jIzv9Y$e#XwOEh4ixkkxW36*TiIL-fJNIup!S-J%9=?r7
z$cqRHIh<2%PSBZP$;2m<R7Sn)uW`5GC8@@3IRZ%%bu86jXyaU5Mq_z(iMyz@u(+V8
z01RnkS|p0$B!3H?KgXR@uB(@|QwgJn0aH(6z}%74W@)whY)!UCdxN7cxy~u_k^1+v
zb<H|u9kd1PdmO&xCTG2?*5wtx1(3v6p6*U}?;Mg#Km(G|N6uypOTaI3{BQgI?ZCGE
z{yF%3h-V<jFn-klf-bd1XE10IaTNzKq~77TxYrOxHc<4Y9a4u*aeJfSaF=c3tj$$5
z?#dE(aS7e5QirX8F$71v!btuBQR7YSCaOqnp|H`SW=1!rA2#$Gdr^F$)4V6CJ*hRR
z1tg`oiTlr44_TM3lhz??kFC?*Zf|ikB{w?jopmlj%_&tWRjHL}+nP(i>fq_@P+<m|
zlF#@>j{imOk8*<Ww`lx|_|W*(AOdR+m!h(2O?sV4WrjCQ7RP7qU}5PR#uwV*6!UR$
zGe1b7^VQT=RF##Nlogj2fnXt2xSb*+`EQ6DA9AnYF8d;$qHn17GKLenQT-rFR}jU3
z$)DJfB=V8^_q_E86u4lWK$71+TbHeaF>D6K27w|tx;!c2Z4Lg+zIiyKG}u_Q@xSo>
z!Gogz!1H+rwsi_N+>M}`!=<Y1T62Qls7X|q`SBTQ`O_HXU*?8Eum;W}17D#LnN?b9
z>#98Em5d;6QyD{^Ae0!U#XlnQeg}QN7rAHPO;2Ki!2uNQS<uXCCs8_Ogr{iiHc4?4
z_g}Ogx2{`PtaFTDzqK1G?69}lTOB?}lcSNR=&W@`Z#BMrBh=x55TcL&h4#ldf%vvK
zesv!Tq%z=q5$05tLz9$XH0Vq;KJz2oalHi_zl7|C3)~Pc=|zCf#usSjTI%XOHI-Eu
z-gIM#1O$tj3Tf<w*P%d>#_xXs1-_1Pit9KNO%hs1M%fi&sF_0XnNeg$9W;o1r2f5V
zJ!w4*1uj`<uu^*1+HdW(1#Eu%9%5+sIiN&FQEcaqe>HGINf1G-@jr3@jsqj#;K#2%
zMvT3QoRRyOg;s)1nlf2q(V6u8_`<ij_t8&$2A?>H-r@i*TFVB!Q@NJ<h8k~Gbwy=3
zLEe%h82%Z3XY!=);c3+!5X8K7<XAkUJcz~S%gD<MhS~`zQHq<m|C05Tb<29tx(XYf
z2Eh^QfVJ1!1%@3F3?o|`@tX#{=0t&CwDCWFf5ag>f8ockKEeIyJt*)9Y<Lg>se%!7
zs;%14_`(}#_f^>N8dTbk%hn+p=W{I$jo#X7Pi0kkMOk@BgSg=*9kLhG;h(vGgpz5T
z;w|nz;`!4ccoq|dwxGiOs#Wz8uS1Q<N9x~8)-%?l)(t3d**Y%}9JKZ!ngnc}%o{Qt
z#ve>qgJ8VzKiV(x;`w|Rqr$Ip&mu;w<5_SoZr#d3FkR_X{|a|ppP>)<F?`p3c>nXr
zKe~z%eto!bJvg^EHrCd8m;%cy+!*2%tQdA2vSiruf8p-!R~V`N7L@!tyx=VmL`_i?
zt3AS0xCU=X45he<`!8G1!UN(dvUM+Pcn*CT(jX(qROp8{Y=<pNG+!<7i#7fyIR;4l
zK11K)BlIrrqL*@<6$0<V?JYeLO;-}XpP}8K;Ew+R6!;ZPFWbO4^@KpMwXvzLzQzj$
zR#)Khgh;RigT_QKXvzP@{T}1@KSd1rI=+e5p!QpMYDxr8!HzLqO}SsW58hDZBlYiP
z>pANQSnxU&xME#IZ+Z&ZK8PSZA)*PXki4M;KaTNxtnokM^8-YPcLjYNh7FG(j?}}$
zWkWL*VSdEg{T7~*JPY4{Tqrf%i;K55HPzR9z121311n&|Zg;60k9_dr2?`AV0)Btb
zp0#`o1>(LJ-_}h=5Op!hj(HU-rMQXviQh?B@FD8~roef`5vqebf*#=j*)gxehk2AE
ziTeNI13bT4C=idgEAWVso>7Pdal0?P3je~!??1;k!H@nuz<2R{kl-0+$D7DSEb@{1
z_p<doBFHi8CR5-l6ganoU^n`O{}*lE&2zRkHP<)P)<J=uYNkN4;V^=%M+1oe&C&S%
zukftkQ#SU3@n?9!+t4#v@dZTolVryVDQ@EaE8uq;9&igWgcL|NJcl?E(jc?rN1j);
zn~dU}tF^hgp|K7Y?5W`u2rmd54kyU$_`kV-M+E*`<|{sd4}1?c{0h8ah~QcD;Ez89
zzYB=KCvX=+F=P$>m?iXNX2S@QC+z$G2S4(FWWi8i6|><8g6LJ37W3oxf5&>`H2(Zm
zj9NT^$9(~JJ~x;KFQPwliWBuE_3src@gono$rQMcDR7>}kqPDpQEVOr<9+vEhJCHg
zzD9~5wG=}{1@c}nq(KlQel&hhPy4=yUd-1KMc#xBzl1pQ9D3$t#TOBcrMQXviQif9
zI|d8JJT543FM5K!7o5U99Yqo%_+?-e?{5G<iooOnJwXNXJ(+MTE}`-Jf8l-u5BM3P
z{dbrG-(fcVB6>x)1cH|yg5O2(I|UC&7L3_5R#G6@FnPfc!FYfBmtkM4&xZ(tDWS}Q
z5kt_AVK3ebx-oTy`ZD16C-h)`3lB)Wp-%+`z6Be88D5YGUS}E<^(FQ173)RxU{1pW
z@)jIYAlWbx9A_H*GBAqwHxR$3`Uc*DDTeSqQ0&PB`_shl4|oFnYxcbPd+5b{13r+R
zV!jH3FM!}pMo@~IxIe^?x8QZ_ekgDiHoSoQIX;dM!FYfBmto)Qu*ZLZW&Rud+n>1q
z<o?Y47q-7}|IN1l;rQ*Z-2aog1#oxrIdu95obiu1<DY|P<j*Od@kR9Oe~)X@^=Oay
z{$DwM`(KXVz6^}w{S6z6JBS3|K~(r0+WHL?_D`Vsd)EGcNBjQBwm-?F_y4#;%_h|N
zA!5wW;4A+Q-_1WF?))Cxzk(>8k<R%i?kVMm7&-r2kp35TE&kpA4EqNE!nnr!+rJF^
zru8S4HxWnQ#~tAJ(4wC~7k`6x{{}51mj8^6c*@mZe&U$o6e9AQVE7$GnxCS_^DA)r
zdyxDsSkf8kd|%N%Kn(gJ_WmvQ{EjjH0~m)0lQsP3mjO}SzhT6BKzCO4jN%?*=?Cy>
z-@#bTk6_ck07q)KNK!7pJX@D~75$i-=v}`9hTlT0`5`F&9Qx+Z_xJ3(xu$<f`7ZYU
z0N45@<4bgTw#1RNN-V$ZuL0MRX-j)vbzSj1TKpCmeuP$imr<lv^Bn(1w*K;xTN9>~
z>li1yf&07Hai8*aoR9xjegbv>Lg0AKa990?;;Y#AefW-_gYB=GMt{dNN(v<(@~>ZZ
zd|Y3~jCH@^xb}kTY2|J7h9BU*;sdn#n`rZ=j3TxA=d$_bx*Cmx>Se_idehIKZ~ZF1
zllO7Hj}fsyh1z}uTA$&3*NnF{uP8~Op9p?~^!e+c-bG9QvIFeNnM__WAJU)HTm->e
zu;W)?$?t;Uhpf%tMeE3-W!rB`U1E=RO0{2c90aeUUvnQSd`DpT36%XkM)8{IIqiMb
z8;bX#cOv*BX5-{Dc!uOHNsC|hS9kts%7S&jaVz1p<`M|rgdM*G)x0S%{03S_%8|=2
zH?1bgpD?OkRBm84=Oqxl4Lg2K_)bWNWX+#)*Aj2(?x|l>z6~4x2CncucuC&ADSi<{
zKH^kiWy>4b&-+la{?FkrY5Nu1e8yw&5udWiM?N-k@2@2BzlmoacVQzR;aeqdwfzh|
zx@)i=ii7;#@N3gw&{a5`K4V`sZy1j0&N70}D(@&>M4$Kp&PqD`8uN&9`Q@coTiXl+
zni<tz#U{oRE@Pbk7Vc2!d~c(rUuCxZE%19b;VvxrO~reH(#g7c+a^mE8FF~YwgMHu
z2kt!M&zL{?h0s==t5|sqXXm5+kKr50+lYIADar2$o@Hzz+P#ieMmv8}QaeWV-)4UL
z$DqicJw&;yWGHjWxn$jM+SH%WoddzAlsA;O6)(W<US*c|9%Cq%Uv7Gp&1dS-jX{A2
z6h|>G$P>H^Rg((eh9wh2@Y6pJ3w{6v^8S|hkNn@rM;T&BFAY|h9zi^M6%@(ZDZ2d-
ziu(!L!<8#-d3dIH5#Q0<pvcEQ+WS++QVQT*&?~{*jpMP-U!>8r<LbBYw|`AIJNa<_
z>_6uMWdk`ADf9NdN$bWV`coiynGw9DxXV=d8Z7x8{M~Z-<z-aZn-T;1Va<$sRk^M>
zhEc66xHG<q(drlBTOY9Vz0X}U-_YODzNmgx`KIz+*ziX}#0o_iKJxSJSK^8F4BGe<
z`sNQ{6<>qR^L_x2DGK$*Ts%?Sz;*86J9-CuKW3E4(@MSG!e2IyNl8ty&hO<iY<SMG
z58u#joc&#VN8e-~jlRPlah>k|ys@;I<Yntx;vvJ)gwxvdn#<~^RL>}HDxXI*xzF1C
zmTZ2x85NEObBCc{H=$lot|<<o)#n6)x1o{yOoeX|Kf`U^J<ZFi*P+1o1usFK@dwNk
zlBJLe#g_sq>4hbqz%$tApk<0lq+2{{#OyzFLJpqzEMgWA?Y+;uE&rW}$H)5jBb@0t
z=qY2JzY&j-^qC;IfU{r2H$=TWzDMy1Jb7n%U;b#uv}?h(H|d~h({L=|Oc=pCiWd|w
z!MnUFn_pf=d2)SHo3U3ns+m=<C=a41b3!2aEWW`P&~{ScHS;ru=X7^9_f@Yk1yZ~h
zHN{&eS;)^Lnw^WOe&d+ua{@l$2E6{uXgiyIm9I40v^n^Z_MQU6oA{>q7#MMaD8Kvw
z%IDYrDllCw%K2$^Y^nycC)@Gdd;=|c8Vp~?H^hH`ot3?X!&#H5bB-13n)#4%3tsSS
zkl?fA31P?gW%J9;bUSM;t)?#hpmq`p+zT5%f;OFJ8hjS&xC=XenfMuRCEV4%q<)1d
z@IBaYC_0P&?Xzty&cIXuUaT-UD8%;XI5e3QlA<#?G&z`oHV73ShS$A{Co^~PcT2s(
z*Z(RoD`?00X?9#bXH3HE$#y*TU4vRLp=V3|GK#SNs@|fZobj|-=OV8_<^|7cFKDj7
zimxl5#oXMxihBy#{x7r4>9zVyov`3BC~#T1UvUUt@U%ei1_<7PI<A?ogWvPo7h%B<
zpul$o8y5RFym$G@j^?Lg23`kdsVqSYC*hO8k1RNcvltCdbq?mGbVDcm;PKDF+HUMP
ze)}8S*Z)R2KYc%#AMC`Ogmzqc84-uRpBre$OPs&Dr(`g93>G}^ScL*NOk1$w)4Fp(
zf;W`6;0fjO%grit)!3TMdknq05zQ28U9Q50H}S2V!BaYlB+nq0-Qlh!T{qs;-_gCm
zEcgxOTZ(rPhd*R-g#5~<=uJ>8+4*up3C(!=KZSV|hdIn&^C7qtb1CXXbAk&5X-e-X
zRJe|7O4o1Qx&C+C`Dt}*6&|D;6QIsY5FCalJjl-d6=>Y==_wt^8_k?bolRbb0uLk}
zh7BK2I1@qercyq?(iD%aF{zy_ST_y@E-BYw!+fpZ5JB)W-PGUKy}&G3P#|oWyx=zx
zJw<{$`B9S1LKsyiW(mv@KXLx4O=q&`QdBvZ;n)s4p218E@RfK)TmQT5{62gog_tX+
zOoGhm+(YOuQT5`}uw=idyR5%pBzuA^*iH&uH*La(PwIGrSD6*xcqo3=_J*WZQven`
z3I#5pA44{L1on0o7DpPqPWm%F3x0Q*1;3_zL-{uPF%*Zt9zigA^KF>CTMg6g<N4A2
z(*iD4X*L+F+Ef)Sw`qZ&oPY|YUgG@H$o0S5&R+}3r*kGXAm#&RJ7%y=L;V|2p}(fv
z-B&o0J)S<3f*4}kM+$@uA4OjfaYTDj!xNORzmn7{M_p2j$qx%2)=n@5?jsw97d#at
zcrEE^)3b=c&uj0&f?q`pAs_gjV8b5=H5jY?REMYnrp4x;HX=rrV9vZ<lW5Z08NmwJ
zco!5Z^%Cb#ZeRZ!cK#MN(JB|RR~s;4lNNKeqrW)`AAErGd%Mbei-vR2he_oXxDN_E
zY&>E(u0I_{@S*r6*I9ju9mZ~0@EB9z3T$|TdBIcg>zANGfgd8sJq^X+2ciO*4S&S$
zlfMlsj{Qo)<lR!5KSUnN%O>Nca!ERqIl-YxQ)c6hMtDN0SM2q_!ue@_a3jQCh}q;O
z1!kOP;H#>|W&5D-1#T(ri1TFf8OJ&MdHV(XMcYN&CAM9*UABg{E7o5sKh!?Zzht;;
zylr|e@mAuqiO-sEV0#AJ^(20KI_c>oFP=gk!*jKBn4fVG+a+w`d9GMP+cm4aEnfSn
zk`o$2D;VlvvPA&uP?`0{q=aNf&<){8z2dF^70%D+S15(PZJbsM8D=9O*D-=)+-Si{
z_QCYc)T1fKU8kI<lFwjG(Ang3j&qLlj3}65yJY`8?kGQ2zN>yi^NRMq?q0%O{qy=e
zhTDecjJL4eG~F~k%eH)bH=drY;hFdmJlj2qC%k9yocA2#c^=QqX(O)k=v|X|{)QE%
z;q!NMp%IHF(P*&foa%IFP<r_=KR-9t`d@Z_>LGbC9WW2;Jy1`L7IEc5-SxP1FV|l-
zQ?Qb=KWii7aQYFVm~zr}5+qLxB#9>R{11$=WA>!ttIBs&Z>V1*irN=-_jGp??k0ew
z{<a=0K{L<Qgjp9;cn-OSr{ae(yWl7|o?s+TgC)@<p1Z>j>JsOl()xXtK|*2HnM``C
zmM18!KZ*6fA$=FZbgi6D1shI99I4<5R!)@67p&y1WgpBs1cqB!4S6*6c*+T^OA9CY
zXN+g^tIt1FypLG<rs{$ERn5zq``VYl@da#mb<cxjUYgh5fq55`c#exZ8qCbt5GWoK
zIFcq$?;1a;@i{4eQtNxAj_>x2x~Gd5KyY8qf$Vjxg5((z#bZ20M)JS#)a-9@NBVvA
z%|23mRq-BU_`2#f^(!Jp?LF;1U0!-svd<R4EXWDW#8|=;abma*d)|Z=k3fw)$KBGO
z)cBlK`=!?RN*&+j9jcrtoh@1f!M(XyJ({(_7;e(4T&6{y;$QGIhgPrB?4eK4L;Fzi
zf%08};RBJP<|Qr0Uz`p0J(xc_41zOI;WB0htzmx4I#Z%Zakuz!6614H?U!2LD|LLJ
zcCc!!e7a;F1XuI-=I)0I57JuP%tIL>L((G6pZpyZ_;bV&n!WKc_jRc7eV(E64HX!w
zUWFF(GTp9PhtJlDxqHKy!!nK87)!8Zo*`NDA=vZguGud<d))CksrE~)@0B_}P~Yzv
zt(YvEDOnH*u3-i9fh@9QVu+~1bSTbWqxHW(Vg$d5d)2QgK7b{^qkId!-#1jRgCg#K
zQ)-f%?CrKL#1Wq09Ok8wC5sG6iMyshiSaq9_Dikrl{&t&zOQDuYP@{9bhdb*XbBoz
zB~M5yWR}b`+)AVQYc&6gpFi|N+~0p2_Xr=sj-f)uyI_czf<ACwR%vRrv)<lf>#%l%
zAS!|*8qY$7i%{Vz<_M83AKX2D6614H?U!2LD|LKlL$7zJdW;lUI#;|HPB3K2P$ED7
z>L2k`>{qbipI~g{Q`{YV3!d=nOoe30#4tCjB&`w_ObYD8N`wJ{;0&S(sgNw0819<>
zB*y2Y+Ap=fSL*oA#@^aN&q(D&`7~^Jo(SecgS;mUv*iE5T$F#s{EB~o0_mPRMDXLF
z3f}|6yzJt%3YRyz(cWV3KpgHxUuFdJ&?XT}<`7L55l@KWuJPl?Ph*bHNwr^UeXrE<
zosB(p17yJ!6YiNZ*l_V;;WmQIlK%>R|BgP)@9<=n?s+~#91%6h2=Xda<YpJAm#27~
z^^RtHo6T?SLL3nZP78Lt1ctlFFYfr9RQtv8mssn2rH=PEb=URR3|Egrfm5*IS<1R1
z5M0e;v4nZUzhb7|f4~F&BWC}Q0)LJtcs~jf{3f$w@`cbJ_*FaWV8N~SJy2lJHiEng
zca0y7pT-%VlWIRd{t|nAuhjA2SKsd)f(7#mWHt<f%gl;}SOS87#XQIVKp*g*5JUbB
zPq#m3HvB{Mr^yd~f>`nmvSThcyC}n*S_KPkaQL7=vf&=Yks-l~r<fhj@1Fi>{4~z^
zoK*Xz*7r&s?{Dg=?;{UbJ;p1LdNNFdB0<(CWc>aJ4@eeF3gm70C%7jL5hOd#1HZD=
zN>>dmm{(vYY?$H*PjCX!nCy5L^%r-1POAO<_(AOTy;8^fo4Xo%$$~ww;7am=WWx~z
zSuFWq%m)1v`p{&-{~#!kZ1~692<GO1Us?q`V4b7UE-J7aHXKQC7x~2<pOb1ozkZe0
z<wjrMD|NiUk1V)qTvQ<SWS9n5cq>MqkmvXB;71nx_ksd{CJ_9dV8!3!a&rnZOTn+2
zS#UFaV7tvPc)@UjyT*?nKdr$~YSi&LsrE~)@0B_p{1Abe2P6xI0@07*domFOi62Ga
zKcEluJ3)cuJ3d2ChT;fM@Y@2v@)USLX2DP(`Y~j~<OK(DM?$@s@!hjuKL0_?@j0pX
zORevfI^ORKPy{9qNWDN&fqYMf2o~@f{9nwG|4&2^@__#sRN&``!;u6@e<kT|@T(yU
zMlVKGU^l#Ah~O^jPaJ=aIzA`WeyR1nQpYoX_5I8Pj#PyyP_$thnSm7(zdz$1jy&K$
zgWqqM4<rTtjP(UY4GR1aK|Ib{#Na0Kf#}EhaX(C6P$al({G>A+$3?($TE+cD{GD>^
ze-v-jjMeuyq9_8@_28>|w1mp6P#Q9}b(o#S`JS@6%0^F<w;2^IeDzep0$Wq7SigcQ
zSZr`L#r2-XIuvAR@-?^ku;JB8Tf{5=Uyr?yS-|5l%ZJF$f1g)VQeWO!)m%eNT8IhN
z^&lqURD7*SExWFtwvM<4GWv3Q^1BPWN&;m8cV~HLWoMPY+V8<u<M;a077Itp`YU@q
z-QKR+Kz*R0v!S!Gv(eu~TSqh7d>y_HNq&!6z~eE?hsftwQdm(^?XIb;t*Z0XhgJ0u
z>w5V3x*iC~ca$e}2TY6BndFI-(e&ZWp`3xd{`|he-lE=;p3<JOZg+Qi7s^+x6wH)N
zxJN67ss=s%HGSUR+TOaJy6*b!`mTnqMr=)irhp{B$1LFSnB_w_x%EF#7p0`gT~b+A
zRqm;*siL|bbv3B!L3KSsRXysP5SriTZmP~|Hx2vEE7k@3jB_ewB5f>VG;26#D0eV#
zprF67uc$ZqWctCJm4f-=>C%bvv5L{E;p!pJV9h{HzqhZpudWvxV=2k+F$;Lm@XceC
z50S4ws!CB>>@GoN6sk#4Db)24>rsFs7ID@$AUuBusb!y0o!1`MA2RMYuUeMubICKV
z>D0-z@r<#|(d?1zk=$X&nbgg!{dp?|3&pdg(`A$8;}v6-BcM3!A&&iCY_$k$lKf}@
z^f3x}Q0MzG%7@73Csw5>1HlS+h#~5Fu!<ByT@M84pFn~46xUT3G$#|b3>&8XNqa5J
zwgtzWa~2e*(k9Z!Ge+&_P`+X#dkq8^i{?sZ7{iH*@ybz99PtoE#!-@A_yj)55AXay
zWct9J`=#EIo4*sO><TEOJ*rYrCUL3;P)U_hDuWd%c#7yEevZL}H@N4OPpd9!PV0{8
zH;wCwYhbuyTXf7P&pKyQW>Tl^=TW}mFcO~bV+2WsGiB56iE?5%RykTVQjHF&XISd*
zM+fS$3V4L3Z#`1^5c&F}x-wK31zBKOE_Sgd1<#NuQk5PrZjyhEo4F4#N%uAgUecV^
zozNdK95Nj=6T?;8l6{esXuse@;#ex6lCu|SP8W+7ilIW>`b-f+bmWO5lqkt>=L9~<
zL&85m{36o_hVPaBS#JJfr1BK2Qcym7KAXK*lOn{h1~)0c!Jxu7Ftzn1#VtngobFV@
zQNyM&#BjyFi1GoaP`=`DCKX@U3xZ2Ui^cOyg&~F$m1B}Eh&F*w@}STMqNNWE?UnxN
zVfh8CQt$+M6^1JGRO2S)ceo{?2_+BU1#j{MwWkw~>9>rVrj5h{(BVGl@M7{Al&?VX
zfGi{|4ibbNGZh9IO13~8K#rWiCwZ9o0si*+k;{j`e(5HH|9|M;(2a%o6B6_C8J$B_
zDL}B4a!gQt9!i<OsvKdM^`GDYzmK87w_(H2V*>S4s>_=5+B3|Kw~U8P>tMLgb}9L6
z%1Kr{U;{S1mPZd=mI`UuXCBdnEP0x2S!`#_fB6CAM=szIKK<ws%7;7@Ki-0o&zV)F
z;Qb&Y2o*AdU<iKyg5l{OVMzJ|M3I*iD088FN=*dM!j5?rf+6v9iRCLOZT7w}f_yY#
z3}bQ=cf_8+Cw?OCzdXNa=>y{)7ThUUe=L<T^MHI!0Hk!Hv>a4pi!><OaRr9Cs8Gmf
zSY$;>@?Yg%RNO{#ZIR&VgyV=NP@(OT<D8Qfpvgc171%HcvcX1PgNs5mnF;=(v3`g*
zfiKoEVegC6w;r*4h;03d{HUq|UZJ`uB0<59!S4@<AisbGQ<(-789?IbTgqn`!3)}R
zm{xsKe^h^j_#uK&@_S0%zRt>5>}3R(nH3AsB&?~?e~dqYFZwZI@ACX&r4J1IV2mFh
zQh(bN$f{C=2ttJz=Hd&5P|Cayp+F>gQlQKQBX~u_6Fj9qZo7>1?PtQuSFi#q6i2X$
z1ktC7@n?tL4h_IRQUMR*_ZaB|A5wqJ1MaL!5hCaYKPnXRD~u!XiP}(P05<$QmeY|2
z89{i$(^ld~<rjnkG?^4dsB{4F3lUtz%=-8XfTb4jhDb;K$>_tfwm1ujbv;;y6K4VN
zL+LLVgM&6KRs;wV1V1ViLKC&Vk0~;wz}I2J_Yg;(RbCeeB9?&P1;#H_zJf2lz(?ZM
z{1x7c(dhUKfTb4jMx_>d)1^}ugJ8@B#HgMUcLDH2^CJu96&P8S0tBf-6cXY55&XnN
zZQp<oq;#Js_M>=CAb3@CQG1@}hX}$7&`=RBA&Q8R7zA-w!Y|;6w*Xjj0dI&^w4cg3
zn-VjdV=f>rqUtGe768k>|78IL_ABrUQy@|-vKJ%B`28~?2tU>GTUZeD9-_!A=*gg5
z68bWvK@bGLi<nrA0zPEHX<JmnhmRus{Rp#S5ER-RdjYWI0^VrULhoVuFlW?bLU@b?
z#F$>lVcvbr1;BFoiFHwkUkn8j#4^TDEEGZ({1H<ir5SsX#Swv^zz-3GFF+$GP$-}x
z#1UvP*wR=FfF&332G0=n2ey2eOE7aW77(-Pg&bB!#8?3A{7~@=JP!IjqGl)o7N0ta
zQeXwsQfc17U4?X7tN)R^&{Zif{nh_QG&<S>UC9N!5og%(3qSqMtkl{-Fv<dAHocHi
z2*g+boPQdLfWC%U`BPXNrK=JXV2SCfgalZLn@EWuoz_ZrqAabv^jH7At-%=>E{<FP
z96tURbph|TGi>{npZ=D@nRFTl2<})wEKDz?C>C`AaM@n#dEIT*>sU4=rUIgLN<@-R
zfF&fXvW=<bq_z5kT>a@EDCMNT`t!Eta#(s1xd3?k_}}*ZJ5CV!MsWEsXE3UrAh>M-
zad3KJj0M1zbFNL(MeR-1eZ^ZyjPWsgz2C?96D1$uliBd;s(zQ;m9VOmPHXi$x%><-
zsozydr@#7RWNX=PkqdNpj{imOk8*<Ww}Q)uxkPn>S?vPBhy}#K<-Z(C_e5C$?3u{e
zXFYDXq`jefLHQcSOemcVr3m=}C{h9fApw@F)i|YIS4gF``n6ns#=Dv~lp)Q>N`Lil
z;cX569KJv|+W24i{@_8;e+Z2~+hD^wi;8M+g)JZsn_fsZykh~dcQj`)dBc2Ce_8X4
z>UrgT1)tc0l8EuiFodKrDUFGP+GW+2B7Rz{&*kzn-PXRMdPgCZ{_6M9Ta6xtxBxi%
z_+My$j1!1&i{sB&KFpbw27}hBO2q=f-~wXWFHA3_sz*B(0M`!Z&!+6NY#GlaT-IDy
z-B#XHh{>BMiP*QHL|RDy<FvY@o`fm&er5c$RzH=?FY!6uef1m4_Z9KeU;VS#&K>`X
z3v^?R|B3r|92ofqzdTh~KFlR*^d_yH5fl~>6F;i6z?T1Vs5!E20dU=5(Ny}fecgP_
za8`Fk^R()g@&%|+Ox{E(F<1hubg#v47}YGQHWcyFT76G8zgr3SG}7s>ejC}^h~M}H
zy0ORqgay12$L#oJ+wx)9aDrLmP*Qy=wtzTd`7ejEQQH;(*Y}r<XD&F`EQd`e^ye5s
zB+gO3sGu}#Z{zM&NX(L6ZEG_P=w{S=m79v#X|293o8PngyRhIlmG8lZ#pHH7(_j5&
zM}x!Ppar_|#{bv?-tYr<{!HU9Vata(v&yJXNYW%Tf>=PjZF(V9-HKQM+|XAxmNT2O
zVn3L)Wjv)nue%BjB83?2_`X6+-UNmjRd!!ukA6(Es6L=Ps)&`=>Kk(TnOXWPHPT-x
zKTz<Vf>IA{Pk;4yJ6jy_P9peG1dg+SH{$a<ej$Eh`ERy-7&dG$>MUxfGMyGm@0eb=
zZ2@p&uX`kKI&Co-OP&*t8crvi*Iv~;rM>}A_ySZ&DSqC<2*7(t%hY5J7>1$1cxkP^
zDx2R8!yVm=npdH~c<HbHW=Dg!zp)GBe^kCYxO|v1D@;0gK`T!%q)%abA%~({VGDp8
zdn<<XCo|?<E4G74uwf9qzz9C0LgE~z!dGF*Z)H>@Hzai!k=9DHpk7lRLVCn#X|3LU
z2!6rzSBN5fk~}`*QzAey{greAy6rr_;NP2iDu)UuvSw449c$JN^Oo_X{)|o}h{QQc
zB*20SA7qw0>n&}jUj3++rL|ISDWavddQ&#PXN=DwebasQYfOP3zzar8fAxzm7{^0j
zuBp3fuxKm?7VKPw0uPyv!V8|&T@*BU3!aed_&%kzvLdaOVNf>-1+uhOh$Gw6TFFi{
zCZ@mAu=H1mBGJ=d{p_I|^aUT#)LlJLGMYP;G3SB;EgR-d({b4FS=}W@@Ve?I%KO5O
zk(SA8^O=KbtyKGA!#mSjy)0LMNZ)iz|2#ZkF#Q#?;qB?KwkHt#g6T+KuDRROUpkyO
znK_fXl)P#?kaQ?<%XmV6I^i4$UPC0gj%a*a`8+M_wKXN}fd!A~rZn@2A_viv*`C(w
z1^N7tz6t5Cm<1#K6`}~)Fr`rpC2#r>;_;XK)!gOjD;p{p&z??OaIV<bEbHdOuwn9o
zA%ZuQ&t)O4m7S%vf(5g*R)``<YXvVDnHWrFqMcCsD_Af~e}yQrGyPSx1Y%z*qRm}3
zz3!pHv7D*&xs+uGDR3ik({#*miU?{&4c?$-(T-qREA13~AktdFhL7NWB$U?b=0oW3
zStOto(qE~B^jEOq$n;mz2{@lHO<$L{w|uB*1RiiEbup+w*f0p5)t%Q~=B+pjX|0m$
zEG>zCW3PS~y%;1`QzNYvY&blv)pgnYSo$lLzUjVNNPmSW!rO2t{nhuLFdoUv`vTsc
z^1<SfyveNTw0Xo3C=gMEw_%DS0zu;Eth4!&I!xWLV5GHz0)@0z@PcAmtEV1@pYb_F
zkbByfm<5XpBpVK;zxq_N0Z;e?Rr!UOTDtQk7D5;Ih;>h-{^bIi!o+-AUUHr*FC{N6
zFCAM3+cNVq^Rj}Q#cR)ZvHcmr_p|fFEhkUho-iKC%hwgU^NWKO<fD<~2}*Gj_v2n~
zQ!%FQN@z7UCe|f+E!DOvdxfJSx!mb?mbuDON>fTxOHxZLE0%6+liizK<*abIQ_50H
z(@N5c)3Id~(Uw`rZ;}mo!XIcXt1K#`>gQBJKBD%ysJw^dheGqP=2hjCdQjVw;5W3H
ze2GowhNOB+t<`I*v3cy(j;iD;%Q4HWrQh0tGV~3O+T<E%wX4!qky4&gp6X65ODjt&
zO)E_=NiTWAcqA|1ROu-#FD@%W{d1~QOAOf-Ui+K~iu#iJ7f+p7p?Pjyv8<X>k7@>V
zJ^Fy5!`NnOHTld<NsUPjNp+Sw%W2D+Wy&&W4OsVpVUxWc6unLlI99nTQ!22Pr?^wy
zk_~vmA87V?%d4o~B@q;>pNA;2+ULYlikr9}v)6<|^W2tVzj9GEqaM=?>-u%Q`YuDK
zVUMXD99t57mUEUvmSxMN1qI}-owjyc3m7&!>K%2USmX4#h$BcoVLXzT_jzk8tKAhu
zaC-&$u<Gae`63^wfAJhXSZE&UK{ph8m5b^b^@L_rJ19^L82rW#%X!NYC~(0tVL<_T
zED!FmwcA_l&GsfogFumSlx)Bg{y<A@eRWL*>Yp=$sQD|_osOt}ULc>}wnFoWCI^(O
z$_4eTdQvl{8PV|+yDb+j$1UrKB6Ex&3dlo+9X7t`G%1m%So?(WNM63BuEFE2tfuPS
z0>P*i<O`&9C+^4kk8OqKxuc48#hP-3F`UwjLy5z>LGU|iISd6ZS!OKb&>)pt4OsoQ
zJpw~gA}A(d5ZH?Zr!AP}*d}c4n2aKBMxRSZxgeX_Cw6{D3cxJRJBY0pxl*H@U)PhA
zL>XfnV_c!7p`ix#&nr;>oUeCT%qrFf>rNx4U@MT%FG`_#%&$A7K%sf%vT7bWoYG8a
z#w?dCrz~5RgO*j;@H7Yt1>~(LARo?<rk>E_#5$n`#4$vvjpL3fi*3q6Ota)a;RDC;
z&q(oW#lzkvu<O8C={pENmUv?JWV=R_flQztc7-O4_KTmQzk}6}_fYj|R=t}vh-#r?
z_47~#c~M_d|6=_|)I#%!ChLeNt4b6}Cx+m6#&XoM0R=8w<^_U$`SSp?WGa5%YWGq5
z705VRfX89ff(Ep<8Ep_-f!T4#xIClY$>pPsRZt3jrcwA*ixfU6uL)-_!I72G&o3kh
z)U%wObcGr`53U!k(2V-=O|^AYyOt`~Qq4=A;I``Lg@sbw#Qj*m5TnpMheGpE;Tq4-
za@leg9`LZ`fMqXicn*CT(x6a49xC)hhhQiIuwn{XA(FU?f4|5GGoN0@?8n>$vsp<~
z2a4HeJd7i?QtCAb?UitPFpP2jumpjWIW`-AMJb*KSA!#cSFC<KYS((G{yAT_maiDf
zYcLp3L_Sjgf?v!+^H9nrQ{g`43izF~oPY(dLxC%nMf9epEMtryUwE2S2yYl_e<B`T
z(_ddE{Pnd=X&52=5J}DZxg4#{q_eQk6o5!M^h9lxdiBC`Q9uMAi*f#K2?C9nf{>1<
z=|s_uYf$}eRR5*Qp-h4Bf@N+jZ^NrNSV0~=Ln&_JezyK2PN8|&F;gMqcM=wSNKoKB
z;>aW;7%U)9DhzLb_}^ZRR(psKtrfe6RO)^%N8>OU6V*0YS{AIi2rVoRKRoKse)zpg
zXb+i+k(oHeCV2iG2?CW!Vr*yE$OA>8{y8j|DX;=@I7CqNgixW#N9tej`&Z0;i&JPG
zcD$+B0KfByAjd45Oo6LV;M@*^-PW$iM(dG4I0dcEV|3h5B4#*|g<V0a`xY)o?MyJ6
zbY>;BFO#Vx^7x&<MXvvia{j0Z0@Yw>LptaTksmA=3an(+yCVtmp0HSoo48+C{}H#)
zJlV133izFd2i!soAs<LKJcl?E(jc?r?JZ8gB&cMLrdJR{+#Y|<Ecg!2IgY@Qqju>m
z21BAI30fkCQZJg{QU^=S#^?3;aFp|t1;2(kyunSN+f)fB>%cr1g%%90T(EvT#Srp=
zV(n?silM<$EY=eFNd3#!f5<2_4}KRAfluHr<S=?MYv{);p(isNMvxVd-`Rd0Y|Rc8
zW}qSY#P2iKzN_4R1k+kBN0p+r>P=9g$^z0-FX9)u{&(B?#ra)P69nprUt=SAK#C!}
z0;3Qtk<U+Np?QHHdB9Dkz<o@C^DK@`Fh9r&$VY2G|IIt0Lg<Tc-#aL4a|$-x4VRds
zO4ZsE%tn2p#;k~TeE56O*8grhKY75I2?B-s@vQziESOgy-Hp(yz>o$@q;x0l$Bc7X
zh2|N*v$zL21`9sK6o@Pdf)|{^Jsrhj5Daf?=w~f_y#;2JLj0(GUqd;ft0<Q;jW|-x
zWvfy(j)WwG(WDC<6aA;?>whE8{|l@p{U&Cy+(F@pO_W?1f)#t=I<gVA&5`xbQN5dd
z;Ew9uB@e?-R-t+DyNDj(DR@A#;C1*wQXttddBGsT=&h!^5(7M;o%qoV=#MeS<R+d3
z?&C(8kIGi2X_9poJ@Wm>e}&H<3Sa*lJU>4_DQ1E|q5e75kLN9zSD<LaVt@K!_{l0X
z&-k4~1UV73;59@MQXttd5gaEC#%{d^{vN#!e*W4gm}z$p3OoWE9)wG);6PCA)JoKE
z5X(12TmKt8zgS=(YJ$MxvuPKcmy$0#F59orW~1$@xLL2V4avUjE3mZF=<}Y3ZZ6{P
z>k@h({B{K_f*bdu@+s|C^w$j6jMq%p64_?HCT>aB#LaR|-WHo5O{Yb~<mX>~7xT7X
zMLDN)C>u3{!GLCT{j!zmm{BkBBK^_)t7z+gh4cRs5$gL;_&e|sH!x#m3#EG|Q9h-e
zCZ^V`6mDc6$vBaEGUc@Etn-}nT=Mzk3&|H87ae@zk4q^2fz5@o9*B~MU~R|YwN4>+
zo&`<fc^;9JHsX2__Vgk5K>3pTuI9G(Io+*<XZ6qOZy0VEo-saSyl%X1qU~wZ)AHz9
zuElS@n&<Z!#%w>rDzUrhWgJJ@s0ozrX{1gG=I}|pcz&_g|FZK_g!&<7MNmeEJKR@L
zHfjT<dnR~-_0wf51qX6AvbHjgrXNqGvK6ON&bZDvg~A`n=dgijT9X}RJz!s$p#Xoi
z1*<y_+ar>v!IEe)p16BMp%w03#T&|3RQJ{QG<UVnYwzf8>z+%vm2gXc6Wg-}Y`en`
zaQFm>KY)jSSJ1~{*zgGANIh=#vL*0AL=x+NB@zTSj#ti=tQ4%}t!E#~+RQkTel+cP
z8mW=CQ?4`6BqN#bP42LvtOxx23K$+lpM$4(OyJ0qJdNnf3a!AN-T}kc7{wPg#1SNQ
zM3PuOw_E(A#^<E?Nv-ddI=*SdGv!_=CW265_Myzp%p)ina5U{$kRmDabVilaXJ=(S
z5FeM&+g(G%T!%exLW@VBMxNsdj9&!{t#I!v-d4P+e4u<)^)j^hlIBIt3)&ZmqxSjT
z;wLpeC)Iwb^}SNZHxGIzDrZX<L2zH*{+xrbWRy_J+{$3ZKSC5UD_jkZJvLU>1NW&5
zu;jf0LsDXpBFn!KEVKd@zNetV0<SAyQ@s+Tcu#Y0*Z4_{&q=jkYJIQN@xFoDv8t)^
zxzfd=)q=hG`%yv#mW;KyjG<7x0$R*+r_?6<?434N)&u>Ed8lwjU`W<{h^NT%M?@4_
z;XXiVMxNn=5XD{NCow)J)qbh<y;8^f`sznKlNB>%^CcmI6i<jDtQj7W6#&U9ORaH+
zmGuC@IrP=Yl6f5-gf)XA8$Z}qXay>KUtstKtQmz?RIltBKZ)@<srE~)@0B{frMF?I
zh7>qkx=_3X4X);sCp?gIFgt=_R%u$btKLCH6@;=L=;6*ng^N((D()S~AHpLB@87l+
zT7e4RQ@o>Kg;rqA4|a{8#Q2<4`=!?RN*xb=gWl0<QXp(tAh?ea6fK!qb2cmML4{>t
z!$MgP5S$TINR~_t_p|lqQ46gIDijK>>>59b@j1!%WAqB;ePgcgl{~(uaiDg@Gf_EJ
z&TN<nBKH9A2}y^73Q<;}BE{=$WC{$H^}uKy5nO-@mzg)*2Y!FT9E2E!R-i(%WH8({
zelZJp@S`YF?U!2LD|tNl)eXafCn{##vt^X{eyK2$;6d~Z4~3WYuv0Q;p{xf8P78Lt
z1cs~N7q`#~Q{j7xcXy4S#Q2<K`|<pZmG_OYzE|@2?xy~_p_<WZUV+q;5ebT(kW{#y
zU7S&#TJ5S!ZgQ})9_YzL5)@R($tbi!D%?eWamVMR+Ap=fSL*oI?xw!_LGOqM#RH%~
zc)=1PxXd)TnlF0711L+u%6d3iSr1m4Cs@`48st6Uyk!CWWE5KAzPgM2;*QTrwVy^0
z;;rwMI=;25skfdypr}CJ3x){p&7(fy0alixG9|362W&W4)&t|6Q@kC6pR7VFyT(sq
zd`_zU#4q0ZUa8|-yPA6&`r!d7hD=mWQxPQeWWqJb$C9E<R@MU++(?CGNP+0d1j~Ae
z1Q|aWg;sWzALZJ5!~z~t>w9JJ>uH1sgasppR8Ara^F5hxg7Ac)vL3KtR#X8|gfDI+
zl=a{Vaxx38>>|Io<8xB&ms;N|b$n}}nfQ?hBn##hDB3XYiUmKoO8ih5N+|0gD6kVY
z99Gr?BgT^gKgmKXyUH)l_?%SxrPlXK9p4)8Q3NIr$XhV4K)xqK8eE~0g!#k|Wj#<B
z3S~WD!9fMWhQrEwSjHJY8HHAMmEX_tgy%c3VC4M881yBKp&r0<o-vF=w@bC3#{c;B
z=h4^qN*&Mm^`QU=c|fvYQGt9<1_X;3LE@L;POD7uxN2d+vC4XkfuF2GE4#+eCDp?C
z`-|i6mfS11{>SvSp0&VC=XmF^f6(9W@ALQid;H!0F17{yv~|)pc&&J-s@of=M?H!r
zKh>e2Dipqsmf*(M33=gK&rqPdGtfy#`TZTSHpvE5agxs{<DFxK&3uXXiTL~E=3l#!
zzFIt4F<d=>@(3u5zzR$RnmU_0oBg!;#7*DnY|m~hY%6UmZ>wsnZmVhYw$--Pwbf&5
zz}CpNrnWiHNaH|rudk=2yQQnOt2NNtiA~Iw9Qrr@Uyr?mxP4Gcek$e%u3@MjPdOg>
zC70i8n$t;ZuKCQ#+_8e;;=$5>cV9(cWp7olr^nM>1HpL%wVkz{b^1<gTS`l2OI}N1
zOL0qCOL=QWYgKD?n+F@Q<Z0HmEtE}oMrsG^2O9gEdYe$z#)l0oyBW(6P05B$<5^K2
zrdU4W>mH^pG(N@`Ah-U<yNai@#}fD3mYlO`lbI7aqj|#xgGB?y{iS_neeS;U-ilt(
z?5WZRlG+?CsV$kl9ACbV7?!k@`O2|X_^Mi}TB=)#V@=y)$#liIXT&>PH&EZ-(BIhC
z)YH_>ZzvHIVkwDb4g#vjRG~+Fy~aI*P*e8EUw~ZweaPKbo!1>P9k8t07o0PxQ|S{~
z<Jlv5L-~UR1BKvN(pQSjt?x?QV{LV|r1~;_*}mLnF!U9HVJRrO8O2IprLW4jR6J8Y
zSp|YM!?lBTDD=T7LWv?pfu$st1#Sw@+T57T@kp;X@X+%Z0(IUa9nB&CZilS=?cYEG
z&abG?By1TEnD<(jP(p!9c}!+ZWR0O5%23{5{s3_-0>2JRo1@j$lIF|sWkH2`&H2qd
zLq@TMr?^x!S2kTSQ8fmFBi^C90b*FkD>0m+WV4nqd$%7ZSbzyJkMMd64?QpAac&Fc
z>3oEjTz=og6u4)V7f}*r69h3;1qD%Nozp2`m_Cs?o`te9tUQmt+tiWNZfi|$VFa7A
zm<scn3z`d?i+p6wP$IVFqWRJpDpXNDN|oolf(myiQL;%(99i%lM3F4yWqIh=+nBcS
z45k!}VB!{LBR%v>F27Ikc>j*#n(CbPM8aX?K@>z;wW2VOV}=-}f?>u)<`^g%x=ns_
zyA^ARoGmG&K}Il#ESVTK6*U(#pUC*Rrz(O3Yes5=D&#4$$RgRGWw@^it_KP%#e_D=
z`9j;z_cjs)-Npj?WrU&fM{ojUy`j1b{BNP&L+JN!WBSKE#SP^p^%?E4ghNEo%*yiE
zXOnq`Oo<tW9;4s9$08_@CrF+U1eqlZ41LQ*3sk6rFO|Yd%`g>;cFbEdeB$=@?)ZHf
zvkj)W0o-ZSz=mBk6#p=<cQA451x!3TgfF25Gg@sAb1*(Dm*00#)aX9<obsCLyyled
zC`v;eH0?L<g9;aIa}I$atQq_|6Jf!mz+|6G&|qeBHrX*_SO6UsG%cfC%52$;dkQx^
z6BI}IO^hZyKm1>^ImB-U<`;kh%Q>1`OVe;=zutjey~JJTPC|kG2;_MW=|<)9<0m-X
zQCx=t&+-HfAegw1ibJ7H*(?}N!<HF8Kk>7nALE1<Ol4M_*+c|Mg}Ke-4T;|z75QKa
zWN}0w7#2<B@uLVr_quItR%0e4^biE!#ZB|e+%4_`CIwA#oizCp^9tp^<nsFzQ#@aX
z(r-e67huCDbX)ojMv&}yg;dBG^88@IZK47Ng1(HVtS}W)i2y_pC~%S#2o170!ZQqu
zCXy{#M(~`2@zFvUDu!18FKERyaG9@nF?s)0Osl$zUd#$NjMWU~h{Uv-LEcL~zpq2-
zFLJk`z{{$0n$z0jJVE1r69`f?fePpBh8{y8(QocZB7RU{a<fa+V2~iGFvyR3F+4#&
z5^q!C|JUA|xVLqk2i_Mu0fGPtZUA=@;J&XS36PSwXr*?E+C^%$Bx<o1%aUcwmb}Q0
zcgI_tCeG3(Q)iMk>3nUzX{XO*rk$oUNxvlRG-;dmZ~1=jxws%eg1C?<#}Xsl=aEQW
z+yk87ec$t*d(OL<_F@Kp2N0dD_&$%Y6yHMFLJ$$k@~K$(eS}=Ri1X?x27>i?lQ-Ri
zA<CAYIM|5;;Tx7$thcPshz>(=PGm6mBm|F%H9-o6pLa|j0U^jdrbLZ`N{fY{(&Or=
zlm}22WL^S0b{cjhNeZ*zCl+`bzuk?55)Gk%jm%{5OwNv=@cS6Kc!eOQ^GM)!8tEl2
zCYYrm%AVhM3Dx_6JN2sN4ib3X@s#tj>tfDXgJ2#6kwW-Q!cT1QNL&J=rO5<`841y2
z;ioK!jDdy(b|mbGA%*7gzcUQ{2N9f|Xz&CQ*pC86P{AUU&}m8^r$Mp6SVlDXG7>10
z@ewAIb=(4v`Q=&L-yf1*?i)zDXz<G-fl7xjyDlUV6g?Jxxr(1?FcE-AprONY85Dv;
zIzO=>#w<o31$iDxIhc`jLS~vV?jV|a7+S_LgSgBv^_}A1V;rK%i_e=dXe+WbBe}a(
z`-c{{&|P2)GcxQ1>;Wu&>L(o0X0nw(aR62Yp|m&R5_kh0zKR_=pCBkBVS=B`VbmDN
zJSJ8W9|h+L>im=ifuNd4$L&ZeLAvjP;3PAYJ@i2+9*XxvXEjAA>jrghZ`$)F{Pt=k
zb~Ju1Q3<=$T{@8Z2{TNIFv&5*a!2AD$afHu9NF{xV_XNs*1x5_W_d|TAUez_h$YTB
z6W8JsUihtJ1~}#$^D7OGOJKA#m0(<d<0D{v9-YRHbgS-5g^mI2>n14fq8B<T>FGhF
zZUX_4eLTTUq&;uK?+%RxEK15pF&<l>Bpef;@G>8O6L20zWr*p`ChXKsq*dOrpW{Hb
z&u`*j{B6dH_X!htow9hh83S*i!;FGri4z13elh~0!6Jb&lNKG0kAk`kQrd&@FTqcr
zN9z)Z9hslY#Jg`{g{qYq@*4WM8H&=&J*pSS_`i?AzIuuCCj8#Tp?+NE%s$+Rx{>8E
z?#I;pC&=o;79nWGD7SDIN703i{An8;`3~9h`wYkWud|Bqj?!R}z?-(~N{6u{SYllU
z6+dP$ir;FnAxNOiV?>9O2<rR{3zC>eCkPs&U{d#sunV0qT#ckjAGcyGcQWqngK9Mk
zc<Ik4HIVMd4+*<D3YL7$>VZ%rPl3ZqmN#&p_=JymI+ozrYFVY0oZPM4ttk2~6nRHe
ztYmG$xPQVonYYf%k-J_$;E-}$s2FZ;?5-LJK>-h#)dgz`*5t4CkNL;_<Gyj<L>`Bi
zBlW7Ke~b0h4F+RHv?Hw~wJ_;bNA4uYy4-asea>fQ(o7Erk%BQs^^MAQ?1iVk{+I52
zN&}03`3m}U6>;tWjy;EfrW&No_=HJqgFM~cz;r}G$74+nnqG&6my{Or9VWX5Yled(
zMWX?|AB<ah;~YYRe&~ImAM3wl+wWf!?5S>vM4D>aDmu%%gT3s;4HT~`8Y~<N40A*a
zV)?P9iKU*|RNviN-&NIHHc&c<S79^|D;UWi&0p<b?O)@QW36wkNb!Om`;ZYyg&^O5
z1Syzc3{PJFTYP>&0}FrsDkIYK+6it(GX$M3^jQA{VHnfaNcpfA39OSv?rK@yvGKNY
z^&NVcYRCEp>xZhMp;*a?E>%c_$b)dyk2uYbv~$j#dE<e>U{^&;O=CFRP}5Y^Qqfl4
z5$a@5v!|rDq>r89{!~g!|F%66ZR}`iXs_+8>?!LF_H!o&*%jqJ6mTE76GC&;pU8iZ
z=(iUMs9`kkK#zps!u7v}=MxvE|KSf9k?v}jFb5;dn)5Z6gbo&is(~0V2+7lltZ#{#
zl3MfX*;w9D#oI|~F)-9JR5w&PTox@IDIOu?#b`j+DWSMlILg0yV_(;9SdY54=T78D
zi~53{<*ijsHTBe_u5GMts%)ufEpH1!GSsy+vD7oW!(;XRjqS~iZFL>hofX|>y`jER
zp(ygKB)UM6Ow(%#v#&$~T6oszSF`8T^}p2fX&OlW)7N<Vc!Ot;6AX5%)apx)gOM<R
z6xMN<+Ku(CISA{k3k5O~x<DP`J~j*vwGY(~)eKig%c4PD3ZWRlZZa~h5n0rw@msP=
zYd_%Hnmg$q4Gb3d1Ut*xDx0etY9mxuu5GAptWq3PDJ}imba;IvTHn*yMoslddu>Nm
zS4DSuPpCI&P>gd-?rY*_x9VRlqkS9v7O(#$pHEzr{s&pVdPlp>>cK&DnBZZ+xa~GK
zf>w%*MdbW@^!@65XeS@egN+SkCSAhNaOY6dP#A(0(X!zn7P5G>NHVXC1V&``p(w)f
zJFHPWWj*fL>)Mh#=^G_=Uw=tYu%oQ4qPeoMx}h#oR~xRauW4ACSn8R5;Z5Pyk%5M;
z##SnVN80MzYlLA}c~4nSP$)7Qh%~0}HAA^1LZE+&-{SSZ`SVLOVEp-OYEJtK8hR3A
zu5SYfL2QZeDU<cB_j$V0Lw`48M+oi2%%}w(K4o~gd$?t&ez=ZNPzYjQS+z@Is3_*I
z`A^)XXDxHq8OI*il!r|=e=I=azLM@>N2s;DrLw82p++)Urczq^xBcNQ;c+Ceub~ry
zO)aExL<*}rDurQpSx*RxFkJfiR}vMJ@SH2_SLxROjPnMlCH@yaA27l_$Lhgh%y|@%
zu0#esc880Rut3%qK19P`V4wH|Q_Xdl6+K%5DG;c}FdXd}ZX0SG4iD7|!I&Z_QaEBT
zEF39-;b+{1JKAN-N&7*^F4txj3D){Y3Wfp$q?_v!hGngl&6N#ROA|{yb0EAeye={t
zSykWD(9zgR<v0ko)ru6RFkGVB@@KJ?MT%eg^*{Z5h6d!6&gV@u_$m@O4WCt5;tC;X
zbGj@ZeSPa~0#|Pke13!}=Qx&F0uzf7P@5Kw_D0)>nw13BBnSp|DTE<U4`NkDK4YA{
zt6fI|kJ|S+cjRnV8%3l3;lQc@nrtv+|0BEOKzK)ZLwGIQA^r8;5G3PFOG9g<wXVIk
zz1m>dRhBscj--0XwEidOO<p>mcNCv<iq9B^y^$%3U(2;Qow7djF>}CI)%wgy9@(bQ
za#?1S@cle38tWVGM1vdHAFE|$sZwMx8XAV6NTJA~Jb{h=XXN~Ko@Fjsp0FKZ6pRzh
zTkVe`h5e$*ieYy2JnIjJcQOKwM@Axp=x|qKyH2nLf+B@5tW<Jn(tRiVGOhoau3+hW
zjP<LN3<aB+7<V%{DN_XX^{p?{_d5h<8==!9%w?J(SPH-X;jU=gP!oIHb?ERsK}JGd
z3S~50{eAAkTS)meB=ETXkYn1l9T{Bj9rrR4>QY#QCYNN-Z&!FKJQ-fY7&uVhD+DF$
zQ47gfTf<0UjjqY&`6nO9`k%pPj@z*d9hRz&Qc=pU)(1afgnSbTy~#ZB6n1($?})-!
zQb@!`28O$%ZNtq-U?f^QRAUf~g~XN^QW*O_tEcZODZglW!a8f;@7V3y7AMF^h!hS6
z`UCyN+4I{S-WpyX9*e|~z<x%-&QyX*3Tw>ynF#3TeOmbS;M>zG5~%AiE9@1b#fl($
ztfWx(FW*<Az*WmR>q*-Y`#$W5BA7ejUF#e5N0k(2%kNNl4;Ex&cp|bI30#E^%P80;
zGRS<UUPeMCg`ao-m)m}9bQKn)6Mnh`*2yTS<}!+)J`y5@vVW<E|39sX1Rh0)_d0gE
zwu%f2K}NzDQkXrz>F{<&K+)h>WC#gFhm#315~9a#%kAIieZAs0+=&LaFpEI~bsbJ2
zD0*z{U%te%&SjB6>x}(?V_FEhH$jm33?t!)tlwvK912fkfv1!PkLnVL4yO{7k?`~G
z|8m=pjSgagyU^eko<n2|tV>H!53qQLIm|8Q>K84it;g&~7zOt@cjyFVM#!8nJ9?h=
zhr)ZYz+0HZOvEK{puUfp45Q$@7Nf_@?ce8p{fOe%9qovT28#q@MU)OJORSHBjD&x}
z{-wCLzsgMfI<uHLN-v<pjDmYy+Y<!GWc@y?<4|~Cco+OOqQPsK#YB}A>17m55LDLq
z^X~t0+lSvsKNh4z*I<!ArNc4`CKJ^6f5pzf$Sm+FlIotZ9=FZ#G%_ugI8Ja~*8C1D
ze%lxU*D(uRt;Rr+Kp6#-2`;yPpZE16qeG+cV+3rEi3T@{cTYlx=L!BP`<JqM|0X-l
zx0p#Hfh03MMiGz$l(N7Q#|dU9eV+BgZ#NcXDz3pJeC(ji1Z5@z!8TosANu_FiXYFw
zU82EaLktPjby&9}XtC`7eu@Qtn-TB@o&~Qc2|U3}@Q{px5ZtN=-VeV?W`Vi{t`a*e
zqo7z~Mey^MpXIjyJhQs5v$OVH_Sd9P==UkO`UAr8{)CiBe@faXIex_P7g-*)+67iK
zKBCQU#mk_4rds{JYFXOVTRz9C+Bf-BIV+#1f0lgGk4Tne9RFpx-TU0H-*11?@;p}I
z9iCC7VCbjxTZ#~UhrWH6J@Y^2_>3c4ex8b9+kT!CZ?NY0I<3CX)3+)B$^6@BS=yfU
z-LSk0L%rOve4>2TXX+jrcTu=L?60l<S=V{n(=dFQ{=Q9*-bZ>qf#lce@i$eU|B#)_
ztob!J`qwyjT93j|FEV<UcHhEsOS>O)kA&vewb`PJuA8=(EN^h;L;gyMP4#Ji2ub74
z$$fj+UmHuO^NzXZ>{qNeEVnE#B8k%5cNE7D>90=m)2#WqyQ_jddE>6__5;@AmN|TK
zu4Aunb9aoE-^GXFLv1!V=e_ED&i119HOreY{2=MGzKI5Wn|mf&@UXwuSMCTL^d8SS
z>v+=kwDqR-mgNo>_Dig|3PtJb`}FOj?D-9Zs#^ksxf8Cfj=k0y%SjksV#a*~tNj8x
zeOGaOQ=2V6?SIn!wBtG39qV0-A))#mGxS}g|6zYk)^07C&Oe-cGUtr*qW!Avx?*_S
zBJ!eBd|OFlw)*QHE{#NjUHQ@6N!J#~9{WM-QBsc2u|IQ_S<f?y;)~p&*~%vZ=e<{Q
zZa8j1@D-%+4Mq=LHa|k2KZ)!4!_IFaGF7&#aKCRx5j^j>V!vj)VSN@kyloL_c@2Gj
zgIQnJ{GvroH5DyIeg07hPC0hj_t}o1$&@ODp_JMaeZB?1Q$=&WOB4-%27<S(FI!)=
zypAU83_n1_Wvu+<VaKP9H*TujR<gI?pzmnzshl&e^NvgQr_khQ6vG!RFIrwf5?{-n
z-$-FwMNMO{Gcf2I^-Q`pJ9jwt+7D4Q;AEVk6nuPIo2@!oayI{>_X<VBZwf)%OV+zc
z;p>*S;tb!1;YX~bZ^J*}ZDnJ=idFjqEREP7<7g||%gV5BgXFxNO+IgS1K!Y%)BbDd
z*oR2`;`X}>W*EEQQ#RFT-}r3#f4q4^&6d#aqJ8;?@{Z-6a-VUXb3W;~iWFYg8CqYk
z+(DbOmcP{no#o+hd24Y`!BE~B&t%SKitSC?_alWzVR#x{zQBk$TYaMROu_lQOP;4F
zc@06k5PStG{F3F3B!=QTwvE~POX$FR%F5`vEj<-lmgW-sUS#?%jD@&ZyssvaFynU|
zSE^a==Zk1ry7q118rGzrWh@gLXN)Wc=dWqwt?TQi%6F7Z2lo4q<Q-Q8=bRTER~$$o
znylo|`n)A;erpQ4gAFxRO~KB>et*on*0T<R+a0?p`f<oQW2NLH1q9}_*_so<(}i<>
zk-%#dY<^Y|d{OjRN#UC?6#FGFJ9~x)(f((l_$4(L`m*YaXgEGZjRDIzK2%qcqq|Ue
z2b(AQug{ju_&pE32)-G!1pTLNf1Orc4dc^Tz!%l0%M8zG{S$4xeO+X8)wa-{qJ8M_
zjQ51+l>4;ntn-5NvMz-%yooMn&o95Xw5h75zO1dJCm;n!#}vVB6rI^?KVTOrJZ?E<
znXNe<dZOrT{)N0txmVHQXCU}IGAIP^!ce5}O<3tKEBjp=*q=Se_<4(d7;~ZbG<+G?
zYNh$?Deho@_7WEAwwgb@sjS|Y&G4bQPt&p=SCA#z--#DdUNx>(2k;`2Pk)JP3|ses
zHr6rOu(^6m+0K&bLK6D<jv54A=be`vSCtfsCTGoWt-r6Nr6L@zYz}r5^%V^HMj^N!
z8QezcBlOsQ$R<(<zp_)sXA92zp7dT39fsg@34-XclEOFGqSjv)+)-j^@)07fF00v)
zSn=01HWat8+2hN{p=cdGDkqTBYwU*0j8kl%+3Owd#w_&2#WijF;@#DxS*zJG+J+|2
zA?G)liRts%H?^_OiN;NJQx)5Tdy4i34)_n}9n%TsoK+(sdMsn1Eo**b{(<7Q^7`89
z1|+aMFyJ4~TkRcJ1h+zP4|;sie%Lx&d%XNq$(g`8zb=8I!#Cpu?^tCd#F`+5$^EZj
z*Km^ku_I{HMXZp_hEVKXY&eFko_ux`NALnV$UgXadMPu`JIR-){aJhqo<#N}ggtHh
z2N3Kbt<26|+W_y|iyzD!<FVfQ%i37?MDxb*RMpn9T}a@*f&=~;MnR>;5M(42Jw^(%
z<~QzNRn!q`tgekzAc4ib=y24##vq6_-h&iUz8ZcfOBn(4FX$484kr;*wnX$ewbMbo
zE5wz150Y>SN|!afho<rA?WTu0`Rv-RLJD`H=_k?k%jl-rD>?qxf=f-B_9d*VMay?X
zu$LdVbB9jAP+Gq!{8~0d(BSeN!9B%$1Ef)76g-}LQpQ0=@B(^##gQ#P&$w@}urt_H
zSyxxt7-}u)3iKBYi5<zsHPyX|RW57^QV72a_!XTkIOn_Qy(C5J(P6P8x)$Go;LAwi
z!u}WGm^VN#wjv3_5LBYUQ6@TtPU`p6;X}0sAAxD4^d!ABdo7IrP1C+;usBcVc=@Lb
z_u<XlKWY6O<Ly|_MC<zc&9zgN+d{ibrm-R<hGi5q2%d4xsgdv=_zeZROIs=;XmEK;
zumda7?;pmF%o7w_GFyAB;$-lN;<LI2d#_+c&|zgq41!`y7WY2@qb@XcJ(93b_=$#%
zDc3Wf%bAY{S|>Qysk1YSFJJJN#qqz3+TYL38pUa-RI^Dnx{i43`x!S+X=A<PZR=zN
ztlCn(QzTI7@Dbm!6oTiRS@WC78xHi8w3ati*F`FuL;{QYM2E2>YjY>#GPun#TX(GT
zWaw1!nZh}2h>}3hHPK;3@D`Tj1=}60@q+HEU&W2P4NYB(U?2!G0!qgF7F;fJa~$FX
zEoaA&lU?a9Px+hSADJBgyQux047+{YHr(_J`0RG(Q{%|^Zax}*?UPL#!)S2D_Rt=Y
zz`%Z)$zVs~GRT~I&UH`xdP~|uja7BERSjjWNMJWQJm`;NN7i}_g2L}a=v2w+LNr)x
zh%SLjhi}SkFitR~vx#5)jP*^7MkB&cMnG}uk@)FitH*86#{;dI4v!**X0P<|zbWk>
z1k^@^t_`Q6Tsz*A;)FlI2W~{h$NI)Qupp7mHB*(?5F}7p5oR*8dB?pc^l=b^+4J*8
z3;IerLQR$7T1h&lB(PWMa11+Q5LEoiPU$0{vLTFto~NYby0Rk%!KM12hp<W2LB((0
zf3jK&*xW*}f&sA`p)`9f75|&k{xl=3gcWjs#6h|cf-T&i5$+HCI^icHAR3Gf5eY2Z
zj~!N{V1nR1@Qda57k7qQD(Y+M!j-xNVnsxUWfYY8G+GS5suRMG5l}Q(B=E9~frbvp
zWpL^Kd$E>v!f)PxvIpaW+mt=W1Hp2g@6rvZGvdFNKK?hUeYroA%wpOY8uB$;4sM4+
zuot1+)-v6+zwtoBA$1&X;HWo_BlR2+{dmuJqv&FAuKaZ66P2f`PFA0&IZ<=G=6J1s
z9IG>ql5mVY$?fc|O|!$cKYW1WpgInPjpK0mu%vX$>d4f6sRyX|iT~tg2E0LrraH!X
zufyeGtd~FpvzPAwmp=YCsr{WO>nI~cJx*B!F?f`Velrrjp?h1$uC_fbdz<$)?-!Dd
zdh$t$u1Y?>X?fLk(|gtbWZ-<!T*=weGr`lLC(2HhpDI6D!BKg#@<io{suNY7p1S((
zNHjc#XT~P>$G737v6G$h-S`VlbL@qx9NE#E4QK8@5vKG0laovhdl(k05RY7^D;I+1
zX^Dj2((%8vZ^XCD{aJ+sstgMjAWALACT+5BbN9B+o$b5Z_K><$9SM@j{JyKbXM4%{
zocn3-72l=&iv{Nk=Zel2pDj64db;#<@QKi=(5W&xJcGg3`o@m>ei*I}kFz_z0ghAn
zs%=9mb(TW&!SI9by#M3|hV)fPpo-wPc$`ilX!g?mBMtw_H1WTw_otf?q8uaaw6QK!
z3c-nijlEkSxP$b&$YJZW&M=9jE|K`}THdg|=y)#YhUc31O5P>^#rz8e=L6?-j>Tt6
z;0Q@itf;H1rJ=FCz9%w(SH@^~Efgn_#*K>O7A2G09~eKg^*J+sX7Rmd*N+cw=-=G4
zwR>CFc0G2lZFed|MRH%W<bQeB`kL*I<Cg0g_tT!IgrV<A{{<+X6N-grMIMXYs|$L|
z+G{Df*VfQg-xpbhU%^Ot4f;HxC~k;zd{FXd3uj)RGwZ)ue6QK{<3sBPHua?tL=tzm
zG9soNKVWz9L(7}iyNck=oa^puxmR_DzKeb&vEUpW=L+3x@&`&gE8FTI*xJxp--9L(
zB8Rca>I6m6=?BG+DMZHgIkWzo#rK+BKQ=VEYD52~-l=XPMv+0%XzPcJh)D#+|L-g0
z5PRMFitV=Jmh-0ThDf0>^j<;|VJP}s;9lz|IH|0ox;3H*cGP#{6*LfMI0{42=ZOc$
z&t!ehtp8^5y=K>s4NnfPmux>W3d+%$AQ&GJbvgV~c4NQBOzACFm0nf^pLIPWQV7E<
zafVO&F8JMJzM;b2U}r^pZA+xN2^s7_k9+ZJSrr*layX(S@qx*o$@-jG|IOli&8{Dd
zPN2aX`{O#S2x3c;4>cl6{)11k^zT~Uu)b=0$u2SoL6Jhy<OD<axGzdTNOxIBRa;$4
zJ=I5B8rvb*gA^t+d|>=c*5}OnZx-KccD>>!8Vo_D!$Q!oCG!;9e~5R=H`$vZejgpa
zs|Y^td`|RONukd0ve!M47b_qrDb!WbUeg**5bSK|MhbOJ9>S(PFn$s}m|=a+tp8^5
zy=K>s#jqfQ>xAFD4yTTUisFy(MfnzU%@2{lH__o&Y%d~%ae}%O%4nEN_Mwsd!J@v>
z?(&YR_F4!wP|8|lQ1n=rLKzJo7(YFJddc-Uv;LdK_nKX=_zkUFg$8pdJFJg`sUu;>
zkMN9={0ASiyCo9%Dx;u5Q0cKqA)_JLheq>Na3&JiQQcPC8i^BZQv~BuXrwhawIgQV
zU#*{JSf4Yy-dMj{D!$k3dUE|qrcvF7B#(j%M#3NAr}syUfS)J{e9Q8sd4i7T^pVhY
zov{#ps|f@_gM(e=9hK;CT}v`SU5|So96wqA%dkFY)_=43UbE}rH#{i>#fGHnaLP#d
zXYl(jBj8urR}~3-10BAL4Binf7J@PoG8W>755EFJTT6S&kU-I4?1&+Q=y9jg;|Hd{
z!Y{-6oLT?P;(N`m7k*e^G+5b?B?<l+{-fVz4k+^e0b}4>N{3&uz7!{jHHP3bF8HnS
z5zM8+tCa+b4r51}8`U`2rU-UENcofX(<RsE%=$0u2btn~&8~-^%wXm<c%cr9En!ah
zXW9>#!F)Te!6JcgT1AInu_Y6HR_7OkpJ;HmE`jLqB7zT4{tmO1wB-7nS^xF*tEJ<6
z&8~-^Sm0z0Hl9WdJEBHH;rGW_kZ+*DA2W*)3Dk91C-?$dEHX&;ur<C|{*VNxm-d8o
z36xO~J2EeWZK}vuHVn<ae|&vu$@Mw2>-G56rQ>_eu214OuffSPnK(t^_Zb%Cn@WQ}
zWDI=A@+La0kAg3v#X3Rwt@VxMs}Ya{2}%O7BIvN#k$HmIu#@+geSfp{IkW4{;(N`m
zho5eNl>{maB4ePwqL3tk{{>&}&+w=H7IT<SupvmGCK9N0SmrWug2FFv6n-)S_QoX;
zD-zdXnNO>6@PY9&S)ViOzwpZx-)nZg;-_2S#4J!zoPQco6oudSlm+=zY48Vldl?eQ
zC@7X#5hVNCSRTQwY6R><0=r}kL;@28Wj@{1^uYKnvw+7k%ZF&$*8k|aI=FXuTWoV|
z<H-6EDUx86oRG2;NrflYy34y82iu9s?v3@w21Ztm3~~&O49Ab?h#avIsvdL?55|UL
z@=I)F$(Pyv$8wnFILOq-GRuc#o&SEMX|QXsZ)jj>aCm5VC^{UC#yF(!qsB2Z%AwtI
z?DKCe-B`K4b}};2INm(oI^I6sG1fWO&CxT~%hAWty0v?K-{io=AXQeR&WcfEMX#<h
z|8J(gt8>dN;IYi|AzAZVTQXcTR8NRz`%ot-V|t>!!+pd3!vn(u(N)n^vB4N(Mkw8o
z_GQbH&e^=_z|NAbWmA=#YBtoZkF0B$Y@BGGXqjl8XcL;_o#Rc@9oxDo#?rrGKow&d
zn&6-u3oKPRme^QK4K1epmRZ1KndL*YZ0moF7mNg>RixReC&;j6sBNg7ki)KMcT`9U
zOQG2x?PoZC-ge$~%zH3@x^QR7_RyA!sjAJ@o9i}(H%2zpuWww}G}#Qx#{F%(I=6Li
z>Dk=3seeQNdML7tp(aw3$t+FzEwg~fGRuc#Eq{7hl;KLGka85mjYuLChgDe=!v7UX
z!Z>@GV!vv6+CG<aJom6~f5G0O-6cCh+scIERQ2Xso#V#(jg1FLdb$&WTYEP5iorq>
zDGOvU)Jd)znl$CN%mN<EEFY3Jze(Skz(@&6#)M#<V%RX;q$suu#h6|er3(}Nj&{p>
z#c?+0WbRDf!Tf!J>7qR)JA>QHww7<HR1|CA*l>`fr@K3Lkn(wJ_f+pDq)?}rEQwGw
z<(KThXZq&FZ%9oaxcI!;U$V{Lxh`*Q0Vx7YAxK#i62+*pD8mNBmLbZb#GuGS$_LuZ
zmK(Mwoo66;H1Cjqf5E;&VYn-ZB$jWfP!t;uHSbg9E4Hho*}5JZ43$3X+HCgymHWjq
z3V19`A87h{3*L}5KQBcAB$;!NRDnXUMi@rPzXZcZ#zfKQmf=<=iC-bX>@Dll_KOfa
zm3u7ji0@$j{=nYC>Eb;lyMsGJN)j6mHIe*T6%Wt}Dk)Sln4l<Q;)1Wc_}9e_e2d?c
z;V-)T+kKZ0L4U=}>Hp6EuS{Rk-^WIzEDD8<3en+Uv@8a};c5s<fg^*VP;6nD>kkOU
zpiG19s^h%tjQdpXaqo=pkpDozzQEohVYnxVBsLtD@(c0u6-K(<WGReO%+PiHl(7Js
z?7(Mw(fk|Ir4O7xYyMNV{3weeg^ff4ONT>>AVoby3Ww_`w3T4UGS{a(n!JhxUbkP?
z33^U=kLDfrAIKL?jx&Vc0k%i=@)cr7^uw?wIz9e1?Lm46KGVyLH!QP!NY?TfFN+c&
z!KWfPEHWqrl@yXNfjuFP29_DV#ltb_wvfPUj!VvSB7+b-mN$bQAHbf-XgFPexba|8
z`HCch-CGjYBpZH9Iq(^PO#V-Hf9u5`=pM_5+!Mbszi6<OMIoJdTn3{;P^6IZJi-uu
z-)4E?E$tN~@P_Rv$CJ)E*J&ZB^w^NXeelDAhy=!q2Z$v$4)j>fX|m(Dlmnj$IsLTZ
zUUi@4L+*i}D(jIyYDi!(suL7jq6iKN!7$4VQps4#43MH6D{|Fw(RnsO@TgC$37RbY
z#DXN1uh8vCT#t=8jVXjo4t%B;C%eDhSNV|a`NhkktX2}JM#11Pso2qCouF<_YT@@C
zEC_`{n8iq$1}PMB!F4uAWbh<1D0+-F5q_$GPjc}9!;a_-l{GO%klBIH^zwQ4xBDs|
zau57eSr5@*T>_O3&l6N5A$m+A4M`O9J{tV0lECZe@MY)u6oRupTwLKN7KDR&wC?aQ
zKME$K&=i2F17GGAmYF{AUiqoQMluE}36xRLkU`y+R4abpU`qxK)?QK)C^{^5WL^f3
zdym#15%*THARP1MD;5&mW*Pu&7QmYspH|ey?n#CK;(~5GG&2MMXFY#mVVN}A6gEPG
zbqN$Zyoey_Vx&-rqzrvSyK8xYG4KXDd{x;IgCK=M;{3#dhy>0%+$7r(q>%1s4gfX_
z;5FgbY_XR>aLEATuuaS8V3<8WDeFNha?xPMK%=w~iBgkfFj@saDHQSv8zgU1DC9-G
zPzXAFg;2UV*BLbqp2|HDQSzr-5Mv%4*WzS5BK$H30GkHzW(G+3ux1J1L9;{vG46#@
zKV``P;B24Y^s*?5AEn78CGTpI%PI*JOKfOyh2r;ZEQl%;!YoFYz^Cmp3SJ-(hcXvf
zl2bZAV+2f*z+D|l1mnG5Isn)-fHyNg6+SGLDV7KzR_=vDaESomd*Nr4^)MumnRJ4n
zl*toYO!_6gP{;?2fO@GP3WZp&D;>Us9f=c+^J5i8Ka4VPN(Oh%7lTXm-a=$Tx&U3%
z0Nz9kX@7`^4-3Jx0mO0lLLrzo0J!{R$A^}$v$30;0Lw^HHJ|?KM->-r*`&4lVYZ19
zeD7JZNq?1^04rU9u4w>oT5O3w#>0m-j}?Mx0*K@8g))Yu4FGPO^<Qz_wtdMW=@BH2
z(tMgKBaM<ufYm(g-|swUHA`#t-E8?4-Eh5Xd(UE){_0N>9o2tW9H5)L{+Bj@cR>pa
zeocfAYdLlZE(#z{xEE^V+)on#+`hZ?c-}?Vb^8m}S4bWr39$53N|KN&DFIe@V{lF0
zPS;WUIqOrFXR-L{(^_Sn=qY&7d(-)n?G5X@mQ3ldzPF&e8nH2{0l=mKyo*{$`9o6p
zu!OcP3Ls8$FVqN{Srh==wYBm<;Dq;l&Qp$OY|j&LO9~d|zIp;IN!KAsPrj~wI#5;J
z6By6k?mFN&Zk@ASwxmyM^-;F+S3c*z>VD4gqU|;7n@aZQQ*9-uzxr-UXZ2rF19TU!
z|D~Q!(?If{Qo@I&{>*{^V&R)`FVsjhu^<4rcXRFTlEe8YbI;{mc0O&t3BwmjNJaV!
zMbSur_3>~h(h?lZpUB<n+UuCHpM>Eh%i^?FZ)eM|@^s)y@6$QYIqul*T3@%m1;hFD
zEy=Q<+*#x2<N)1t>wn4Tjcb<pe?j=L=E#9yQULJ+_rgU1z<nF*ca-fbob{c|oy)o8
ze9Cb>&hQlrscf*Il2Qq2jFvV<D!PiI{>j`eu04)}_M^5ZV0h7zn%3%W*8H9*I`6yU
zx#7A6LDFAY->|$5LrKkMB&m@!LZ2?t?fC%R^y`1~=a*=}`16ABVeE+A4Z%bJ@dEck
zu_Gw~zys?Ww^r^hIaqKMf@gCsx~@2`Id0fUfMva7eHn(L&rlpGYOASf3HAj>eUqLk
z*DmKi#}WH+>l4;<Fq}_o^+LA%s!o;6<zMn%b3X&Y+qRc&uOfwS#u?71zxt}t&qe<m
z0lG`A{~6~OU6}fZ)b&%shc&lN5sU{A>wJ^k3#s{$900s(UGr4U&d}bX!}-VZPUW6)
zUvOPP3ZJ$=6K6<TDxtW#u(KlESl(GYm_M2~>DipK13f-upS7KgGt|>s-N=?-^~ut+
z1sC(Kc%II=>4c#DCG_|;E9tM|42>i|sohQfO%KqO^_PqRyvdg={yBO5G#);TCANEz
zK@~tO=NGsaN)@w20Px^s>!!Nx6?;nd2WI@o;{-1`uOfxlZ92o-l8%M6Oyv#Xs@7m{
z;gElgS0^~_*iRB|(pp(hqstd8mq?6~WuP_bo6Z!T_h0fp<-YDR2uj+VFQLiv3_n=h
z<D|bxfNrMsKNY~6^!_wI8tbPi;lrBCVe?=s2q4zaFK{nR3IHCQY~L8!TD2=QU39Qu
z)_2@{%5%nj&h?~Im%<xyhA*rs=m|E})HIcKmh=Z={<Yq9$l!L@ZpS{PaK=trD=TTO
zEElrp7dl-umyZPIUUT1YJ*#B!MVm^0r8N01i=@9IK~u(;NAb%Tz?<><rNW1q%h+-u
z7)s`w2>;bcwi6Ek9-8Rb*f3SSvusbvzQV)#M|~$Ccv=X$F2|+thK&SRYxDa`TdKnK
zm2JVEqE-1LzA;5`n~Sto4$@j-PmT-2toa=;BY8=|1>Ys_Rdo0n2tJP%3qh6s3MnLo
z7!1u`4`lsM$}gvc4{IKaQxROky-@0JsQ}=i@y-p6o9nh!><&&B?GGIBAIsAT&S6W0
zpp1oRGHIDg+A8Y9)y-ucC4GS*|7hNrcRez=%|%)(^jMd|6WQ`p>6^|5&ikK40!4>O
zf0ZDp(qAEkl0xhue+`d!Z)hgazG@c|xVI1;7J|7VgJ*LtxGpI@W-KHvQ*lRmV{L6i
zMO(1DcpxzBC#{ttxYf1Oxd%Nat(ATDUig)sDLhB&4Us_6VbWhIf}|c|Mx)YSS<MrG
zeZhQ($GX?IY>Z4*?<hwCi}wW%p~I56L??J2f=UX<{ewlFp(aU+(;Vt3A+1$G6gx7W
ziwwHAx^^%poJNliWzSEgZ<<4cNx+5#KAoco;)Z_PZV)61+ZT+-^y7!ede*f{`jTyx
zyGWn2w`hOC5k>H1oS;4uUKuAWwL92S8L6voEN=~VVMR!5g&moI;3f!)E!pGPe=q!k
zr&ao<iz<1}HFQ|)2+5n`1SS2|tEL_Jg5MAw>z!=d$OyQlYNsxNg$MIz{33%o!81AM
zDDQiD!be)B($<OwG`O+_3G6BAM~6pz^903~?8%m&O5cP9iEA+Fuapjp9We-sEqUb&
z#$)>N@SAAg&_s%i>g^S}1d0yN`i_x)C{EBl=ei(itw_sME{W-?nn=skg%uGU9?e_J
zY*3d$2;LJv5&)9G?!4cSK+$0mQ8Jf#-thuj3_;Tle8F#sj`dA+Y)}cD(BQH?G6s@<
zNM<rA1Sj&M0hN}iDqL67P}zzEb|ZnLwZe|f6Wn?a{8aiONnodJh%SMm!=%4T5WMpR
z<1zhs_%VY~37oc6i4D;cI33JCqI8(Kj2Z{eOn74j{UxMjs;sZ8C9Rc`K+;;F!!hhg
zf?)Ris*Lna!tY5n29o|tje<!8O*`-fzacu-Ki;{%bz{Tkx~b}I6$uFx9Zn_)Khj!t
zg%|;AYQt3x6(WIM#iX@Dhexm@GM`3^?~R{W5T(H~iy{4$q_7hmW)zIe;1`U?^y4So
z<L(LfB*!|AtV$PUERmwsRd|{`-R=SRusiA=aj$l-;aKY)<8#J6<DQB55iaTR40>W7
zIj^6caO>wL$#G{K>qwyVaKBcUP~_LYCT}ffV2tUUen?OBKCJV|kv&)Kf^FE}7_6!e
zMM|5B+6p@hx(j;p`}_m`RsKQWkZ(9|I4|mrdBd^#I*d?JZ^1zRpnu30^~Lf=@<zR@
zIo8OLyOv`t_u;0eX`|J#V5~?L`Sq`svhDG*=VQ<u<NFKG365+D;<d5gxhAhC&=QPP
z)YMdj%Nv4Cr7b0G#T`YRh24RkKyRS0pg%vnDbn3o-%{0H)>+z9+*{Zm7$_LbAIcvl
z*C#AFMtq}rqsxjd`dzRet&df*cPr)NDSW;<f3+WJ^sSxesPp7OExUu&<8#)XuJPQ#
z{H~&wU}Jf>s-~tYTv1=%7-|Z(l(v?%m2{MJ7L!aYTHn#w&|KSA*-=Ib`jXz_zM}rZ
z0S44T4oGrD%@ctAE+A$2jM3&;U9=Ma0V!5X(QX(PNa1^-C>$vuuLx#GZ@sK1-O!HO
zx4R~CqyD}?XK`z=sjR*-TwPmRT~`^YXb_H}mJlh}#v=Xo?TwAib*<Iy6&+<=q3+Ti
zgJR*TIL8cM{h>HD+7_#i)<Ccff+ew{SmCH*h?kH~G0rh-eyY&C^?+lmYchA#H<;gB
z*jdsRY%Xi4h*Z`I#oDTHWqk#y0N3L--P6zp!AMJOTNMn;yFy*To;XD%jfD>dqzqqT
zv?JDt|F@J}En_4sjTS40g(8PYVuE7U{8XWN%W>;o#}?P5XEblf-yi5H>JWyZ#`1>B
zNL5`;U2P316jcHF`mTmnMX;{5roE~|7?yPfyLE~seZ?8R`a^MQv@6yUtH*n}YB)|1
z?`9a{<5##^*Ji~qYksQGyf$Z<vF~wAL2!*X=3kZH7w9hTC}|D0gqq44D$!?Bd#VER
zk-qv)2sSm<w}e}3+p60u;|xPO!%}j@N;d5Mq}e#kaU^lfuy#Cy|6a3YBMCU<{Gl}O
zFoS%K)@R7ceoU=IFK)k}J8YAT=KDx9y|DcwTr=7;(iUq#0&9{8mJW+&PzuAW`6UX?
zTMycIIXCA_de(YJd_(>L3~v{vqczwXLYqm8N&)%ss1%T|@0cejQaI1BD=1~+J4xQU
zjWkXB;*|7$=zZ1p+DjTqj=O~vs6zRG;yC{fGk#OJ1#M7m?@ix62?D7yO{VD`iEm)s
zDY>7cy(8_hrf5A;9My4xVoOS+B86g6@bnWoya#@&&^!`&)V9yD!?ihQooCEDnm6oU
zRUn!iX9&M|0eP&5Dj<&xHjpBrPDx>cq4ZyBBMjg`wh4;6=!M=>A$3s0lVq<YAW}L?
zM)EE6QaH}P!;Iey^vcOayhfWy-@Zx^m`7y85Ke-d@I}~(w96;#S9fc)Z=^HY5~F;4
zv=)MuN{e-Zx)j0?O)kuuUs9oY>k<3j1i{tbs9(lHG+7vul2;Xw532(5jDo6wyg?8t
z6ir5#_5NGP1lvk_-8C@U3`L=|NA-d|rt5Sp-=~Gih8u<Q1~hX19cKKF(N3S1i(AGL
z?Z*=YTG@<i;VzD%3mdsl+u$hQVYGjg^6_yAtcg-EG)@p(f_J1Yg;?dR`6UX?BZ0@Q
zhwRgi?a1JI4~6C#33Vw%li`;rARm#kq4NY~B*dDuDNUa5e-VjcJ7KsQNfAa{+4I?{
z;jeHP-_Zdr-(xMqm$ZkR*&<6D;4JSjdmTe-*Few0mTubilM)1qTSzOkql_0qQ9faG
zV3hLlv1TMNf)#<_P^GfPA+g5Fzf|<N_#XJB6q>i~x9@gr(+TFvNEr1GB8C0%TU0>a
zkill1U@cNuliYuCOlgMDAW|?!;?9jw#KY-rihb<B`!PSqUWl`DOX3}qP&9kZpotSu
zDdLT3+fPXlm_yFpI+7UoqRsLhCUt%kpN_?gcca7ALlV<L`FIW)36&Jy1HVL}dF`C#
zBsxr?`8dG|2#)4aXx_hyWZLlp@?i?dV@0I+bO*CR3dlD#LQqCRq%gJrK9v(&|NbLL
z!Gwl?*!vVdIswjRE#Kud-XWA@NS4_vHU8JQTKFW||9}!TXHi(mk>|B3$AkzBdq|cX
z(?(Z~_QWV3j|PhbQoI`-u3AK}G;4l~3eBU#6q;u~y_w}5A-Ebnj>2zI0eN&dl^}Y&
zu>W2ppoY=B13i*H;=Lgnd<iic!TT|vyxA%@cms)`rH5v(h4H^>+7}JJ##ChkVpq+c
zI-_MVnotkJZsFGpKhfaEAyqzJbU1~e%&N2Jmr`h6Ng##h83lJbx4Je#Q06mhyd#QV
zynsBj7+nH;gdl}~4H-m_7xzCGU8`iQYM}~9zlO)`$M`org9L5`7d0jvIUZ{OIjw_o
z`{K(N{AF?c@1pj<f+ycCO0n$4PIW=BK(mXRV2JzJpp6bPgQ0vpv*<=5q2m&W4$FMn
z&|(N?Eq_M*TT-EUba+3rX$s9J2(E?Sf&%hb5mi8*Q7}%BId%Pl?y6ticuAFMwOETb
z;r9vtb+@&POlH<0ay44M-R&mxcquYsp8sC_tM*_CR&!~DYh=M}QTy*vbn7Zp?Crd*
z1A+m`Kknr|mJ7c=EJz0$tV^I+5t+$gN0JDJ?tx!Qp?Rz@h30Ll&^(sdAUFoUv<2i<
z@o9q~Gs1NIq=yAmUr>C$#t8UjN^xDIpw3PPyiWcM!5q8vuPpuLNq<fCUsK8L6~RRN
z84?7_w}4;&s45?ykU*IUiVmj`%$i@CLi5aHC^U}_izU_xuA>Bl@QW9a$A-j6Z!8ih
zI^3C%!6oaT=P(eJQ00U59!UZFB}$f^=W#$DC)!Z&e7ncx$fbW~FYyoBhxy)sJxz}P
zO=>?wf<UJx<>TR}%Ex0v8YMTbZbgtleJ-Q4_#XHzC^WAm@R*fC^XRZxVntB#TcUtG
zPsE*AVx`4P_ur-YWY!`6Yf{$!HX3%4DfK!=!Dhhb+dM9($FlVG`WeH&YH|E;Qu|W&
z{bknW=6Lwt&L}v*;9HFm@oKi7{VfNZ4mBQbJlt@^IHc@H{fu!$W;kYhV)dIM=4Cx7
z5$KyOnhnmD&sK6&%~sFWsH1jPKk8=n!!&X1tp1Kn{dY;~E8YL^9o4^cs()jc=0=7l
zzb)74%yhk+oA<9;9RC|{Uo6y;2?7VVb?@q!Zrj_kzxhD(fu{7uKZukZtZVIVjMk4u
zGMDw(r@d#n;kxLZ^PdhpQFN;0Wa)|EiO}(~<K-L`$19Fis-x=IgWxBwwD6Pu83ia#
zqKunS#crk-RHDI$&*YUk{x{zKR~TW*NI=22bK0!RwmO2cHzR|k!<+lIb?@xl-44Te
z0g$vLW5KTamX5~$`e<ZzWIVD|S&!YsUcPF(>AdQG(tF-Fmw&e4OyTLGCyGy%oGLw8
zdNO!2bTV|JjN`%Z1CH+hXY_9lf!c)*4`W9nOfgK?|4rk6rTq*E0;3xSrh0`S5v1*V
z5)36OZJ(0KgSt$vF6ymnZ)k3B>}eQ);b>%SBwbmL?Zkn-XL-r?oa5=7E1pZ<i+Sh$
zLh)=sIG!%nNuDY_^`Q8ft<RZ=pJo={Yj%BX{ov;Qtv%bicXa77xO>{BjY2dq)DK9m
zDHtg0tZi#-Zfopn=&N5<Pg#%1n#forwXDY!(SyuqtuI3GhU=PJXL!MP-hVEC4vJ?B
zb&jVW6hGvxCe!+yS^v%Ad(EyNSvRz4U~``k+#V;ms{?Ib$Z&1`U};ZfM_p@Ub8BN~
z17$tx2a!X{dI&{T97S|`1Nn0(v|@S9ddGIl@r>)~oTuDZaxZzG%)8*b;5#Q2l{B7x
zVEjzh=gj(V7T;@j{mA6dhE<#TMTdppeDMl3A|@no@0fq6xVNmcx;@e&1UnkKk-~vE
z!%-NDJ|m0dFQL$i_NL{oB6!nz9Vxu(Q4G;$T@s=A!1$T0&zbe#EWX$5`jN@u^@Eb_
zXNw{j9|_};;7KDQWhn~#gWVM!wQUg|Pa%jNtFj*TLrM-Q>miaz{tZ=VMWfJ)^)@ni
z(|JQlq0Uf7M3Kbv4~(D5`kYz+&Ek8_t{<6*t{d7oK<NOb!y<({#Fiu-YDApy#R3B*
zy=7fh?RBm75Tv961bdJ|%6il%N~4fJB3@_(9j4HV{g&fd=Q9))yyg~77CBVf{J{8`
ztk0SC-z>h@?0UsdG#DLbCZiwoBcZ`@4`nF|28;VbNMKESSSQ%j&Pdpe6zZBRk1jDS
zQD{Yb!}2QW-Qxt66zZC+7(Ot53krBFT%R-Rzgc{*+4UpiG0F8P{N{BybtF^_CrIi%
zRMcPEQ{GjD4u@MCL<S*9Sr4p<NTG~|L-0!}w4w+~fdSEDkwTr}<p;(uO#zSO^*OWt
zo5lB<T~GRU;ipTWK9ez|P|XPqJ*F&0{%|20tjc;&RDq%tieQ@}Xh`8ImH!}KXay@m
zp%vybx9tW&84E>|uRbt-M*Qj$>vLxPH;eBzyPhO}!cVs$$)n(ckx-ST2&^jZ3nGD)
z=&&TTPY_gk+#_Y8;FnTp1s#6bAgD&dxD-A>exLC4w&ePpS^v%Ad(Ezg-{i0m6dR&O
z!MGjPrO=pF@0#?kk)-0H!SS*lZDL0nnnjB#N&!L9<6ihB6<Q(1yDGGjOi=0Z(+`ZF
z$@-jG|IOli&8~;vgyN@baOx;14CA&0em+%}qJ**(B7xOX+z5h=Nd&Pb+M+@$=rDy=
zm<=i|Mh0ahyzv0}Wn7;#>%Uoiui5p&j~UFo1~1fMWlK6m{@^!Ms5CfU)&m`0L=b){
zg;tP2DVC%Pttf&rpSkIL=7I9dus&zje_8*_6yIxhJ^XYFJg>pV(}-b5<o})VvJ~;M
z9&rh*L5C?yk&r=b3H%lnT2VTjO7PhS$S>pioLT?%_0y%}d(Ey-;%C^9#X1Z@;it+{
z6j7EUsK!9CB9!&e39@S|<6xWOmr`g&%3LTN)(O6VEk*_(D8CHrb7uWF)}K?#`zFQr
znq3b+-2#gaZ;IQH#X7uAm8Hm6WhqMZ!ZM73l=X<~uso3{f{NdQLMzN;UMDtRMnTGi
z*c8DB$}hwEoLT?p*Po^Qa-zI%BEHw`dc{w-z{V^vZif?3BTA1IKYx4#OqM{=VHpKw
zE(1Yri9#!J9ZnE@VEkNWJ<NRGe1|>r^^g2nKkjYIcwJ;O_xJjh#TtDkK0jjP-E@;F
zQv;9sxr(^cz<u`PrzCsawmcEO&rg0xE~{dnEkH(=`+WUM`+srYisAqFd-|$kue^k<
z^!rr8f7bpN8}O&tuKPVDSdF(v5DD~Zj{A|If6;!<g#Qkkx{dd9`J;bLHU2_n_)_>a
z54pYW`Sq`wXz)i^k@a|V{T{xAUpY1;fHOzVy%YR5?Vq&&!Ir~II2Mc`%D;Cn;a7pD
zK`}0kUV=qz_eS8i+OM_0W7Fo#xF7AoF(`Czmp}UN7Jijza0wFVLx-LBM)0@VZ?vCj
zKi0m11fIrqrVCO2y?RyS#Z!p}2XSUAK!@G;M)2RYf6@Lw?XTIikwB8e$`}8k54wAh
zH`w48LIMl1B3?#8NA?rqf7kw1`=$0j+4lMaT;R@2L=1k@4}I9@gCi*XWCScj@k9cB
zxG>)X!GG8OP5Vbiz`q~`_B(Mm^dVfg%sp2hTo=C3mge~du_1+6kvv907nV4ybp_$~
zf3^Ru{T16@UqJ$&#GiZv4wyLdf1%y_oqf2@Pewq|U|j;eY81?h;D2bp)&7||^nXW#
zzd@kFO{~ZsB8wzY!PI&1ch<>ee2|N7gwW3_4q>(M7*!!@J$Q{r7M+v#n!Kj$1#U|L
zx?j#`RH{!&`JAJcu+8R1u7iva@($xY#=A_~e+W)nAy^v!_Chh0WUC4UX=Tu_UDUTM
z_}$fpEPj{O%j~C=`y>=oD1C~9VYOzrFeVf*J_yD6_Xx{J<CRPQN)q2*q`;5^@fDX$
zuN9m%dY63Oq<?=!@K=u(a^$-4mG<!|#fqa)Qk?#l6x1`kqeK=cY*6&`o+9;KLVT)u
zfQiZH|ISY5r0X3Nd(hu9I+wS^d6O^ra~$q2u<uc254b#T@)k?hSe;NQ_apQxEX$h^
zl;FA|hE-k;{>#I>$h#h;siKp)o%F3Z>F$>^?lmEMl4mFVc@uvBr2T|vw;Ak3ovqaA
zhoQq`%~hXNz^GUJYwhnTNOYTbOt8jP4YFK$$hIhoUhdhW^m3p-#+@%g&yWwSMlJvA
z>woFaoAmED+W*02`+00`mlm;>JBpmX9Jkx)L7Q{=EdRf1zu@WaE6faMh&LWkH>(i5
zPFXLs-rt&;$yfS3PurOr%Uu`Q5bY^JB$C(v7N0lyHuCiGV;qhz5x&`_HCU_crOtpW
zPcgK6EV(i}{GSkhTZOPru~IF85#@ZmEce{wVui_f`JJ4$GdC6~^v4;>(?|09-@@}I
zUrN@menQE=yV?_Ehv?E8tu?kXq%hy*g`vwM>ob3^{W(_TRqZk|xRnsaK0;?|5&gnP
zOnu`hY$%UgNnt(=g<tCWU+Q@ie!}OcIE{V8%z2i;n=Y-%QfsR~3JYES9B+=SFZ=`R
z3*SRV@9=aoqXOUt31e;Gqe-vwD87VHjvAkQir>QZzvS~K{nOX4{wq(vuOnXvl$B^^
z&Rk^=If|VHE`5FBZ*U9v7SG4Gu+=k!$*x0>dk7*7o3#GWp0+V^=b*<@Q77H{-~4%#
zE{J^nf~U9d6ZUhP0Ly8f-8!{qOT=1jFL#tW<@xxZ(BQvB0>4U_>`g-8Bt&cjq0|FN
z;UoJJ+vh@$<ryR0`k!&$q=O<~zta9z`+tzHw+K-+g0NdG^^AlScK!MIXAu4oGWr35
z+Sj2x!+z5y@>dL*wEj?^wlWcOCipF0|I^Q#bYWh;B=GEIba<Bi^igE6#nND{wF$pp
zv%>l}Sn2N(tSIH?u91#oh8fQkp}>#yO9+YlEw%n9=S^Po@+IppZ!^N3X9a7D=Z#LS
z6@qg9H_UN<s{J=sX#Plo`?MF>D>(zfY4Q*~;xB6wzjW(=rfYz@eEkD+yg%nT<6~wi
z*IB{Zk9NvTuhsG&nv9VD$SnIOtg(I%#_#i<=O`4T6MV$^NnJ(Vf-GbG?>AVGzr#ZO
z06On7OSwWalxZwP6oSI<U$G!RXN>$yBv6uLNU-}&X56Qd!N;%11RfK3OyDtr#{?b|
zcue3ifyV?M6L?JEF@eVf9us&>;4y*61RfK3OyG+mu!_i*AF+D=E4&>4g_X>IV?F)1
ztZC}Ue`r(oyLkBgoE6UBEI9igtbP86>DT@ayBS|4Wr(zquKlByPgyzr6?-81XB(fZ
zw{G6P=@<Ci-w=S{w}=t?U-a--aQZc*^qvaK0{4*RE<RL0VTJvdDQAWAZ%x15HPS{u
z#oqY$*kMc8e%LmH@4;WOgZT?QIewkQSig&=AMg;o!O4!;^Z3XA0E+*EmG+;pANEW3
zB>pclX({l#df3(eGCnFlhT=~-`wPYSpA~7dum4r(wZiauq%d9kHg`3v^l!2^@?-W$
z{vKZhxl4N6au*+smzCX|Q|wH9#17b>(v!cYFMo&E_s=!0)ang6y6NNdNYQuD_5X?_
z{VhHH8GZeQY4`p!R~^KYW$jY!7uqd7w0;{Otk2kE`fGMFrEUEV{*!6@56c@Y?B(o6
zlV3v8zCpCh57`a-D@ft{63BH|*}K?}y@VIgN9=NcpWX1k<eV@zdx_uu^U$*`)qYU(
zl(3sQg9qE&X!7^)<dC-i21)-va{nHgmjy@N_3Un6h2gts^B>ZeA8PncmfCUyt~y&6
zzEMxGFZMdSkl#X9eu%XErD^ZP^Zu(U{@0qm{W7h<iSOwI1h27k`5|q84=MjK`@ug!
z&L6E;Et0T@eZ%YQAije}eM7@{Bc%B}?owyO+QqKzEFJ{6@x=I;bN`7E&FqyN|7*c3
zZBhFbcp2J?kj{+|yv!K#4!rey_ZL~*AG6Q@U7xSSYiBcYYUk0Xm*J$m*8+Br-&O8t
zfZ#fOsOIovc^PdJhGwtS_+JaRV?q1ko#_!bn*qEMX7FIUg%{BWcyj!aF8^j9@X(zP
z;iET!6rP|jH-(=_NUq0S<gBtcLU4k;?~_Q%Ewt@j({H~x{&zw965B~2O%a~EF-C|P
zZ2e8Vh~80m=R2nDKa6iEWyUc|Uk;(kieIVbcX)g`rOp~#lcf_6w4Lbjd9?hF>DNye
z|GS`lER^QR<u-NVd%7Kh=V<vw{9xWklON4jG14%Ehs7T8-;fb7V9j;A1FkYht*se?
zYnU+#-7BWuOCSH6)V|0!feUu#I1xN|$GJB%j1brGB6<}md{ke0Z^m?fW%SVN^yK9f
zIV<d8MR1hSYCk$|_F5|bH_^WEE5YB|;$^DTOjW9Ncu&sY!FCldqL<B@e~8Z)5F^<o
z{KWscL~1R#-TAIkM!|@+1sNQ{8c&;k|E1%9<L!%gYmjIehZ6~`#p5)F-0z0q8Kj%>
z_Yr<cJ-ax+Qas6XogQyap)2I5w%03y(o?fnrug4@`)Yj9+!ll|gx7cvp1YeEIfURP
zv*sW2^B6|mdoqY{<k>xLw?C)YS?;K@HCS7b!Kmr?&m8}&+7}DV_@G&H_>gM6#s_#_
zk(#hGV!s~sm(EY}Z}=^_E+nu39j>s~#tE8rPxn8~5dSOfOMHn>bJ+MNh|hfowrd>U
zQz7`Mzm~f`w*umOmvsx5yFRx<;(PC#;d0mKR!n^F{SsW}`rHbN@4Y{Q%UqvZQSrU^
zLvWevb1N*q_udIEbA4{b#rNJT!DX(`t-$!+dn35a^|=)p-+M0vW%Xbg>vJnKzBhY<
z%UqwEkH1`D@x9p+T;}@ReEj8#i|@^j;4;_e^!Upa8sD1@!T9?9{jSdmzZD$cYfe!3
zEn|Id#mD!W5nSf_-0yvS?=lN`EVF!w+1-8k&o8rp$1=-@Jp6pjzTq+pcr3Ggh}pM$
z_|Gr1fX6b+hdlgz%)a3=3wSKEe2Ce%d-%^Uvw(;G_;TOnLmqxUX5TQ`fzRx{Y0k^^
zaT(=9(zN)9{VCOf@1DQBF@3zx@*(&9c@NzMiyipx=_?Aq)bxS(borzI=5!8x_s}ch
zw=jL+JzW0izcsxBU)KE+e)0f)pXEcc{+x&QjY~Q3W%-4dN*|cz<&XZ`mUiIFs&59r
z#pwgHy8O|9Qzi$#EP9tIeP9-sKl*RU?7(N*zj=P?(g&Je{^-BK)PXN^7j=I3T|Ojp
z;}64GvjAYT0A7<m*ry1Z%Md_pIZDvmN)7-v3*a^3=Ps}eL2$_cVp|v|)0G?mY!<+4
z!mq&M^5YV|WB{?-hRgFx4gfX{;LQxMRP$OLAqs#j5kOq1S?us#xdFhY0lb;{iSvg$
z*Iq>^N!kEn+zT}xPF8LJuxS8qW`1h@+2#t^Bp4@605R@`QkiAt1^_3o|D_G!&CE}Q
z4{L6Zv(y@a;GzIx<z6V^mn$^@ICcFmO#p9Ze!BaX$L{hw%B>9$To6F4+zU08XMbe|
z054wuTM)o&#!rP0iv$+gtE`O>ObH;4yBDt90N`}%e<=aH=KKgB);x|JpR?3fV`)ND
zlLCn2?u8oFMpkA3aQgMXqyXN`{0#S_Tr1Uj0**3Ut)*E7O2-3;<L-qTl?Ya50Ps@l
ze~AEIbAGz}k;j>nhYnZR!ir!#fH>h^s1ZT95(9vjUjK^+@S5=x4JCY7%e5ha`OZ>C
z!HA^=8B_tp3HL&o#jL~t;7sd(Du6c|eky!eLjt|7LPyA6ZLL=X2_QD~o63VLGXOaA
z`k!eDT7CaWg%4{UyW5@PcNRO!Z8g>gODi%MO>!^PsJOK90)Usj{%7pJp}`i93kfVh
zhbwHgIzizZ5C4_=p(`%{IJ0^_KKq!!V*-x}JSOm%z+(cB2|On7n80HKj|n^`@R-13
z0*?tiCh(ZRV*(E`0T(+HA+`>yIK<a+;bEV07Zg}Z*~6&7k6zxD`aZL+jgkww4+o((
zvg<BvTYk43wT0cbLSmH4)F&99VSI{N`(ONdD+Ej9-(Kfqe0S-ecGoiG@_VfT-V;(3
zh2wlL&AxvtIbj=N*u?+ySIv{e_isk4w%9%R!4xZsa;NmqkoP|tuUx#ON)q2*_!xb~
zcl55-;|RHPb8SA}RjfFcK~nFhS^xT==T~<pn7;j_1c5euvHW~Sk@_wnt|~KY{-HUa
z!`8!MI+YqFB>n{N$8LL-E8zBc?74{^mck=s`u%&5y)w2Y^OkCVQG!4R&M<i-G!~A<
zrtLpGZ*a1KEqA{->F$?n6odH*B|&DiE?b?m)a8evBiAbLEMT+Dynk(IZUsDKdpB+S
zX%hrmG@<B)`J?o5pg+c)FF{Y0eEo(pCZACPWslZnjX27kML9k<&Oo`gT;6NCeTmhm
zr5!!~H*NbF5(HYI_-MWCY{AN17ugc+DbXlF^h=7Fd`K$mtq^RmS0ROgoIGKOHe1a4
zmm2?@ru`L`AkeP4loaavS)@_WOf>jASdl02e(bU~+G-qS&LUTSju(ce=}C<JT_gT?
zQTr<{L7<J1N*{aj#kW@a_frb$d_*$(S-c;+EKRmrdj(QhnB&j!n#<q9_}>NXufPO>
z(!(4a^2BDU^i@Bj=++-n_~<glXC|~xOS3I(uX2Q(#V!D6e(C-jlD<5jk->$Ffw&nY
zwZ9@01lkz6bI@bNOXTAh6oL9a1z~Pem}466M+jOY_G+w2X=ZwQ`uxT5zbWm@2=Vun
zI$BW)0>$>Z&|~<C2lU@kvg})wL%Kofo}H9W>9n+1>uojm3Uhv9LCUegMT`b{wCGea
zm}p=3ulX5wXoV#RwBi!#6n=($e1-(Rj1JFId}dS;Y_Qd4%WrY~Z@his_iM`K{S8+B
zdz6iOhtfUQD4#OJjA@E80*~-ZdPs@T-%?Id<l~2w+j^U_QRgY$GllnKr==B-dXtwt
z>52zd8S_bzKp*334zg|Kx<vacHbEf0)aBzJC^q=#6ny)bqD0pz%dwxiOiT!x;8SZZ
zI=^)Bzp8y1VgHd?`cEi%_&x5?`+UZ86rVYR44N0XG5Z2v^m*a;8!X7*F~a|V-n~oF
ztt%8hnx^<n6c0#~mo9%9;(u3og23gj&#i#?UK0s1IlavFxfK!LYl0}_>1D3Zt&sTM
zjEFLxT;}@Riiz*d2r0wKWv<Vyp!nVlkTRTF=K9<Ui|@?<YKc?JT%TKU@x4o6TJq1!
zT%TKk@x4o;N_%3N>vJnIzBetXCH}a~^|=)q-@62?G=H97f4Q&qIeGS_yzk13?@fa$
z?Vsk?U+!ytPWY+vzAG=jH!ZG3f7I7s?t6W1MaTCpf|ceE`ufW+*+cqotN{KID_Cz(
xTKj5zb!khQD^mWn!sB~W`kMBaWft&Q@$tQBp(Xusnd@`L{x9WyS7dze{|`Wm#zg=C

literal 0
HcmV?d00001

diff --git a/Ryujinx.Graphics.OpenGL/Effects/Textures/SmaaSearchTexture.bin b/Ryujinx.Graphics.OpenGL/Effects/Textures/SmaaSearchTexture.bin
new file mode 100644
index 0000000000000000000000000000000000000000..db5bf73f7d5a0b5e436d336849c90bfbc24d76dc
GIT binary patch
literal 1024
zcmezOkD<Pvf#Dy7Vt@dk2uKi2{R1+90K~@zpc={6kIhU{#3;3&QvIa3l@@A|qY7?5
evLH0#aK#_8QgZae^^nP+)P73!lj-bXqYVIqI9W{q

literal 0
HcmV?d00001

diff --git a/Ryujinx.Graphics.OpenGL/Ryujinx.Graphics.OpenGL.csproj b/Ryujinx.Graphics.OpenGL/Ryujinx.Graphics.OpenGL.csproj
index 9fd2c48a52..2313cc68f6 100644
--- a/Ryujinx.Graphics.OpenGL/Ryujinx.Graphics.OpenGL.csproj
+++ b/Ryujinx.Graphics.OpenGL/Ryujinx.Graphics.OpenGL.csproj
@@ -9,6 +9,20 @@
     <PackageReference Include="OpenTK.Graphics" />
   </ItemGroup>
 
+  <ItemGroup>
+    <EmbeddedResource Include="Effects\Textures\SmaaAreaTexture.bin" />
+    <EmbeddedResource Include="Effects\Textures\SmaaSearchTexture.bin" />
+    <EmbeddedResource Include="Effects\Shaders\fsr_sharpening.glsl" />
+    <EmbeddedResource Include="Effects\Shaders\fxaa.glsl" />
+    <EmbeddedResource Include="Effects\Shaders\smaa.hlsl" />
+    <EmbeddedResource Include="Effects\Shaders\smaa_blend.glsl" />
+    <EmbeddedResource Include="Effects\Shaders\smaa_edge.glsl" />
+    <EmbeddedResource Include="Effects\Shaders\smaa_neighbour.glsl" />
+    <EmbeddedResource Include="Effects\Shaders\ffx_fsr1.h" />
+    <EmbeddedResource Include="Effects\Shaders\ffx_a.h" />
+    <EmbeddedResource Include="Effects\Shaders\fsr_scaling.glsl" />
+  </ItemGroup>
+
   <ItemGroup>
     <ProjectReference Include="..\Ryujinx.Common\Ryujinx.Common.csproj" />
     <ProjectReference Include="..\Ryujinx.Graphics.GAL\Ryujinx.Graphics.GAL.csproj" />
diff --git a/Ryujinx.Graphics.OpenGL/Window.cs b/Ryujinx.Graphics.OpenGL/Window.cs
index 8f7917f91f..d6606f3925 100644
--- a/Ryujinx.Graphics.OpenGL/Window.cs
+++ b/Ryujinx.Graphics.OpenGL/Window.cs
@@ -1,5 +1,7 @@
 using OpenTK.Graphics.OpenGL;
 using Ryujinx.Graphics.GAL;
+using Ryujinx.Graphics.OpenGL.Effects;
+using Ryujinx.Graphics.OpenGL.Effects.Smaa;
 using Ryujinx.Graphics.OpenGL.Image;
 using System;
 
@@ -7,14 +9,24 @@ namespace Ryujinx.Graphics.OpenGL
 {
     class Window : IWindow, IDisposable
     {
-        private const int TextureCount = 3;
         private readonly OpenGLRenderer _renderer;
 
         private bool _initialized;
 
         private int _width;
         private int _height;
+        private bool _updateSize;
         private int _copyFramebufferHandle;
+        private IPostProcessingEffect _antiAliasing;
+        private IScalingFilter _scalingFilter;
+        private bool _isLinear;
+        private AntiAliasing _currentAntiAliasing;
+        private bool _updateEffect;
+        private ScalingFilter _currentScalingFilter;
+        private float _scalingFilterLevel;
+        private bool _updateScalingFilter;
+        private bool _isBgra;
+        private TextureView _upscaledTexture;
 
         internal BackgroundContextWorker BackgroundContext { get; private set; }
 
@@ -48,6 +60,8 @@ namespace Ryujinx.Graphics.OpenGL
         {
             _width = width;
             _height = height;
+
+            _updateSize = true;
         }
 
         private void CopyTextureToFrameBufferRGB(int drawFramebuffer, int readFramebuffer, TextureView view, ImageCrop crop, Action swapBuffersCallback)
@@ -57,6 +71,32 @@ namespace Ryujinx.Graphics.OpenGL
 
             TextureView viewConverted = view.Format.IsBgr() ? _renderer.TextureCopy.BgraSwap(view) : view;
 
+            UpdateEffect();
+
+            if (_antiAliasing != null)
+            {
+                var oldView = viewConverted;
+
+                viewConverted = _antiAliasing.Run(viewConverted, _width, _height);
+
+                if (viewConverted.Format.IsBgr())
+                {
+                    var swappedView = _renderer.TextureCopy.BgraSwap(viewConverted);
+
+                    viewConverted?.Dispose();
+
+                    viewConverted = swappedView;
+                }
+
+                if (viewConverted != oldView && oldView != view)
+                {
+                    oldView.Dispose();
+                }
+            }
+            
+            GL.BindFramebuffer(FramebufferTarget.DrawFramebuffer, drawFramebuffer);
+            GL.BindFramebuffer(FramebufferTarget.ReadFramebuffer, readFramebuffer);
+
             GL.FramebufferTexture(
                 FramebufferTarget.ReadFramebuffer,
                 FramebufferAttachment.ColorAttachment0,
@@ -71,12 +111,12 @@ namespace Ryujinx.Graphics.OpenGL
             GL.Clear(ClearBufferMask.ColorBufferBit);
 
             int srcX0, srcX1, srcY0, srcY1;
-            float scale = view.ScaleFactor;
+            float scale = viewConverted.ScaleFactor;
 
             if (crop.Left == 0 && crop.Right == 0)
             {
                 srcX0 = 0;
-                srcX1 = (int)(view.Width / scale);
+                srcX1 = (int)(viewConverted.Width / scale);
             }
             else
             {
@@ -87,7 +127,7 @@ namespace Ryujinx.Graphics.OpenGL
             if (crop.Top == 0 && crop.Bottom == 0)
             {
                 srcY0 = 0;
-                srcY1 = (int)(view.Height / scale);
+                srcY1 = (int)(viewConverted.Height / scale);
             }
             else
             {
@@ -125,6 +165,42 @@ namespace Ryujinx.Graphics.OpenGL
                 ScreenCaptureRequested = false;
             }
 
+            if (_scalingFilter != null)
+            {
+                if (viewConverted.Format.IsBgr() && !_isBgra)
+                {
+                    RecreateUpscalingTexture(true);
+                }
+
+                _scalingFilter.Run(
+                    viewConverted,
+                    _upscaledTexture,
+                    _width,
+                    _height,
+                    new Extents2D(
+                        srcX0,
+                        srcY0,
+                        srcX1,
+                        srcY1),
+                    new Extents2D(
+                        dstX0,
+                        dstY0,
+                        dstX1,
+                        dstY1)
+                    );
+
+                srcX0 = dstX0;
+                srcY0 = dstY0;
+                srcX1 = dstX1;
+                srcY1 = dstY1;
+
+                GL.FramebufferTexture(
+                    FramebufferTarget.ReadFramebuffer,
+                    FramebufferAttachment.ColorAttachment0,
+                    _upscaledTexture.Handle,
+                    0);
+            }
+
             GL.BlitFramebuffer(
                 srcX0,
                 srcY0,
@@ -135,7 +211,7 @@ namespace Ryujinx.Graphics.OpenGL
                 dstX1,
                 dstY1,
                 ClearBufferMask.ColorBufferBit,
-                BlitFramebufferFilter.Linear);
+                _isLinear ? BlitFramebufferFilter.Linear : BlitFramebufferFilter.Nearest);
 
             // Remove Alpha channel
             GL.ColorMask(false, false, false, true);
@@ -209,6 +285,135 @@ namespace Ryujinx.Graphics.OpenGL
 
                 _copyFramebufferHandle = 0;
             }
+
+            _antiAliasing?.Dispose();
+            _scalingFilter?.Dispose();
+            _upscaledTexture?.Dispose();
+        }
+
+        public void SetAntiAliasing(AntiAliasing effect)
+        {
+            // Records the requested mode and flags it; UpdateEffect() swaps the effect object.
+            bool alreadyActive = _antiAliasing != null && _currentAntiAliasing == effect;
+
+            if (!alreadyActive)
+            {
+                _currentAntiAliasing = effect;
+                _updateEffect = true;
+            }
+        }
+
+        public void SetScalingFilter(ScalingFilter type)
+        {
+            // Flags a scaling-filter change for UpdateEffect(). Only skip when this type is
+            // selected AND its filter object exists; _antiAliasing is unrelated to that state.
+            if (_currentScalingFilter == type && _scalingFilter != null)
+            {
+                return;
+            }
+            _currentScalingFilter = type;
+            _updateScalingFilter = true;
+        }
+
+        private void UpdateEffect()
+        {
+            // Applies the anti-aliasing, resize and scaling-filter changes flagged since the last call.
+            if (_updateEffect)
+            {
+                _updateEffect = false;
+                switch (_currentAntiAliasing)
+                {
+                    case AntiAliasing.Fxaa:
+                        _antiAliasing?.Dispose();
+                        _antiAliasing = new FxaaPostProcessingEffect(_renderer);
+                        break;
+                    case AntiAliasing.None:
+                        _antiAliasing?.Dispose();
+                        _antiAliasing = null;
+                        break;
+                    case AntiAliasing.SmaaLow:
+                    case AntiAliasing.SmaaMedium:
+                    case AntiAliasing.SmaaHigh:
+                    case AntiAliasing.SmaaUltra:
+                        var quality = _currentAntiAliasing - AntiAliasing.SmaaLow;
+                        if (_antiAliasing is SmaaPostProcessingEffect smaa)
+                        {
+                            smaa.Quality = quality;
+                        }
+                        else
+                        {
+                            _antiAliasing?.Dispose();
+                            _antiAliasing = new SmaaPostProcessingEffect(_renderer, quality);
+                        }
+                        break;
+                }
+            }
+
+            // _upscaledTexture is only used while a scaling filter exists; skip the allocation otherwise.
+            if (_updateSize && !_updateScalingFilter && _scalingFilter != null)
+            {
+                RecreateUpscalingTexture();
+            }
+            // Cleared unconditionally: with a filter change pending, the switch below recreates or releases the texture itself.
+            _updateSize = false;
+
+            if (_updateScalingFilter)
+            {
+                _updateScalingFilter = false;
+                switch (_currentScalingFilter)
+                {
+                    case ScalingFilter.Bilinear:
+                    case ScalingFilter.Nearest:
+                        _scalingFilter?.Dispose();
+                        _scalingFilter = null;
+                        _isLinear = _currentScalingFilter == ScalingFilter.Bilinear;
+                        _upscaledTexture?.Dispose();
+                        _upscaledTexture = null;
+                        break;
+                    case ScalingFilter.Fsr:
+                        if (_scalingFilter is not FsrScalingFilter)
+                        {
+                            _scalingFilter?.Dispose();
+                            _scalingFilter = new FsrScalingFilter(_renderer, _antiAliasing);
+                        }
+                        _isLinear = false;
+                        _scalingFilter.Level = _scalingFilterLevel;
+                        // The filter pass is handed _upscaledTexture, so rebuild it at the current size.
+                        RecreateUpscalingTexture();
+                        break;
+                }
+            }
+        }
+
+        private void RecreateUpscalingTexture(bool forceBgra = false)
+        {
+            // Replaces _upscaledTexture with a new _width x _height R8G8B8A8Unorm 2D texture.
+            // With forceBgra the red and blue swizzle components are exchanged.
+            SwizzleComponent swizzleR = forceBgra ? SwizzleComponent.Blue : SwizzleComponent.Red;
+            SwizzleComponent swizzleB = forceBgra ? SwizzleComponent.Red : SwizzleComponent.Blue;
+
+            _upscaledTexture?.Dispose();
+
+            var createInfo = new TextureCreateInfo(
+                _width,
+                _height,
+                1, 1, 1, 1, 1, 1,
+                Format.R8G8B8A8Unorm,
+                DepthStencilMode.Depth,
+                Target.Texture2D,
+                swizzleR,
+                SwizzleComponent.Green,
+                swizzleB,
+                SwizzleComponent.Alpha);
+
+            _isBgra = forceBgra;
+            _upscaledTexture = _renderer.CreateTexture(createInfo, 1) as TextureView;
+        }
+
+        public void SetScalingFilterLevel(float level)
+        {
+            _scalingFilterLevel = level; // Stored only; UpdateEffect() copies it to the FSR filter's Level.
+            _updateScalingFilter = true; // Flags the scaling-filter state for the next UpdateEffect() call.
         }
     }
 }
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Vulkan/DescriptorSetUpdater.cs b/Ryujinx.Graphics.Vulkan/DescriptorSetUpdater.cs
index 9ac2e61de0..19a085023b 100644
--- a/Ryujinx.Graphics.Vulkan/DescriptorSetUpdater.cs
+++ b/Ryujinx.Graphics.Vulkan/DescriptorSetUpdater.cs
@@ -163,6 +163,13 @@ namespace Ryujinx.Graphics.Vulkan
             SignalDirty(DirtyFlags.Image);
         }
 
+        public void SetImage(int binding, Auto<DisposableImageView> image)
+        {
+            _imageRefs[binding] = image;
+            // Flag image state dirty now that the reference at this binding has changed.
+            SignalDirty(DirtyFlags.Image);
+        }
+
         public void SetStorageBuffers(CommandBuffer commandBuffer, ReadOnlySpan<BufferAssignment> buffers)
         {
             for (int i = 0; i < buffers.Length; i++)
diff --git a/Ryujinx.Graphics.Vulkan/Effects/FsrScalingFilter.cs b/Ryujinx.Graphics.Vulkan/Effects/FsrScalingFilter.cs
new file mode 100644
index 0000000000..a12070592f
--- /dev/null
+++ b/Ryujinx.Graphics.Vulkan/Effects/FsrScalingFilter.cs
@@ -0,0 +1,248 @@
+using Ryujinx.Common;
+using Ryujinx.Graphics.GAL;
+using Ryujinx.Graphics.Shader;
+using Ryujinx.Graphics.Shader.Translation;
+using Silk.NET.Vulkan;
+using System;
+using Extent2D = Ryujinx.Graphics.GAL.Extents2D;
+
+namespace Ryujinx.Graphics.Vulkan.Effects
+{
+    /// <summary>
+    /// Scaling filter that dispatches the FSR scaling compute shader into an intermediary texture and
+    /// then the FSR sharpening compute shader into the destination image.
+    /// </summary>
+    internal partial class FsrScalingFilter : IScalingFilter
+    {
+        private readonly VulkanRenderer _renderer;
+        private readonly Device _device;
+        private PipelineHelperShader _pipeline;
+        private ISampler _sampler;
+        private ShaderCollection _scalingProgram;
+        private ShaderCollection _sharpeningProgram;
+        private float _sharpeningLevel = 1;
+        private TextureView _intermediaryTexture;
+
+        /// <summary>
+        /// Sharpening level. Assigned values below 0.01 are raised to 0.01.
+        /// </summary>
+        public float Level
+        {
+            get => _sharpeningLevel;
+            set => _sharpeningLevel = MathF.Max(0.01f, value);
+        }
+
+        /// <summary>
+        /// Stores the renderer and device, then builds the pipeline, sampler and shader programs.
+        /// </summary>
+        public FsrScalingFilter(VulkanRenderer renderer, Device device)
+        {
+            _device = device;
+            _renderer = renderer;
+
+            Initialize();
+        }
+
+        /// <summary>
+        /// Disposes the pipeline, both shader programs, the sampler and the intermediary texture, if any.
+        /// </summary>
+        public void Dispose()
+        {
+            _pipeline.Dispose();
+            _scalingProgram.Dispose();
+            _sharpeningProgram.Dispose();
+            _sampler.Dispose();
+            _intermediaryTexture?.Dispose();
+        }
+
+        /// <summary>
+        /// Creates the helper pipeline, the linear sampler and the two compute programs from embedded SPIR-V.
+        /// </summary>
+        public void Initialize()
+        {
+            _pipeline = new PipelineHelperShader(_renderer, _device);
+
+            _pipeline.Initialize();
+
+            var scalingShader = EmbeddedResources.Read("Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrScaling.spv");
+            var sharpeningShader = EmbeddedResources.Read("Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrSharpening.spv");
+
+            var computeBindings = new ShaderBindings(
+                new[] { 2 },
+                Array.Empty<int>(),
+                new[] { 1 },
+                new[] { 0 });
+
+            var sharpeningBindings = new ShaderBindings(
+                new[] { 2, 3, 4 },
+                Array.Empty<int>(),
+                new[] { 1 },
+                new[] { 0 });
+
+            _sampler = _renderer.CreateSampler(GAL.SamplerCreateInfo.Create(MinFilter.Linear, MagFilter.Linear));
+
+            _scalingProgram = _renderer.CreateProgramWithMinimalLayout(new[]
+            {
+                new ShaderSource(scalingShader, computeBindings, ShaderStage.Compute, TargetLanguage.Spirv)
+            });
+
+            _sharpeningProgram = _renderer.CreateProgramWithMinimalLayout(new[]
+            {
+                new ShaderSource(sharpeningShader, sharpeningBindings, ShaderStage.Compute, TargetLanguage.Spirv)
+            });
+        }
+
+        /// <summary>
+        /// Scales <paramref name="view"/> into the intermediary texture, then sharpens that into the destination.
+        /// </summary>
+        /// <param name="view">Source texture sampled by the scaling pass</param>
+        /// <param name="cbs">Command buffer both passes are recorded on</param>
+        /// <param name="destinationTexture">Image written by the sharpening pass</param>
+        /// <param name="format">Not used by this filter</param>
+        /// <param name="width">Width of the intermediary texture and of the dispatched region</param>
+        /// <param name="height">Height of the intermediary texture and of the dispatched region</param>
+        /// <param name="source">Source extents, passed to the shaders through the dimensions buffer</param>
+        /// <param name="destination">Destination extents, passed to the shaders through the dimensions buffer</param>
+        public void Run(
+            TextureView view,
+            CommandBufferScoped cbs,
+            Auto<DisposableImageView> destinationTexture,
+            Silk.NET.Vulkan.Format format,
+            int width,
+            int height,
+            Extent2D source,
+            Extent2D destination)
+        {
+            EnsureIntermediaryTexture(view, width, height);
+
+            Span<GAL.Viewport> viewports = stackalloc GAL.Viewport[1];
+            Span<Rectangle<int>> scissors = stackalloc Rectangle<int>[1];
+
+            viewports[0] = CreateViewport(view.Width, view.Height);
+            scissors[0] = new Rectangle<int>(0, 0, view.Width, view.Height);
+
+            _pipeline.SetCommandBuffer(cbs);
+            _pipeline.SetProgram(_scalingProgram);
+            _pipeline.SetTextureAndSampler(ShaderStage.Compute, 1, view, _sampler);
+
+            float srcWidth = Math.Abs(source.X2 - source.X1);
+            float srcHeight = Math.Abs(source.Y2 - source.Y1);
+            float scaleX = srcWidth / view.Width;
+            float scaleY = srcHeight / view.Height;
+
+            ReadOnlySpan<float> dimensionsBuffer = stackalloc float[]
+            {
+                source.X1,
+                source.X2,
+                source.Y1,
+                source.Y2,
+                destination.X1,
+                destination.X2,
+                destination.Y1,
+                destination.Y2,
+                scaleX,
+                scaleY
+            };
+
+            int rangeSize = dimensionsBuffer.Length * sizeof(float);
+            var bufferHandle = _renderer.BufferManager.CreateWithHandle(_renderer, rangeSize, false);
+            _renderer.BufferManager.SetData(bufferHandle, 0, dimensionsBuffer);
+
+            ReadOnlySpan<float> sharpeningBuffer = stackalloc float[] { 1.5f - (Level * 0.01f * 1.5f) };
+            var sharpeningBufferHandle = _renderer.BufferManager.CreateWithHandle(_renderer, sizeof(float), false);
+            _renderer.BufferManager.SetData(sharpeningBufferHandle, 0, sharpeningBuffer);
+
+            int threadGroupWorkRegionDim = 16;
+            int dispatchX = (width + (threadGroupWorkRegionDim - 1)) / threadGroupWorkRegionDim;
+            int dispatchY = (height + (threadGroupWorkRegionDim - 1)) / threadGroupWorkRegionDim;
+
+            // Scaling pass
+            var bufferRanges = new BufferRange(bufferHandle, 0, rangeSize);
+            _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(2, bufferRanges) });
+            _pipeline.SetScissors(scissors);
+            _pipeline.SetViewports(viewports, false);
+            _pipeline.SetImage(0, _intermediaryTexture, GAL.Format.R8G8B8A8Unorm);
+            _pipeline.DispatchCompute(dispatchX, dispatchY, 1);
+            _pipeline.ComputeBarrier();
+
+            viewports[0] = CreateViewport(width, height);
+            scissors[0] = new Rectangle<int>(0, 0, width, height);
+
+            // Sharpening pass
+            _pipeline.SetCommandBuffer(cbs);
+            _pipeline.SetProgram(_sharpeningProgram);
+            _pipeline.SetTextureAndSampler(ShaderStage.Compute, 1, _intermediaryTexture, _sampler);
+            _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(2, bufferRanges) });
+            var sharpeningRange = new BufferRange(sharpeningBufferHandle, 0, sizeof(float));
+            _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(4, sharpeningRange) });
+            _pipeline.SetScissors(scissors);
+            _pipeline.SetViewports(viewports, false);
+            _pipeline.SetImage(0, destinationTexture);
+            _pipeline.DispatchCompute(dispatchX, dispatchY, 1);
+            _pipeline.ComputeBarrier();
+
+            _pipeline.Finish();
+
+            // The temporary uniform buffers are only released once the pipeline has been finished.
+            _renderer.BufferManager.Delete(bufferHandle);
+            _renderer.BufferManager.Delete(sharpeningBufferHandle);
+        }
+
+        /// <summary>
+        /// Creates the intermediary texture on first use and recreates it only when the size, scale factor or
+        /// the texture parameters derived from <paramref name="view"/> no longer match.
+        /// </summary>
+        private void EnsureIntermediaryTexture(TextureView view, int width, int height)
+        {
+            var originalInfo = view.Info;
+
+            // BGR sources whose red swizzle is still Red get their red and blue swizzles exchanged.
+            var swapRB = originalInfo.Format.IsBgr() && originalInfo.SwizzleR == SwizzleComponent.Red;
+
+            var info = new TextureCreateInfo(
+                width,
+                height,
+                originalInfo.Depth,
+                originalInfo.Levels,
+                originalInfo.Samples,
+                originalInfo.BlockWidth,
+                originalInfo.BlockHeight,
+                originalInfo.BytesPerPixel,
+                originalInfo.Format,
+                originalInfo.DepthStencilMode,
+                originalInfo.Target,
+                swapRB ? originalInfo.SwizzleB : originalInfo.SwizzleR,
+                originalInfo.SwizzleG,
+                swapRB ? originalInfo.SwizzleR : originalInfo.SwizzleB,
+                originalInfo.SwizzleA);
+
+            // Compare against the info this filter would create rather than against view.Info: the
+            // intermediary uses the output size (and possibly swapped swizzles), so comparing it to the
+            // source info would fail on every call while scaling and force a new texture each time.
+            if (_intermediaryTexture != null
+                && _intermediaryTexture.ScaleFactor == view.ScaleFactor
+                && _intermediaryTexture.Info.Equals(info))
+            {
+                return;
+            }
+
+            _intermediaryTexture?.Dispose();
+            _intermediaryTexture = _renderer.CreateTexture(info, view.ScaleFactor) as TextureView;
+        }
+
+        /// <summary>
+        /// Builds a viewport covering a region of the given size anchored at the origin.
+        /// </summary>
+        private static GAL.Viewport CreateViewport(float width, float height)
+        {
+            return new GAL.Viewport(
+                new Rectangle<float>(0, 0, width, height),
+                ViewportSwizzle.PositiveX,
+                ViewportSwizzle.PositiveY,
+                ViewportSwizzle.PositiveZ,
+                ViewportSwizzle.PositiveW,
+                0f,
+                1f);
+        }
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Vulkan/Effects/FxaaPostProcessingEffect.cs b/Ryujinx.Graphics.Vulkan/Effects/FxaaPostProcessingEffect.cs
new file mode 100644
index 0000000000..0f6a0a7baf
--- /dev/null
+++ b/Ryujinx.Graphics.Vulkan/Effects/FxaaPostProcessingEffect.cs
@@ -0,0 +1,148 @@
+using Ryujinx.Common;
+using Ryujinx.Graphics.GAL;
+using Ryujinx.Graphics.Shader;
+using Ryujinx.Graphics.Shader.Translation;
+using Silk.NET.Vulkan;
+using System;
+
+namespace Ryujinx.Graphics.Vulkan.Effects
+{
+    /// <summary>
+    /// Post processing effect that runs the FXAA compute shader over a texture and returns the result.
+    /// </summary>
+    internal partial class FxaaPostProcessingEffect : IPostProcessingEffect
+    {
+        private readonly VulkanRenderer _renderer;
+        private ISampler _samplerLinear;
+        private ShaderCollection _shaderProgram;
+
+        private PipelineHelperShader _pipeline;
+        private TextureView _texture;
+
+        /// <summary>
+        /// Stores the renderer, creates the helper pipeline and initializes the effect's resources.
+        /// </summary>
+        public FxaaPostProcessingEffect(VulkanRenderer renderer, Device device)
+        {
+            _renderer = renderer;
+            _pipeline = new PipelineHelperShader(renderer, device);
+
+            Initialize();
+        }
+
+        /// <summary>
+        /// Disposes the shader program, the pipeline, the sampler and the output texture, if any.
+        /// </summary>
+        public void Dispose()
+        {
+            _shaderProgram.Dispose();
+            _pipeline.Dispose();
+            _samplerLinear.Dispose();
+            _texture?.Dispose();
+        }
+
+        /// <summary>
+        /// Initializes the pipeline and creates the linear sampler and the compute program from embedded SPIR-V.
+        /// </summary>
+        private void Initialize()
+        {
+            _pipeline.Initialize();
+
+            var shader = EmbeddedResources.Read("Ryujinx.Graphics.Vulkan/Effects/Shaders/Fxaa.spv");
+
+            var computeBindings = new ShaderBindings(
+                new[] { 2 },
+                Array.Empty<int>(),
+                new[] { 1 },
+                new[] { 0 });
+
+            _samplerLinear = _renderer.CreateSampler(GAL.SamplerCreateInfo.Create(MinFilter.Linear, MagFilter.Linear));
+
+            _shaderProgram = _renderer.CreateProgramWithMinimalLayout(new[]
+            {
+                new ShaderSource(shader, computeBindings, ShaderStage.Compute, TargetLanguage.Spirv)
+            });
+        }
+
+        /// <summary>
+        /// Runs the FXAA compute shader on <paramref name="view"/> and returns the texture it wrote to.
+        /// </summary>
+        /// <param name="view">Texture sampled by the shader</param>
+        /// <param name="cbs">Command buffer the dispatch is recorded on</param>
+        /// <param name="width">Not used by this effect</param>
+        /// <param name="height">Not used by this effect</param>
+        /// <returns>The output texture owned by this effect</returns>
+        public TextureView Run(TextureView view, CommandBufferScoped cbs, int width, int height)
+        {
+            if (_texture == null || _texture.Width != view.Width || _texture.Height != view.Height)
+            {
+                _texture?.Dispose();
+
+                var info = view.Info;
+
+                if (view.Info.Format.IsBgr())
+                {
+                    // BGR sources get an output texture with the red and blue swizzles exchanged.
+                    info = new TextureCreateInfo(info.Width,
+                        info.Height,
+                        info.Depth,
+                        info.Levels,
+                        info.Samples,
+                        info.BlockWidth,
+                        info.BlockHeight,
+                        info.BytesPerPixel,
+                        info.Format,
+                        info.DepthStencilMode,
+                        info.Target,
+                        info.SwizzleB,
+                        info.SwizzleG,
+                        info.SwizzleR,
+                        info.SwizzleA);
+                }
+
+                _texture = _renderer.CreateTexture(info, view.ScaleFactor) as TextureView;
+            }
+
+            _pipeline.SetCommandBuffer(cbs);
+            _pipeline.SetProgram(_shaderProgram);
+            _pipeline.SetTextureAndSampler(ShaderStage.Compute, 1, view, _samplerLinear);
+
+            ReadOnlySpan<float> resolutionBuffer = stackalloc float[] { view.Width, view.Height };
+            int rangeSize = resolutionBuffer.Length * sizeof(float);
+            var bufferHandle = _renderer.BufferManager.CreateWithHandle(_renderer, rangeSize, false);
+
+            _renderer.BufferManager.SetData(bufferHandle, 0, resolutionBuffer);
+
+            var bufferRanges = new BufferRange(bufferHandle, 0, rangeSize);
+            _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(2, bufferRanges) });
+
+            Span<GAL.Viewport> viewports = stackalloc GAL.Viewport[1];
+
+            viewports[0] = new GAL.Viewport(
+                new Rectangle<float>(0, 0, view.Width, view.Height),
+                ViewportSwizzle.PositiveX,
+                ViewportSwizzle.PositiveY,
+                ViewportSwizzle.PositiveZ,
+                ViewportSwizzle.PositiveW,
+                0f,
+                1f);
+
+            var dispatchX = BitUtils.DivRoundUp(view.Width, IPostProcessingEffect.LocalGroupSize);
+            var dispatchY = BitUtils.DivRoundUp(view.Height, IPostProcessingEffect.LocalGroupSize);
+
+            _pipeline.SetScissors(stackalloc[] { new Rectangle<int>(0, 0, view.Width, view.Height) });
+            _pipeline.SetViewports(viewports, false);
+
+            _pipeline.SetImage(0, _texture, GAL.Format.R8G8B8A8Unorm);
+            _pipeline.DispatchCompute(dispatchX, dispatchY, 1);
+            _pipeline.ComputeBarrier();
+
+            _pipeline.Finish();
+
+            // Release the temporary uniform buffer only after the pipeline has been finished.
+            _renderer.BufferManager.Delete(bufferHandle);
+
+            return _texture;
+        }
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Vulkan/Effects/IPostProcessingEffect.cs b/Ryujinx.Graphics.Vulkan/Effects/IPostProcessingEffect.cs
new file mode 100644
index 0000000000..d36cf01d4e
--- /dev/null
+++ b/Ryujinx.Graphics.Vulkan/Effects/IPostProcessingEffect.cs
@@ -0,0 +1,25 @@
+using System;
+
+namespace Ryujinx.Graphics.Vulkan.Effects
+{
+    /// <summary>
+    /// A disposable effect that processes a texture on a command buffer and returns the resulting texture.
+    /// </summary>
+    internal interface IPostProcessingEffect : IDisposable
+    {
+        /// <summary>
+        /// Size that FxaaPostProcessingEffect divides texture dimensions by (rounding up) to get dispatch counts.
+        /// </summary>
+        const int LocalGroupSize = 64;
+
+        /// <summary>
+        /// Runs the effect on <paramref name="view"/> using <paramref name="cbs"/>.
+        /// </summary>
+        /// <param name="view">Input texture</param>
+        /// <param name="cbs">Command buffer handed to the effect</param>
+        /// <param name="width">Width value supplied by the caller</param>
+        /// <param name="height">Height value supplied by the caller</param>
+        /// <returns>The texture holding the effect's output</returns>
+        TextureView Run(TextureView view, CommandBufferScoped cbs, int width, int height);
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Vulkan/Effects/IScalingFilter.cs b/Ryujinx.Graphics.Vulkan/Effects/IScalingFilter.cs
new file mode 100644
index 0000000000..54f809d715
--- /dev/null
+++ b/Ryujinx.Graphics.Vulkan/Effects/IScalingFilter.cs
@@ -0,0 +1,38 @@
+using Silk.NET.Vulkan;
+using System;
+using Extent2D = Ryujinx.Graphics.GAL.Extents2D;
+
+namespace Ryujinx.Graphics.Vulkan.Effects
+{
+    /// <summary>
+    /// A disposable filter that reads a texture and writes its output into a destination image.
+    /// </summary>
+    internal interface IScalingFilter : IDisposable
+    {
+        /// <summary>
+        /// Adjustable filter level; its interpretation is left to the implementation.
+        /// </summary>
+        float Level { get; set; }
+
+        /// <summary>
+        /// Runs the filter, reading from <paramref name="view"/> and writing to <paramref name="destinationTexture"/>.
+        /// </summary>
+        /// <param name="view">Input texture</param>
+        /// <param name="cbs">Command buffer handed to the filter</param>
+        /// <param name="destinationTexture">Image view that receives the output</param>
+        /// <param name="format">Vulkan format supplied by the caller</param>
+        /// <param name="width">Width value supplied by the caller (FsrScalingFilter sizes its dispatch with it)</param>
+        /// <param name="height">Height value supplied by the caller (FsrScalingFilter sizes its dispatch with it)</param>
+        /// <param name="source">Source extents</param>
+        /// <param name="destination">Destination extents</param>
+        void Run(
+            TextureView view,
+            CommandBufferScoped cbs,
+            Auto<DisposableImageView> destinationTexture,
+            Format format,
+            int width,
+            int height,
+            Extent2D source,
+            Extent2D destination);
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrScaling.glsl b/Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrScaling.glsl
new file mode 100644
index 0000000000..5eb74b3d13
--- /dev/null
+++ b/Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrScaling.glsl
@@ -0,0 +1,3945 @@
+// Scaling
+
+#version 430 core
+layout (local_size_x = 64) in;
+layout( rgba8, binding = 0, set = 3) uniform image2D imgOutput;
+layout( binding = 1, set = 2) uniform sampler2D Source;
+layout( binding = 2 ) uniform dimensions{
+ float srcX0;
+ float srcX1;
+ float srcY0;
+ float srcY1;
+ float dstX0;
+ float dstX1;
+ float dstY0;
+ float dstY1;
+ float scaleX;
+ float scaleY;
+};
+
+#define A_GPU 1
+#define A_GLSL 1
+//==============================================================================================================================
+//
+//                                               [A] SHADER PORTABILITY 1.20210629
+//
+//==============================================================================================================================
+// FidelityFX Super Resolution Sample
+//
+// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files(the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions :
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//------------------------------------------------------------------------------------------------------------------------------
+// MIT LICENSE
+// ===========
+// Copyright (c) 2014 Michal Drobot (for concepts used in "FLOAT APPROXIMATIONS").
+// -----------
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// -----------
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
+// Software.
+// -----------
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//------------------------------------------------------------------------------------------------------------------------------
+// ABOUT
+// =====
+// Common central point for high-level shading language and C portability for various shader headers.
+//------------------------------------------------------------------------------------------------------------------------------
+// DEFINES
+// =======
+// A_CPU ..... Include the CPU related code.
+// A_GPU ..... Include the GPU related code.
+// A_GLSL .... Using GLSL.
+// A_HLSL .... Using HLSL.
+// A_HLSL_6_2  Using HLSL 6.2 with new 'uint16_t' and related types (requires '-enable-16bit-types').
+// A_NO_16_BIT_CAST Don't use instructions that are not available in SPIR-V (needed for running A_HLSL_6_2 on Vulkan)
+// A_GCC ..... Using a GCC compatible compiler (else assume MSVC compatible compiler by default).
+// =======
+// A_BYTE .... Support 8-bit integer.
+// A_HALF .... Support 16-bit integer and floating point.
+// A_LONG .... Support 64-bit integer.
+// A_DUBL .... Support 64-bit floating point.
+// =======
+// A_WAVE .... Support wave-wide operations.
+//------------------------------------------------------------------------------------------------------------------------------
+// To get #include "ffx_a.h" working in GLSL use '#extension GL_GOOGLE_include_directive:require'.
+//------------------------------------------------------------------------------------------------------------------------------
+// SIMPLIFIED TYPE SYSTEM
+// ======================
+//  - All ints will be unsigned with exception of when signed is required.
+//  - Type naming simplified and shortened "A<type><#components>",
+//     - H = 16-bit float (half)
+//     - F = 32-bit float (float)
+//     - D = 64-bit float (double)
+//     - P = 1-bit integer (predicate, not using bool because 'B' is used for byte)
+//     - B = 8-bit integer (byte)
+//     - W = 16-bit integer (word)
+//     - U = 32-bit integer (unsigned)
+//     - L = 64-bit integer (long)
+//  - Using "AS<type><#components>" for signed when required.
+//------------------------------------------------------------------------------------------------------------------------------
+// TODO
+// ====
+//  - Make sure 'ALerp*(a,b,m)' does 'b*m+(-a*m+a)' (2 ops).
+//------------------------------------------------------------------------------------------------------------------------------
+// CHANGE LOG
+// ==========
+// 20200914 - Expanded wave ops and prx code.
+// 20200713 - Added [ZOL] section, fixed serious bugs in sRGB and Rec.709 color conversion code, etc.
+//==============================================================================================================================
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                           COMMON
+//==============================================================================================================================
+#define A_2PI 6.28318530718
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+//                                                             CPU
+//
+//
+//==============================================================================================================================
+#ifdef A_CPU
+ // Supporting user defined overrides.
+ #ifndef A_RESTRICT
+  #define A_RESTRICT __restrict
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifndef A_STATIC
+  #define A_STATIC static
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ // Same types across CPU and GPU.
+ // Predicate uses 32-bit integer (C friendly bool).
+ typedef uint32_t AP1;
+ typedef float AF1;
+ typedef double AD1;
+ typedef uint8_t AB1;
+ typedef uint16_t AW1;
+ typedef uint32_t AU1;
+ typedef uint64_t AL1;
+ typedef int8_t ASB1;
+ typedef int16_t ASW1;
+ typedef int32_t ASU1;
+ typedef int64_t ASL1;
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AD1_(a) ((AD1)(a))
+ #define AF1_(a) ((AF1)(a))
+ #define AL1_(a) ((AL1)(a))
+ #define AU1_(a) ((AU1)(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ASL1_(a) ((ASL1)(a))
+ #define ASU1_(a) ((ASU1)(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AU1 AU1_AF1(AF1 a){union{AF1 f;AU1 u;}bits;bits.f=a;return bits.u;}
+//------------------------------------------------------------------------------------------------------------------------------
+ #define A_TRUE 1
+ #define A_FALSE 0
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                                       CPU/GPU PORTING
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// Get CPU and GPU to share all setup code, without duplicate code paths.
+// This uses a lower-case prefix for special vector constructs.
+//  - In C restrict pointers are used.
+//  - In the shading language, in/inout/out arguments are used.
+// This depends on the ability to access a vector value in both languages via array syntax (aka color[2]).
+//==============================================================================================================================
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                     VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
+//==============================================================================================================================
+ #define retAD2 AD1 *A_RESTRICT
+ #define retAD3 AD1 *A_RESTRICT
+ #define retAD4 AD1 *A_RESTRICT
+ #define retAF2 AF1 *A_RESTRICT
+ #define retAF3 AF1 *A_RESTRICT
+ #define retAF4 AF1 *A_RESTRICT
+ #define retAL2 AL1 *A_RESTRICT
+ #define retAL3 AL1 *A_RESTRICT
+ #define retAL4 AL1 *A_RESTRICT
+ #define retAU2 AU1 *A_RESTRICT
+ #define retAU3 AU1 *A_RESTRICT
+ #define retAU4 AU1 *A_RESTRICT
+//------------------------------------------------------------------------------------------------------------------------------
+ #define inAD2 AD1 *A_RESTRICT
+ #define inAD3 AD1 *A_RESTRICT
+ #define inAD4 AD1 *A_RESTRICT
+ #define inAF2 AF1 *A_RESTRICT
+ #define inAF3 AF1 *A_RESTRICT
+ #define inAF4 AF1 *A_RESTRICT
+ #define inAL2 AL1 *A_RESTRICT
+ #define inAL3 AL1 *A_RESTRICT
+ #define inAL4 AL1 *A_RESTRICT
+ #define inAU2 AU1 *A_RESTRICT
+ #define inAU3 AU1 *A_RESTRICT
+ #define inAU4 AU1 *A_RESTRICT
+//------------------------------------------------------------------------------------------------------------------------------
+ #define inoutAD2 AD1 *A_RESTRICT
+ #define inoutAD3 AD1 *A_RESTRICT
+ #define inoutAD4 AD1 *A_RESTRICT
+ #define inoutAF2 AF1 *A_RESTRICT
+ #define inoutAF3 AF1 *A_RESTRICT
+ #define inoutAF4 AF1 *A_RESTRICT
+ #define inoutAL2 AL1 *A_RESTRICT
+ #define inoutAL3 AL1 *A_RESTRICT
+ #define inoutAL4 AL1 *A_RESTRICT
+ #define inoutAU2 AU1 *A_RESTRICT
+ #define inoutAU3 AU1 *A_RESTRICT
+ #define inoutAU4 AU1 *A_RESTRICT
+//------------------------------------------------------------------------------------------------------------------------------
+ #define outAD2 AD1 *A_RESTRICT
+ #define outAD3 AD1 *A_RESTRICT
+ #define outAD4 AD1 *A_RESTRICT
+ #define outAF2 AF1 *A_RESTRICT
+ #define outAF3 AF1 *A_RESTRICT
+ #define outAF4 AF1 *A_RESTRICT
+ #define outAL2 AL1 *A_RESTRICT
+ #define outAL3 AL1 *A_RESTRICT
+ #define outAL4 AL1 *A_RESTRICT
+ #define outAU2 AU1 *A_RESTRICT
+ #define outAU3 AU1 *A_RESTRICT
+ #define outAU4 AU1 *A_RESTRICT
+//------------------------------------------------------------------------------------------------------------------------------
+ #define varAD2(x) AD1 x[2]
+ #define varAD3(x) AD1 x[3]
+ #define varAD4(x) AD1 x[4]
+ #define varAF2(x) AF1 x[2]
+ #define varAF3(x) AF1 x[3]
+ #define varAF4(x) AF1 x[4]
+ #define varAL2(x) AL1 x[2]
+ #define varAL3(x) AL1 x[3]
+ #define varAL4(x) AL1 x[4]
+ #define varAU2(x) AU1 x[2]
+ #define varAU3(x) AU1 x[3]
+ #define varAU4(x) AU1 x[4]
+//------------------------------------------------------------------------------------------------------------------------------
+ #define initAD2(x,y) {x,y}
+ #define initAD3(x,y,z) {x,y,z}
+ #define initAD4(x,y,z,w) {x,y,z,w}
+ #define initAF2(x,y) {x,y}
+ #define initAF3(x,y,z) {x,y,z}
+ #define initAF4(x,y,z,w) {x,y,z,w}
+ #define initAL2(x,y) {x,y}
+ #define initAL3(x,y,z) {x,y,z}
+ #define initAL4(x,y,z,w) {x,y,z,w}
+ #define initAU2(x,y) {x,y}
+ #define initAU3(x,y,z) {x,y,z}
+ #define initAU4(x,y,z,w) {x,y,z,w}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     SCALAR RETURN OPS
+//------------------------------------------------------------------------------------------------------------------------------
+// TODO
+// ====
+//  - Replace transcendentals with manual versions. 
+//==============================================================================================================================
+ #ifdef A_GCC
+  A_STATIC AD1 AAbsD1(AD1 a){return __builtin_fabs(a);}
+  A_STATIC AF1 AAbsF1(AF1 a){return __builtin_fabsf(a);}
+  A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(__builtin_abs(ASU1_(a)));}
+  A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(__builtin_llabs(ASL1_(a)));}
+ #else
+  A_STATIC AD1 AAbsD1(AD1 a){return fabs(a);}
+  A_STATIC AF1 AAbsF1(AF1 a){return fabsf(a);}
+  A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(abs(ASU1_(a)));}
+  A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(labs((long)ASL1_(a)));}
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_GCC
+  A_STATIC AD1 ACosD1(AD1 a){return __builtin_cos(a);}
+  A_STATIC AF1 ACosF1(AF1 a){return __builtin_cosf(a);}
+ #else
+  A_STATIC AD1 ACosD1(AD1 a){return cos(a);}
+  A_STATIC AF1 ACosF1(AF1 a){return cosf(a);}
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 ADotD2(inAD2 a,inAD2 b){return a[0]*b[0]+a[1]*b[1];} // Dot product: sum of a[i]*b[i] over the 2/3/4 components, products added in index order.
+ A_STATIC AD1 ADotD3(inAD3 a,inAD3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];}
+ A_STATIC AD1 ADotD4(inAD4 a,inAD4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];}
+ A_STATIC AF1 ADotF2(inAF2 a,inAF2 b){return a[0]*b[0]+a[1]*b[1];} // Single-precision variants of the same sums.
+ A_STATIC AF1 ADotF3(inAF3 a,inAF3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];}
+ A_STATIC AF1 ADotF4(inAF4 a,inAF4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];}
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_GCC // Base-2 exponential (2 raised to 'a') via exp2()/exp2f(); GCC builds call the __builtin_* forms instead.
+  A_STATIC AD1 AExp2D1(AD1 a){return __builtin_exp2(a);}
+  A_STATIC AF1 AExp2F1(AF1 a){return __builtin_exp2f(a);}
+ #else
+  A_STATIC AD1 AExp2D1(AD1 a){return exp2(a);}
+  A_STATIC AF1 AExp2F1(AF1 a){return exp2f(a);}
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_GCC // Round 'a' down to an integral value via floor()/floorf(); GCC builds call the __builtin_* forms instead.
+  A_STATIC AD1 AFloorD1(AD1 a){return __builtin_floor(a);}
+  A_STATIC AF1 AFloorF1(AF1 a){return __builtin_floorf(a);}
+ #else
+  A_STATIC AD1 AFloorD1(AD1 a){return floor(a);}
+  A_STATIC AF1 AFloorF1(AF1 a){return floorf(a);}
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 ALerpD1(AD1 a,AD1 b,AD1 c){return b*c+(-a*c+a);} // Linear interpolation from a (c=0) to b (c=1); algebraically a+(b-a)*c. Keep this exact expression shape (presumably chosen to map onto multiply-adds - TODO confirm).
+ A_STATIC AF1 ALerpF1(AF1 a,AF1 b,AF1 c){return b*c+(-a*c+a);} // Single-precision variant; 'c' is not clamped.
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_GCC // Base-2 logarithm of 'a' via log2()/log2f(); GCC builds call the __builtin_* forms instead.
+  A_STATIC AD1 ALog2D1(AD1 a){return __builtin_log2(a);}
+  A_STATIC AF1 ALog2F1(AF1 a){return __builtin_log2f(a);}
+ #else
+  A_STATIC AD1 ALog2D1(AD1 a){return log2(a);}
+  A_STATIC AF1 ALog2F1(AF1 a){return log2f(a);}
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 AMaxD1(AD1 a,AD1 b){if(a>b)return a;return b;} // Larger of a and b; b is returned whenever 'a>b' is false (ties included).
+ A_STATIC AF1 AMaxF1(AF1 a,AF1 b){if(a>b)return a;return b;}
+ A_STATIC AL1 AMaxL1(AL1 a,AL1 b){if(a>b)return a;return b;} // Integer variants compare the values as declared (no signed reinterpretation).
+ A_STATIC AU1 AMaxU1(AU1 a,AU1 b){if(a>b)return a;return b;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // These follow the convention that A integer types carry no signedness until they are operated on.
+ A_STATIC AL1 AMaxSL1(AL1 a,AL1 b){if(ASL1_(a)>ASL1_(b))return a;return b;} // Compares through ASL1_ but returns the original, unconverted operand.
+ A_STATIC AU1 AMaxSU1(AU1 a,AU1 b){if(ASU1_(a)>ASU1_(b))return a;return b;} // Same, comparing through ASU1_.
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 AMinD1(AD1 a,AD1 b){if(a<b)return a;return b;} // Smaller of a and b; b is returned whenever 'a<b' is false (ties included).
+ A_STATIC AF1 AMinF1(AF1 a,AF1 b){if(a<b)return a;return b;}
+ A_STATIC AL1 AMinL1(AL1 a,AL1 b){if(a<b)return a;return b;} // Integer variants compare the values as declared (no signed reinterpretation).
+ A_STATIC AU1 AMinU1(AU1 a,AU1 b){if(a<b)return a;return b;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AL1 AMinSL1(AL1 a,AL1 b){if(ASL1_(a)<ASL1_(b))return a;return b;} // Compares through ASL1_ but returns the original, unconverted operand.
+ A_STATIC AU1 AMinSU1(AU1 a,AU1 b){if(ASU1_(a)<ASU1_(b))return a;return b;} // Same, comparing through ASU1_.
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 ARcpD1(AD1 a){return 1.0/a;} // Reciprocal 1/a; there is no guard against a==0.
+ A_STATIC AF1 ARcpF1(AF1 a){return 1.0f/a;} // Single-precision variant (float literal keeps the division in float).
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AL1 AShrSL1(AL1 a,AL1 b){return AL1_(ASL1_(a)>>ASL1_(b));} // Right shift of a by b with both operands viewed through ASL1_ (presumably a signed cast, i.e. a sign-propagating shift - TODO confirm).
+ A_STATIC AU1 AShrSU1(AU1 a,AU1 b){return AU1_(ASU1_(a)>>ASU1_(b));} // Same for the AU1 type, via ASU1_; the shift count is not range-checked.
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_GCC // Sine: 'a' is passed straight to sin()/sinf(); GCC builds call the __builtin_* forms instead.
+  A_STATIC AD1 ASinD1(AD1 a){return __builtin_sin(a);}
+  A_STATIC AF1 ASinF1(AF1 a){return __builtin_sinf(a);}
+ #else
+  A_STATIC AD1 ASinD1(AD1 a){return sin(a);}
+  A_STATIC AF1 ASinF1(AF1 a){return sinf(a);}
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_GCC // Square root of 'a' via sqrt()/sqrtf(); GCC builds call the __builtin_* forms instead.
+  A_STATIC AD1 ASqrtD1(AD1 a){return __builtin_sqrt(a);}
+  A_STATIC AF1 ASqrtF1(AF1 a){return __builtin_sqrtf(a);}
+ #else
+  A_STATIC AD1 ASqrtD1(AD1 a){return sqrt(a);}
+  A_STATIC AF1 ASqrtF1(AF1 a){return sqrtf(a);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                               SCALAR RETURN OPS - DEPENDENT
+//==============================================================================================================================
+ A_STATIC AD1 AClampD1(AD1 x,AD1 n,AD1 m){AD1 capped=AMinD1(x,m);return AMaxD1(n,capped);} // Clamp x to [n,m]: the upper bound m is applied first, then the lower bound n.
+ A_STATIC AF1 AClampF1(AF1 x,AF1 n,AF1 m){AF1 capped=AMinF1(x,m);return AMaxF1(n,capped);}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 AFractD1(AD1 a){AD1 whole=AFloorD1(a);return a-whole;} // Fractional part: a minus floor(a).
+ A_STATIC AF1 AFractF1(AF1 a){AF1 whole=AFloorF1(a);return a-whole;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 APowD1(AD1 a,AD1 b){AD1 scaledLog=b*ALog2D1(a);return AExp2D1(scaledLog);} // Power a^b evaluated as exp2(b*log2(a)).
+ A_STATIC AF1 APowF1(AF1 a,AF1 b){AF1 scaledLog=b*ALog2F1(a);return AExp2F1(scaledLog);}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 ARsqD1(AD1 a){AD1 root=ASqrtD1(a);return ARcpD1(root);} // Reciprocal square root: 1/sqrt(a).
+ A_STATIC AF1 ARsqF1(AF1 a){AF1 root=ASqrtF1(a);return ARcpF1(root);}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 ASatD1(AD1 a){AD1 floored=AMaxD1(0.0,a);return AMinD1(1.0,floored);} // Saturate: clamp a to [0,1], lower bound applied first.
+ A_STATIC AF1 ASatF1(AF1 a){AF1 floored=AMaxF1(0.0f,a);return AMinF1(1.0f,floored);}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                         VECTOR OPS
+//------------------------------------------------------------------------------------------------------------------------------
+// These are added as needed for production or prototyping, so not necessarily a complete set.
+// They follow a convention of taking in a destination and also returning the destination value to increase utility.
+//==============================================================================================================================
+ A_STATIC retAD2 opAAbsD2(outAD2 d,inAD2 a){for(int i=0;i<2;i++)d[i]=AAbsD1(a[i]);return d;} // d[i]=|a[i]| for every component; returns d.
+ A_STATIC retAD3 opAAbsD3(outAD3 d,inAD3 a){for(int i=0;i<3;i++)d[i]=AAbsD1(a[i]);return d;}
+ A_STATIC retAD4 opAAbsD4(outAD4 d,inAD4 a){for(int i=0;i<4;i++)d[i]=AAbsD1(a[i]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAAbsF2(outAF2 d,inAF2 a){for(int i=0;i<2;i++)d[i]=AAbsF1(a[i]);return d;}
+ A_STATIC retAF3 opAAbsF3(outAF3 d,inAF3 a){for(int i=0;i<3;i++)d[i]=AAbsF1(a[i]);return d;}
+ A_STATIC retAF4 opAAbsF4(outAF4 d,inAF4 a){for(int i=0;i<4;i++)d[i]=AAbsF1(a[i]);return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){for(int i=0;i<2;i++)d[i]=a[i]+b[i];return d;} // d[i]=a[i]+b[i] for every component; returns d.
+ A_STATIC retAD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){for(int i=0;i<3;i++)d[i]=a[i]+b[i];return d;}
+ A_STATIC retAD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){for(int i=0;i<4;i++)d[i]=a[i]+b[i];return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){for(int i=0;i<2;i++)d[i]=a[i]+b[i];return d;}
+ A_STATIC retAF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){for(int i=0;i<3;i++)d[i]=a[i]+b[i];return d;}
+ A_STATIC retAF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){for(int i=0;i<4;i++)d[i]=a[i]+b[i];return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){for(int i=0;i<2;i++)d[i]=a[i]+b;return d;} // d[i]=a[i]+b with the scalar b added to every component; returns d.
+ A_STATIC retAD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){for(int i=0;i<3;i++)d[i]=a[i]+b;return d;}
+ A_STATIC retAD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){for(int i=0;i<4;i++)d[i]=a[i]+b;return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){for(int i=0;i<2;i++)d[i]=a[i]+b;return d;}
+ A_STATIC retAF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){for(int i=0;i<3;i++)d[i]=a[i]+b;return d;}
+ A_STATIC retAF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){for(int i=0;i<4;i++)d[i]=a[i]+b;return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opACpyD2(outAD2 d,inAD2 a){for(int i=0;i<2;i++)d[i]=a[i];return d;} // Copies every component of a into d; returns d.
+ A_STATIC retAD3 opACpyD3(outAD3 d,inAD3 a){for(int i=0;i<3;i++)d[i]=a[i];return d;}
+ A_STATIC retAD4 opACpyD4(outAD4 d,inAD4 a){for(int i=0;i<4;i++)d[i]=a[i];return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opACpyF2(outAF2 d,inAF2 a){for(int i=0;i<2;i++)d[i]=a[i];return d;}
+ A_STATIC retAF3 opACpyF3(outAF3 d,inAF3 a){for(int i=0;i<3;i++)d[i]=a[i];return d;}
+ A_STATIC retAF4 opACpyF4(outAF4 d,inAF4 a){for(int i=0;i<4;i++)d[i]=a[i];return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){for(int i=0;i<2;i++)d[i]=ALerpD1(a[i],b[i],c[i]);return d;} // d[i]=lerp(a[i],b[i],c[i]) with a per-component weight; returns d.
+ A_STATIC retAD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){for(int i=0;i<3;i++)d[i]=ALerpD1(a[i],b[i],c[i]);return d;}
+ A_STATIC retAD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){for(int i=0;i<4;i++)d[i]=ALerpD1(a[i],b[i],c[i]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){for(int i=0;i<2;i++)d[i]=ALerpF1(a[i],b[i],c[i]);return d;}
+ A_STATIC retAF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){for(int i=0;i<3;i++)d[i]=ALerpF1(a[i],b[i],c[i]);return d;}
+ A_STATIC retAF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){for(int i=0;i<4;i++)d[i]=ALerpF1(a[i],b[i],c[i]);return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){for(int i=0;i<2;i++)d[i]=ALerpD1(a[i],b[i],c);return d;} // d[i]=lerp(a[i],b[i],c) with one scalar weight for all components; returns d.
+ A_STATIC retAD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){for(int i=0;i<3;i++)d[i]=ALerpD1(a[i],b[i],c);return d;}
+ A_STATIC retAD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){for(int i=0;i<4;i++)d[i]=ALerpD1(a[i],b[i],c);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){for(int i=0;i<2;i++)d[i]=ALerpF1(a[i],b[i],c);return d;}
+ A_STATIC retAF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){for(int i=0;i<3;i++)d[i]=ALerpF1(a[i],b[i],c);return d;}
+ A_STATIC retAF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){for(int i=0;i<4;i++)d[i]=ALerpF1(a[i],b[i],c);return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){for(int i=0;i<2;i++)d[i]=AMaxD1(a[i],b[i]);return d;} // d[i]=max(a[i],b[i]) for every component; returns d.
+ A_STATIC retAD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){for(int i=0;i<3;i++)d[i]=AMaxD1(a[i],b[i]);return d;}
+ A_STATIC retAD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){for(int i=0;i<4;i++)d[i]=AMaxD1(a[i],b[i]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){for(int i=0;i<2;i++)d[i]=AMaxF1(a[i],b[i]);return d;}
+ A_STATIC retAF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){for(int i=0;i<3;i++)d[i]=AMaxF1(a[i],b[i]);return d;}
+ A_STATIC retAF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){for(int i=0;i<4;i++)d[i]=AMaxF1(a[i],b[i]);return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){for(int i=0;i<2;i++)d[i]=AMinD1(a[i],b[i]);return d;} // d[i]=min(a[i],b[i]) for every component; returns d.
+ A_STATIC retAD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){for(int i=0;i<3;i++)d[i]=AMinD1(a[i],b[i]);return d;}
+ A_STATIC retAD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){for(int i=0;i<4;i++)d[i]=AMinD1(a[i],b[i]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){for(int i=0;i<2;i++)d[i]=AMinF1(a[i],b[i]);return d;}
+ A_STATIC retAF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){for(int i=0;i<3;i++)d[i]=AMinF1(a[i],b[i]);return d;}
+ A_STATIC retAF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){for(int i=0;i<4;i++)d[i]=AMinF1(a[i],b[i]);return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){for(int i=0;i<2;i++)d[i]=a[i]*b[i];return d;} // d[i]=a[i]*b[i] (component-wise product); returns d.
+ A_STATIC retAD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){for(int i=0;i<3;i++)d[i]=a[i]*b[i];return d;}
+ A_STATIC retAD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){for(int i=0;i<4;i++)d[i]=a[i]*b[i];return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){for(int i=0;i<2;i++)d[i]=a[i]*b[i];return d;}
+ A_STATIC retAF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){for(int i=0;i<3;i++)d[i]=a[i]*b[i];return d;}
+ A_STATIC retAF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){for(int i=0;i<4;i++)d[i]=a[i]*b[i];return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){for(int i=0;i<2;i++)d[i]=a[i]*b;return d;} // d[i]=a[i]*b with every component scaled by the scalar b; returns d.
+ A_STATIC retAD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){for(int i=0;i<3;i++)d[i]=a[i]*b;return d;}
+ A_STATIC retAD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){for(int i=0;i<4;i++)d[i]=a[i]*b;return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){for(int i=0;i<2;i++)d[i]=a[i]*b;return d;}
+ A_STATIC retAF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){for(int i=0;i<3;i++)d[i]=a[i]*b;return d;}
+ A_STATIC retAF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){for(int i=0;i<4;i++)d[i]=a[i]*b;return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opANegD2(outAD2 d,inAD2 a){for(int i=0;i<2;i++)d[i]=-a[i];return d;} // d[i]=-a[i] for every component; returns d.
+ A_STATIC retAD3 opANegD3(outAD3 d,inAD3 a){for(int i=0;i<3;i++)d[i]=-a[i];return d;}
+ A_STATIC retAD4 opANegD4(outAD4 d,inAD4 a){for(int i=0;i<4;i++)d[i]=-a[i];return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opANegF2(outAF2 d,inAF2 a){for(int i=0;i<2;i++)d[i]=-a[i];return d;}
+ A_STATIC retAF3 opANegF3(outAF3 d,inAF3 a){for(int i=0;i<3;i++)d[i]=-a[i];return d;}
+ A_STATIC retAF4 opANegF4(outAF4 d,inAF4 a){for(int i=0;i<4;i++)d[i]=-a[i];return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opARcpD2(outAD2 d,inAD2 a){for(int i=0;i<2;i++)d[i]=ARcpD1(a[i]);return d;} // d[i]=1/a[i] for every component; returns d.
+ A_STATIC retAD3 opARcpD3(outAD3 d,inAD3 a){for(int i=0;i<3;i++)d[i]=ARcpD1(a[i]);return d;}
+ A_STATIC retAD4 opARcpD4(outAD4 d,inAD4 a){for(int i=0;i<4;i++)d[i]=ARcpD1(a[i]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opARcpF2(outAF2 d,inAF2 a){for(int i=0;i<2;i++)d[i]=ARcpF1(a[i]);return d;}
+ A_STATIC retAF3 opARcpF3(outAF3 d,inAF3 a){for(int i=0;i<3;i++)d[i]=ARcpF1(a[i]);return d;}
+ A_STATIC retAF4 opARcpF4(outAF4 d,inAF4 a){for(int i=0;i<4;i++)d[i]=ARcpF1(a[i]);return d;}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     HALF FLOAT PACKING
+//==============================================================================================================================
+ // Convert float to half (in lower 16-bits of output).
+ // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
+ // Supports denormals.
+ // Conversion rules are to make computations possibly "safer" on the GPU,
+ //  -INF & -NaN -> -65504
+ //  +INF & +NaN -> +65504
+ A_STATIC AU1 AU1_AH1_AF1(AF1 f){ // Returns the half-float encoding of f; both tables are indexed by the top 9 bits of f (sign + 8-bit exponent).
+  static const AW1 base[512]={ // Half sign/exponent bits for each index (0x7bff/0xfbff where the result is clamped to +/-65504); read-only, hence const.
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100,
+   0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00,
+   0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100,
+   0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00,
+   0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff};
+  static const AB1 shift[512]={ // Right-shift applied to the 23-bit mantissa for each index; 0x18 (24) discards it entirely.
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
+   0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
+   0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
+   0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
+   0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18};
+  union{AF1 f;AU1 u;}bits;bits.f=f;AU1 u=bits.u;AU1 i=u>>23;return (AU1)(base[i])+((u&0x7fffff)>>shift[i]);} // Reads the float's bits through the union; the mantissa is shifted down (truncated, not rounded) and added to base[i].
+//------------------------------------------------------------------------------------------------------------------------------
+ // Used to output packed constant.
+ A_STATIC AU1 AU1_AH2_AF2(inAF2 a){AU1 lo=AU1_AH1_AF1(a[0]);AU1 hi=AU1_AH1_AF1(a[1]);return lo+(hi<<16);} // Packs two halves into one word: a[0] in bits 0-15, a[1] in bits 16-31.
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+//                                                            GLSL
+//
+//
+//==============================================================================================================================
+#if defined(A_GLSL) && defined(A_GPU)
+ #ifndef A_SKIP_EXT
+  #ifdef A_HALF
+   #extension GL_EXT_shader_16bit_storage:require
+   #extension GL_EXT_shader_explicit_arithmetic_types:require 
+  #endif
+//------------------------------------------------------------------------------------------------------------------------------
+  #ifdef A_LONG
+   #extension GL_ARB_gpu_shader_int64:require
+   #extension GL_NV_shader_atomic_int64:require
+  #endif
+//------------------------------------------------------------------------------------------------------------------------------
+  #ifdef A_WAVE
+   #extension GL_KHR_shader_subgroup_arithmetic:require
+   #extension GL_KHR_shader_subgroup_ballot:require
+   #extension GL_KHR_shader_subgroup_quad:require
+   #extension GL_KHR_shader_subgroup_shuffle:require
+  #endif
+ #endif
+//==============================================================================================================================
+ #define AP1 bool // AP1..AP4: boolean (predicate) scalar and 2/3/4-component vectors.
+ #define AP2 bvec2
+ #define AP3 bvec3
+ #define AP4 bvec4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AF1 float // AF1..AF4: float scalar and 2/3/4-component vectors.
+ #define AF2 vec2
+ #define AF3 vec3
+ #define AF4 vec4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AU1 uint // AU1..AU4: unsigned integer scalar and 2/3/4-component vectors.
+ #define AU2 uvec2
+ #define AU3 uvec3
+ #define AU4 uvec4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ASU1 int // ASU1..ASU4: signed integer scalar and 2/3/4-component vectors.
+ #define ASU2 ivec2
+ #define ASU3 ivec3
+ #define ASU4 ivec4
+//==============================================================================================================================
+ #define AF1_AU1(x) uintBitsToFloat(AU1(x)) // AFn_AUn: reinterpret unsigned integer bits as float (bit pattern unchanged); x is first converted to AUn.
+ #define AF2_AU2(x) uintBitsToFloat(AU2(x))
+ #define AF3_AU3(x) uintBitsToFloat(AU3(x))
+ #define AF4_AU4(x) uintBitsToFloat(AU4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AU1_AF1(x) floatBitsToUint(AF1(x)) // AUn_AFn: reinterpret float bits as unsigned integer; x is first converted to AFn.
+ #define AU2_AF2(x) floatBitsToUint(AF2(x))
+ #define AU3_AF3(x) floatBitsToUint(AF3(x))
+ #define AU4_AF4(x) floatBitsToUint(AF4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AU1_AH1_AF1_x(AF1 a){AF2 pair=AF2(a,0.0);return packHalf2x16(pair);} // Half-float bits of 'a' from packHalf2x16 with 0.0 as the second component.
+ #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AU1_AH2_AF2 packHalf2x16 // Pack aliases: float vector -> one uint (two halfs, two 16-bit unorms, four 8-bit unorms).
+ #define AU1_AW2Unorm_AF2 packUnorm2x16
+ #define AU1_AB4Unorm_AF4 packUnorm4x8
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AF2_AH2_AU1 unpackHalf2x16 // Unpack aliases: the inverse direction, one uint -> float vector.
+ #define AF2_AW2Unorm_AU1 unpackUnorm2x16
+ #define AF4_AB4Unorm_AU1 unpackUnorm4x8
+//==============================================================================================================================
+ AF1 AF1_x(AF1 a){return a;} // AFn_x: broadcast one float to every component of an n-component result.
+ AF2 AF2_x(AF1 a){return AF2(a);}
+ AF3 AF3_x(AF1 a){return AF3(a);}
+ AF4 AF4_x(AF1 a){return AF4(a);}
+ #define AF1_(a) AF1_x(AF1(a))
+ #define AF2_(a) AF2_x(AF1(a))
+ #define AF3_(a) AF3_x(AF1(a))
+ #define AF4_(a) AF4_x(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AU1_x(AU1 a){return a;} // AUn_x: broadcast one uint to every component of an n-component result.
+ AU2 AU2_x(AU1 a){return AU2(a);}
+ AU3 AU3_x(AU1 a){return AU3(a);}
+ AU4 AU4_x(AU1 a){return AU4(a);}
+ #define AU1_(a) AU1_x(AU1(a))
+ #define AU2_(a) AU2_x(AU1(a))
+ #define AU3_(a) AU3_x(AU1(a))
+ #define AU4_(a) AU4_x(AU1(a))
+//==============================================================================================================================
+ AU1 AAbsSU1(AU1 a){ASU1 s=ASU1(a);return AU1(abs(s));} // Converts to the signed type, takes abs(), converts back to unsigned.
+ AU2 AAbsSU2(AU2 a){ASU2 s=ASU2(a);return AU2(abs(s));}
+ AU3 AAbsSU3(AU3 a){ASU3 s=ASU3(a);return AU3(abs(s));}
+ AU4 AAbsSU4(AU4 a){ASU4 s=ASU4(a);return AU4(abs(s));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 ABfe(AU1 src,AU1 off,AU1 bits){return bitfieldExtract(src,ASU1(off),ASU1(bits));}
+ AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));}
+ // Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<<bits)-1', and 'bits' needs to be an immediate.
+ AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){return bitfieldInsert(src,ins,0,ASU1(bits));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // V_MED3_F32. AClampF*: clamp 'x' between lower bound 'n' and upper bound 'm' using the clamp() built-in.
+ AF1 AClampF1(AF1 x,AF1 n,AF1 m){return clamp(x,n,m);}
+ AF2 AClampF2(AF2 x,AF2 n,AF2 m){return clamp(x,n,m);}
+ AF3 AClampF3(AF3 x,AF3 n,AF3 m){return clamp(x,n,m);}
+ AF4 AClampF4(AF4 x,AF4 n,AF4 m){return clamp(x,n,m);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // V_FRACT_F32 (note DX frac() is different). AFractF*: forwards to the fract() built-in.
+ AF1 AFractF1(AF1 x){return fract(x);}
+ AF2 AFractF2(AF2 x){return fract(x);}
+ AF3 AFractF3(AF3 x){return fract(x);}
+ AF4 AFractF4(AF4 x){return fract(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ALerpF1(AF1 x,AF1 y,AF1 a){return mix(x,y,a);} // ALerpF*: linear blend from 'x' to 'y' by weight 'a' via mix().
+ AF2 ALerpF2(AF2 x,AF2 y,AF2 a){return mix(x,y,a);}
+ AF3 ALerpF3(AF3 x,AF3 y,AF3 a){return mix(x,y,a);}
+ AF4 ALerpF4(AF4 x,AF4 y,AF4 a){return mix(x,y,a);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // V_MAX3_F32. AMax3F*: three-way maximum written as two nested max() calls.
+ AF1 AMax3F1(AF1 x,AF1 y,AF1 z){return max(x,max(y,z));}
+ AF2 AMax3F2(AF2 x,AF2 y,AF2 z){return max(x,max(y,z));}
+ AF3 AMax3F3(AF3 x,AF3 y,AF3 z){return max(x,max(y,z));}
+ AF4 AMax3F4(AF4 x,AF4 y,AF4 z){return max(x,max(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){return AU1(max(ASU1(x),max(ASU1(y),ASU1(z))));} // AMax3SU*: operands converted to ASU* before max(), result back to AU*.
+ AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){return AU2(max(ASU2(x),max(ASU2(y),ASU2(z))));}
+ AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){return AU3(max(ASU3(x),max(ASU3(y),ASU3(z))));}
+ AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){return AU4(max(ASU4(x),max(ASU4(y),ASU4(z))));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMax3U1(AU1 x,AU1 y,AU1 z){return max(x,max(y,z));} // AMax3U*: three-way max() applied directly to the AU* operands.
+ AU2 AMax3U2(AU2 x,AU2 y,AU2 z){return max(x,max(y,z));}
+ AU3 AMax3U3(AU3 x,AU3 y,AU3 z){return max(x,max(y,z));}
+ AU4 AMax3U4(AU4 x,AU4 y,AU4 z){return max(x,max(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMaxSU1(AU1 a,AU1 b){return AU1(max(ASU1(a),ASU1(b)));} // AMaxSU*: two-operand max() on ASU*-converted values, result back to AU*.
+ AU2 AMaxSU2(AU2 a,AU2 b){return AU2(max(ASU2(a),ASU2(b)));}
+ AU3 AMaxSU3(AU3 a,AU3 b){return AU3(max(ASU3(a),ASU3(b)));}
+ AU4 AMaxSU4(AU4 a,AU4 b){return AU4(max(ASU4(a),ASU4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Clamp has an easier pattern match for med3 when some ordering is known.
+ // V_MED3_F32. AMed3F*: median of three values, computed as max(min(x,y),min(max(x,y),z)).
+ AF1 AMed3F1(AF1 x,AF1 y,AF1 z){return max(min(x,y),min(max(x,y),z));}
+ AF2 AMed3F2(AF2 x,AF2 y,AF2 z){return max(min(x,y),min(max(x,y),z));}
+ AF3 AMed3F3(AF3 x,AF3 y,AF3 z){return max(min(x,y),min(max(x,y),z));}
+ AF4 AMed3F4(AF4 x,AF4 y,AF4 z){return max(min(x,y),min(max(x,y),z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // V_MIN3_F32. AMin3F*: three-way minimum written as two nested min() calls.
+ AF1 AMin3F1(AF1 x,AF1 y,AF1 z){return min(x,min(y,z));}
+ AF2 AMin3F2(AF2 x,AF2 y,AF2 z){return min(x,min(y,z));}
+ AF3 AMin3F3(AF3 x,AF3 y,AF3 z){return min(x,min(y,z));}
+ AF4 AMin3F4(AF4 x,AF4 y,AF4 z){return min(x,min(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMin3SU1(AU1 x,AU1 y,AU1 z){return AU1(min(ASU1(x),min(ASU1(y),ASU1(z))));} // AMin3SU*: operands converted to ASU* before min(), result back to AU*.
+ AU2 AMin3SU2(AU2 x,AU2 y,AU2 z){return AU2(min(ASU2(x),min(ASU2(y),ASU2(z))));}
+ AU3 AMin3SU3(AU3 x,AU3 y,AU3 z){return AU3(min(ASU3(x),min(ASU3(y),ASU3(z))));}
+ AU4 AMin3SU4(AU4 x,AU4 y,AU4 z){return AU4(min(ASU4(x),min(ASU4(y),ASU4(z))));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMin3U1(AU1 x,AU1 y,AU1 z){return min(x,min(y,z));} // AMin3U*: three-way min() applied directly to the AU* operands.
+ AU2 AMin3U2(AU2 x,AU2 y,AU2 z){return min(x,min(y,z));}
+ AU3 AMin3U3(AU3 x,AU3 y,AU3 z){return min(x,min(y,z));}
+ AU4 AMin3U4(AU4 x,AU4 y,AU4 z){return min(x,min(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMinSU1(AU1 a,AU1 b){return AU1(min(ASU1(a),ASU1(b)));} // AMinSU*: two-operand min() on ASU*-converted values, result back to AU*.
+ AU2 AMinSU2(AU2 a,AU2 b){return AU2(min(ASU2(a),ASU2(b)));}
+ AU3 AMinSU3(AU3 a,AU3 b){return AU3(min(ASU3(a),ASU3(b)));}
+ AU4 AMinSU4(AU4 a,AU4 b){return AU4(min(ASU4(a),ASU4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
+ // V_COS_F32. ANCosF*: 'x' is multiplied by A_2PI before cos() is applied.
+ AF1 ANCosF1(AF1 x){return cos(x*AF1_(A_2PI));}
+ AF2 ANCosF2(AF2 x){return cos(x*AF2_(A_2PI));}
+ AF3 ANCosF3(AF3 x){return cos(x*AF3_(A_2PI));}
+ AF4 ANCosF4(AF4 x){return cos(x*AF4_(A_2PI));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
+ // V_SIN_F32. ANSinF*: 'x' is multiplied by A_2PI before sin() is applied.
+ AF1 ANSinF1(AF1 x){return sin(x*AF1_(A_2PI));}
+ AF2 ANSinF2(AF2 x){return sin(x*AF2_(A_2PI));}
+ AF3 ANSinF3(AF3 x){return sin(x*AF3_(A_2PI));}
+ AF4 ANSinF4(AF4 x){return sin(x*AF4_(A_2PI));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ARcpF1(AF1 x){return AF1_(1.0)/x;} // ARcpF*: reciprocal, computed as 1.0/x per component.
+ AF2 ARcpF2(AF2 x){return AF2_(1.0)/x;}
+ AF3 ARcpF3(AF3 x){return AF3_(1.0)/x;}
+ AF4 ARcpF4(AF4 x){return AF4_(1.0)/x;}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ARsqF1(AF1 x){return AF1_(1.0)/sqrt(x);} // ARsqF*: reciprocal square root, computed as 1.0/sqrt(x) per component.
+ AF2 ARsqF2(AF2 x){return AF2_(1.0)/sqrt(x);}
+ AF3 ARsqF3(AF3 x){return AF3_(1.0)/sqrt(x);}
+ AF4 ARsqF4(AF4 x){return AF4_(1.0)/sqrt(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ASatF1(AF1 x){return clamp(x,AF1_(0.0),AF1_(1.0));} // ASatF*: saturate, i.e. clamp each component between 0.0 and 1.0.
+ AF2 ASatF2(AF2 x){return clamp(x,AF2_(0.0),AF2_(1.0));}
+ AF3 ASatF3(AF3 x){return clamp(x,AF3_(0.0),AF3_(1.0));}
+ AF4 ASatF4(AF4 x){return clamp(x,AF4_(0.0),AF4_(1.0));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AShrSU1(AU1 a,AU1 b){return AU1(ASU1(a)>>ASU1(b));} // AShrSU*: right shift performed on ASU*-converted operands, result back to AU*.
+ AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));}
+ AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));}
+ AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                          GLSL BYTE
+//==============================================================================================================================
+ #ifdef A_BYTE
+  #define AB1 uint8_t
+  #define AB2 u8vec2
+  #define AB3 u8vec3
+  #define AB4 u8vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define ASB1 int8_t
+  #define ASB2 i8vec2
+  #define ASB3 i8vec3
+  #define ASB4 i8vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  AB1 AB1_x(AB1 a){return AB1(a);} // AB{1..4}_x: replicate 8-bit scalar 'a' into every component of the result.
+  AB2 AB2_x(AB1 a){return AB2(a,a);}
+  AB3 AB3_x(AB1 a){return AB3(a,a,a);}
+  AB4 AB4_x(AB1 a){return AB4(a,a,a,a);}
+  #define AB1_(a) AB1_x(AB1(a))
+  #define AB2_(a) AB2_x(AB1(a))
+  #define AB3_(a) AB3_x(AB1(a))
+  #define AB4_(a) AB4_x(AB1(a))
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                          GLSL HALF
+//==============================================================================================================================
+ #ifdef A_HALF
+  #define AH1 float16_t
+  #define AH2 f16vec2
+  #define AH3 f16vec3
+  #define AH4 f16vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AW1 uint16_t
+  #define AW2 u16vec2
+  #define AW3 u16vec3
+  #define AW4 u16vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define ASW1 int16_t
+  #define ASW2 i16vec2
+  #define ASW3 i16vec3
+  #define ASW4 i16vec4
+//==============================================================================================================================
+  #define AH2_AU1(x) unpackFloat2x16(AU1(x))
+  AH4 AH4_AU2_x(AU2 x){return AH4(unpackFloat2x16(x.x),unpackFloat2x16(x.y));} // Unpack two 32-bit words into four halves: x.x -> .xy, x.y -> .zw.
+  #define AH4_AU2(x) AH4_AU2_x(AU2(x))
+  #define AW2_AU1(x) unpackUint2x16(AU1(x))
+  #define AW4_AU2(x) unpackUint4x16(pack64(AU2(x)))
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AU1_AH2(x) packFloat2x16(AH2(x))
+  AU2 AU2_AH4_x(AH4 x){return AU2(packFloat2x16(x.xy),packFloat2x16(x.zw));} // Pack four halves into two 32-bit words: .xy -> first word, .zw -> second.
+  #define AU2_AH4(x) AU2_AH4_x(AH4(x))
+  #define AU1_AW2(x) packUint2x16(AW2(x))
+  #define AU2_AW4(x) unpack32(packUint4x16(AW4(x)))
+//==============================================================================================================================
+  #define AW1_AH1(x) halfBitsToUint16(AH1(x))
+  #define AW2_AH2(x) halfBitsToUint16(AH2(x))
+  #define AW3_AH3(x) halfBitsToUint16(AH3(x))
+  #define AW4_AH4(x) halfBitsToUint16(AH4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AH1_AW1(x) uint16BitsToHalf(AW1(x))
+  #define AH2_AW2(x) uint16BitsToHalf(AW2(x))
+  #define AH3_AW3(x) uint16BitsToHalf(AW3(x))
+  #define AH4_AW4(x) uint16BitsToHalf(AW4(x))
+//==============================================================================================================================
+  AH1 AH1_x(AH1 a){return AH1(a);} // AH{1..4}_x: replicate half scalar 'a' into every component of the result.
+  AH2 AH2_x(AH1 a){return AH2(a,a);}
+  AH3 AH3_x(AH1 a){return AH3(a,a,a);}
+  AH4 AH4_x(AH1 a){return AH4(a,a,a,a);}
+  #define AH1_(a) AH1_x(AH1(a))
+  #define AH2_(a) AH2_x(AH1(a))
+  #define AH3_(a) AH3_x(AH1(a))
+  #define AH4_(a) AH4_x(AH1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AW1_x(AW1 a){return AW1(a);} // AW{1..4}_x: replicate 16-bit scalar 'a' into every component of the result.
+  AW2 AW2_x(AW1 a){return AW2(a,a);}
+  AW3 AW3_x(AW1 a){return AW3(a,a,a);}
+  AW4 AW4_x(AW1 a){return AW4(a,a,a,a);}
+  #define AW1_(a) AW1_x(AW1(a))
+  #define AW2_(a) AW2_x(AW1(a))
+  #define AW3_(a) AW3_x(AW1(a))
+  #define AW4_(a) AW4_x(AW1(a))
+//==============================================================================================================================
+  AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));} // AAbsSW*: abs() applied after conversion to ASW*, result converted back to AW*.
+  AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));}
+  AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));}
+  AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AClampH1(AH1 x,AH1 n,AH1 m){return clamp(x,n,m);} // AClampH*: clamp 'x' between lower bound 'n' and upper bound 'm'.
+  AH2 AClampH2(AH2 x,AH2 n,AH2 m){return clamp(x,n,m);}
+  AH3 AClampH3(AH3 x,AH3 n,AH3 m){return clamp(x,n,m);}
+  AH4 AClampH4(AH4 x,AH4 n,AH4 m){return clamp(x,n,m);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AFractH1(AH1 x){return fract(x);} // AFractH*: forwards to the fract() built-in.
+  AH2 AFractH2(AH2 x){return fract(x);}
+  AH3 AFractH3(AH3 x){return fract(x);}
+  AH4 AFractH4(AH4 x){return fract(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return mix(x,y,a);} // ALerpH*: linear blend from 'x' to 'y' by weight 'a' via mix().
+  AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return mix(x,y,a);}
+  AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return mix(x,y,a);}
+  AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return mix(x,y,a);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // No packed version of max3. AMax3H*: three-way maximum written as two nested max() calls.
+  AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));}
+  AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));}
+  AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));}
+  AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASW1(a),ASW1(b)));} // AMaxSW*: signed 16-bit max; compare through ASW* (not the 32-bit ASU*).
+  AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASW2(a),ASW2(b)));}
+  AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASW3(a),ASW3(b)));}
+  AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASW4(a),ASW4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // No packed version of min3. AMin3H*: three-way minimum written as two nested min() calls.
+  AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));}
+  AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));}
+  AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));}
+  AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASW1(a),ASW1(b)));} // AMinSW*: signed 16-bit min; compare through ASW* (not the 32-bit ASU*).
+  AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASW2(a),ASW2(b)));}
+  AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASW3(a),ASW3(b)));}
+  AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASW4(a),ASW4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ARcpH1(AH1 x){return AH1_(1.0)/x;} // ARcpH*: reciprocal, computed as 1.0/x per component.
+  AH2 ARcpH2(AH2 x){return AH2_(1.0)/x;}
+  AH3 ARcpH3(AH3 x){return AH3_(1.0)/x;}
+  AH4 ARcpH4(AH4 x){return AH4_(1.0)/x;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ARsqH1(AH1 x){return AH1_(1.0)/sqrt(x);} // ARsqH*: reciprocal square root, computed as 1.0/sqrt(x) per component.
+  AH2 ARsqH2(AH2 x){return AH2_(1.0)/sqrt(x);}
+  AH3 ARsqH3(AH3 x){return AH3_(1.0)/sqrt(x);}
+  AH4 ARsqH4(AH4 x){return AH4_(1.0)/sqrt(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ASatH1(AH1 x){return clamp(x,AH1_(0.0),AH1_(1.0));} // ASatH*: saturate, i.e. clamp each component between 0.0 and 1.0.
+  AH2 ASatH2(AH2 x){return clamp(x,AH2_(0.0),AH2_(1.0));}
+  AH3 ASatH3(AH3 x){return clamp(x,AH3_(0.0),AH3_(1.0));}
+  AH4 ASatH4(AH4 x){return clamp(x,AH4_(0.0),AH4_(1.0));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));} // AShrSW*: right shift performed on ASW*-converted operands, result back to AW*.
+  AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));}
+  AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));}
+  AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                         GLSL DOUBLE
+//==============================================================================================================================
+ #ifdef A_DUBL
+  #define AD1 double
+  #define AD2 dvec2
+  #define AD3 dvec3
+  #define AD4 dvec4
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 AD1_x(AD1 a){return AD1(a);} // AD{1..4}_x: replicate double scalar 'a' into every component of the result.
+  AD2 AD2_x(AD1 a){return AD2(a,a);}
+  AD3 AD3_x(AD1 a){return AD3(a,a,a);}
+  AD4 AD4_x(AD1 a){return AD4(a,a,a,a);}
+  #define AD1_(a) AD1_x(AD1(a))
+  #define AD2_(a) AD2_x(AD1(a))
+  #define AD3_(a) AD3_x(AD1(a))
+  #define AD4_(a) AD4_x(AD1(a))
+//==============================================================================================================================
+  AD1 AFractD1(AD1 x){return fract(x);} // AFractD*: forwards to the fract() built-in.
+  AD2 AFractD2(AD2 x){return fract(x);}
+  AD3 AFractD3(AD3 x){return fract(x);}
+  AD4 AFractD4(AD4 x){return fract(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return mix(x,y,a);} // ALerpD*: linear blend from 'x' to 'y' by weight 'a' via mix().
+  AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return mix(x,y,a);}
+  AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return mix(x,y,a);}
+  AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return mix(x,y,a);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 ARcpD1(AD1 x){return AD1_(1.0)/x;} // ARcpD*: reciprocal, computed as 1.0/x per component.
+  AD2 ARcpD2(AD2 x){return AD2_(1.0)/x;}
+  AD3 ARcpD3(AD3 x){return AD3_(1.0)/x;}
+  AD4 ARcpD4(AD4 x){return AD4_(1.0)/x;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 ARsqD1(AD1 x){return AD1_(1.0)/sqrt(x);} // ARsqD*: reciprocal square root, computed as 1.0/sqrt(x) per component.
+  AD2 ARsqD2(AD2 x){return AD2_(1.0)/sqrt(x);}
+  AD3 ARsqD3(AD3 x){return AD3_(1.0)/sqrt(x);}
+  AD4 ARsqD4(AD4 x){return AD4_(1.0)/sqrt(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 ASatD1(AD1 x){return clamp(x,AD1_(0.0),AD1_(1.0));} // ASatD*: saturate, i.e. clamp each component between 0.0 and 1.0.
+  AD2 ASatD2(AD2 x){return clamp(x,AD2_(0.0),AD2_(1.0));}
+  AD3 ASatD3(AD3 x){return clamp(x,AD3_(0.0),AD3_(1.0));}
+  AD4 ASatD4(AD4 x){return clamp(x,AD4_(0.0),AD4_(1.0));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                         GLSL LONG
+//==============================================================================================================================
+ #ifdef A_LONG
+  #define AL1 uint64_t
+  #define AL2 u64vec2
+  #define AL3 u64vec3
+  #define AL4 u64vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define ASL1 int64_t
+  #define ASL2 i64vec2
+  #define ASL3 i64vec3
+  #define ASL4 i64vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AL1_AU2(x) packUint2x32(AU2(x))
+  #define AU2_AL1(x) unpackUint2x32(AL1(x))
+//------------------------------------------------------------------------------------------------------------------------------
+  AL1 AL1_x(AL1 a){return AL1(a);} // AL{1..4}_x: replicate 64-bit scalar 'a' into every component of the result.
+  AL2 AL2_x(AL1 a){return AL2(a,a);}
+  AL3 AL3_x(AL1 a){return AL3(a,a,a);}
+  AL4 AL4_x(AL1 a){return AL4(a,a,a,a);}
+  #define AL1_(a) AL1_x(AL1(a))
+  #define AL2_(a) AL2_x(AL1(a))
+  #define AL3_(a) AL3_x(AL1(a))
+  #define AL4_(a) AL4_x(AL1(a))
+//==============================================================================================================================
+  AL1 AAbsSL1(AL1 a){return AL1(abs(ASL1(a)));} // AAbsSL*: abs() applied after conversion to ASL*, result converted back to AL*.
+  AL2 AAbsSL2(AL2 a){return AL2(abs(ASL2(a)));}
+  AL3 AAbsSL3(AL3 a){return AL3(abs(ASL3(a)));}
+  AL4 AAbsSL4(AL4 a){return AL4(abs(ASL4(a)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AL1 AMaxSL1(AL1 a,AL1 b){return AL1(max(ASL1(a),ASL1(b)));} // AMaxSL*: signed 64-bit max; compare through ASL* (ASU* would truncate to 32 bits).
+  AL2 AMaxSL2(AL2 a,AL2 b){return AL2(max(ASL2(a),ASL2(b)));}
+  AL3 AMaxSL3(AL3 a,AL3 b){return AL3(max(ASL3(a),ASL3(b)));}
+  AL4 AMaxSL4(AL4 a,AL4 b){return AL4(max(ASL4(a),ASL4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AL1 AMinSL1(AL1 a,AL1 b){return AL1(min(ASL1(a),ASL1(b)));} // AMinSL*: signed 64-bit min; compare through ASL* (ASU* would truncate to 32 bits).
+  AL2 AMinSL2(AL2 a,AL2 b){return AL2(min(ASL2(a),ASL2(b)));}
+  AL3 AMinSL3(AL3 a,AL3 b){return AL3(min(ASL3(a),ASL3(b)));}
+  AL4 AMinSL4(AL4 a,AL4 b){return AL4(min(ASL4(a),ASL4(b)));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                      WAVE OPERATIONS
+//==============================================================================================================================
+ #ifdef A_WAVE
+  // AWaveXor*: forward 'v' and 'x' to subgroupShuffleXor(). Where 'x' must be a compile time literal.
+  AF1 AWaveXorF1(AF1 v,AU1 x){return subgroupShuffleXor(v,x);}
+  AF2 AWaveXorF2(AF2 v,AU1 x){return subgroupShuffleXor(v,x);}
+  AF3 AWaveXorF3(AF3 v,AU1 x){return subgroupShuffleXor(v,x);}
+  AF4 AWaveXorF4(AF4 v,AU1 x){return subgroupShuffleXor(v,x);}
+  AU1 AWaveXorU1(AU1 v,AU1 x){return subgroupShuffleXor(v,x);}
+  AU2 AWaveXorU2(AU2 v,AU1 x){return subgroupShuffleXor(v,x);}
+  AU3 AWaveXorU3(AU3 v,AU1 x){return subgroupShuffleXor(v,x);}
+  AU4 AWaveXorU4(AU4 v,AU1 x){return subgroupShuffleXor(v,x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  #ifdef A_HALF
+   AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(subgroupShuffleXor(AU1_AH2(v),x));} // 16-bit variants: pack to AU1/AU2, shuffle, then unpack.
+   AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(subgroupShuffleXor(AU2_AH4(v),x));}
+   AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(subgroupShuffleXor(AU1_AW2(v),x));}
+   AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU2(subgroupShuffleXor(AU2_AW4(v),x));}
+  #endif
+ #endif
+//==============================================================================================================================
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+//                                                            HLSL
+//
+//
+//==============================================================================================================================
+#if defined(A_HLSL) && defined(A_GPU)
+ #ifdef A_HLSL_6_2
+  #define AP1 bool
+  #define AP2 bool2
+  #define AP3 bool3
+  #define AP4 bool4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AF1 float32_t
+  #define AF2 float32_t2
+  #define AF3 float32_t3
+  #define AF4 float32_t4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AU1 uint32_t
+  #define AU2 uint32_t2
+  #define AU3 uint32_t3
+  #define AU4 uint32_t4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define ASU1 int32_t
+  #define ASU2 int32_t2
+  #define ASU3 int32_t3
+  #define ASU4 int32_t4
+ #else
+  #define AP1 bool
+  #define AP2 bool2
+  #define AP3 bool3
+  #define AP4 bool4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AF1 float
+  #define AF2 float2
+  #define AF3 float3
+  #define AF4 float4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AU1 uint
+  #define AU2 uint2
+  #define AU3 uint3
+  #define AU4 uint4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define ASU1 int
+  #define ASU2 int2
+  #define ASU3 int3
+  #define ASU4 int4
+ #endif
+//==============================================================================================================================
+ #define AF1_AU1(x) asfloat(AU1(x))
+ #define AF2_AU2(x) asfloat(AU2(x))
+ #define AF3_AU3(x) asfloat(AU3(x))
+ #define AF4_AU4(x) asfloat(AU4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AU1_AF1(x) asuint(AF1(x))
+ #define AU2_AF2(x) asuint(AF2(x))
+ #define AU3_AF3(x) asuint(AF3(x))
+ #define AU4_AF4(x) asuint(AF4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AU1_AH1_AF1_x(AF1 a){return f32tof16(a);} // Returns the f32tof16() conversion of 'a' as an AU1.
+ #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AU1_AH2_AF2_x(AF2 a){return f32tof16(a.x)|(f32tof16(a.y)<<16);} // f32tof16(a.x) in the low 16 bits, f32tof16(a.y) shifted into the high 16.
+ #define AU1_AH2_AF2(a) AU1_AH2_AF2_x(AF2(a)) 
+ #define AU1_AB4Unorm_AF4(x) D3DCOLORtoUBYTE4(AF4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 AF2_AH2_AU1_x(AU1 x){return AF2(f16tof32(x&0xFFFF),f16tof32(x>>16));} // Low 16 bits -> .x, high 16 bits -> .y, each through f16tof32().
+ #define AF2_AH2_AU1(x) AF2_AH2_AU1_x(AU1(x))
+//==============================================================================================================================
+ AF1 AF1_x(AF1 a){return AF1(a);} // AF{1..4}_x: replicate scalar 'a' into every component of the result.
+ AF2 AF2_x(AF1 a){return AF2(a,a);}
+ AF3 AF3_x(AF1 a){return AF3(a,a,a);}
+ AF4 AF4_x(AF1 a){return AF4(a,a,a,a);}
+ #define AF1_(a) AF1_x(AF1(a))
+ #define AF2_(a) AF2_x(AF1(a))
+ #define AF3_(a) AF3_x(AF1(a))
+ #define AF4_(a) AF4_x(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AU1_x(AU1 a){return AU1(a);} // AU{1..4}_x: replicate scalar 'a' into every component of the AU* result.
+ AU2 AU2_x(AU1 a){return AU2(a,a);}
+ AU3 AU3_x(AU1 a){return AU3(a,a,a);}
+ AU4 AU4_x(AU1 a){return AU4(a,a,a,a);}
+ #define AU1_(a) AU1_x(AU1(a))
+ #define AU2_(a) AU2_x(AU1(a))
+ #define AU3_(a) AU3_x(AU1(a))
+ #define AU4_(a) AU4_x(AU1(a))
+//==============================================================================================================================
+ AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} // AAbsSU*: abs() applied after conversion to ASU*, result converted back to AU*.
+ AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));}
+ AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));}
+ AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 ABfe(AU1 src,AU1 off,AU1 bits){AU1 mask=(1u<<bits)-1;return (src>>off)&mask;} // Extract 'bits' bits at 'off'. NOTE(review): mask assumes bits<32 - confirm.
+ AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} // Take 'ins' where 'mask' bits are set and 'src' elsewhere.
+ AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){AU1 mask=(1u<<bits)-1;return (ins&mask)|(src&(~mask));} // ABfi with mask=(1u<<bits)-1. NOTE(review): assumes bits<32 - confirm.
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 AClampF1(AF1 x,AF1 n,AF1 m){return max(n,min(x,m));} // AClampF*: clamp written out as max(n,min(x,m)); 'n' is the lower bound, 'm' the upper.
+ AF2 AClampF2(AF2 x,AF2 n,AF2 m){return max(n,min(x,m));}
+ AF3 AClampF3(AF3 x,AF3 n,AF3 m){return max(n,min(x,m));}
+ AF4 AClampF4(AF4 x,AF4 n,AF4 m){return max(n,min(x,m));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 AFractF1(AF1 x){return x-floor(x);} // AFractF*: fractional part computed explicitly as x-floor(x).
+ AF2 AFractF2(AF2 x){return x-floor(x);}
+ AF3 AFractF3(AF3 x){return x-floor(x);}
+ AF4 AFractF4(AF4 x){return x-floor(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ALerpF1(AF1 x,AF1 y,AF1 a){return lerp(x,y,a);} // ALerpF*: linear blend from 'x' to 'y' by weight 'a' via lerp().
+ AF2 ALerpF2(AF2 x,AF2 y,AF2 a){return lerp(x,y,a);}
+ AF3 ALerpF3(AF3 x,AF3 y,AF3 a){return lerp(x,y,a);}
+ AF4 ALerpF4(AF4 x,AF4 y,AF4 a){return lerp(x,y,a);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 AMax3F1(AF1 x,AF1 y,AF1 z){return max(x,max(y,z));}
+ AF2 AMax3F2(AF2 x,AF2 y,AF2 z){return max(x,max(y,z));}
+ AF3 AMax3F3(AF3 x,AF3 y,AF3 z){return max(x,max(y,z));}
+ AF4 AMax3F4(AF4 x,AF4 y,AF4 z){return max(x,max(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){return AU1(max(ASU1(x),max(ASU1(y),ASU1(z))));}
+ AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){return AU2(max(ASU2(x),max(ASU2(y),ASU2(z))));}
+ AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){return AU3(max(ASU3(x),max(ASU3(y),ASU3(z))));}
+ AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){return AU4(max(ASU4(x),max(ASU4(y),ASU4(z))));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Maximum of three AU* values: max(x,max(y,z)).
+ AU1 AMax3U1(AU1 x,AU1 y,AU1 z){AU1 yz=max(y,z);return max(x,yz);}
+ AU2 AMax3U2(AU2 x,AU2 y,AU2 z){AU2 yz=max(y,z);return max(x,yz);}
+ AU3 AMax3U3(AU3 x,AU3 y,AU3 z){AU3 yz=max(y,z);return max(x,yz);}
+ AU4 AMax3U4(AU4 x,AU4 y,AU4 z){AU4 yz=max(y,z);return max(x,yz);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Maximum of two values compared after conversion to the ASU* type, returned as AU*.
+ AU1 AMaxSU1(AU1 a,AU1 b){ASU1 sa=ASU1(a);ASU1 sb=ASU1(b);return AU1(max(sa,sb));}
+ AU2 AMaxSU2(AU2 a,AU2 b){ASU2 sa=ASU2(a);ASU2 sb=ASU2(b);return AU2(max(sa,sb));}
+ AU3 AMaxSU3(AU3 a,AU3 b){ASU3 sa=ASU3(a);ASU3 sb=ASU3(b);return AU3(max(sa,sb));}
+ AU4 AMaxSU4(AU4 a,AU4 b){ASU4 sa=ASU4(a);ASU4 sb=ASU4(b);return AU4(max(sa,sb));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Median of three: the larger of min(x,y) and min(max(x,y),z).
+ AF1 AMed3F1(AF1 x,AF1 y,AF1 z){AF1 lo=min(x,y);AF1 hi=max(x,y);return max(lo,min(hi,z));}
+ AF2 AMed3F2(AF2 x,AF2 y,AF2 z){AF2 lo=min(x,y);AF2 hi=max(x,y);return max(lo,min(hi,z));}
+ AF3 AMed3F3(AF3 x,AF3 y,AF3 z){AF3 lo=min(x,y);AF3 hi=max(x,y);return max(lo,min(hi,z));}
+ AF4 AMed3F4(AF4 x,AF4 y,AF4 z){AF4 lo=min(x,y);AF4 hi=max(x,y);return max(lo,min(hi,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Minimum of three floats: min(x,min(y,z)).
+ AF1 AMin3F1(AF1 x,AF1 y,AF1 z){AF1 yz=min(y,z);return min(x,yz);}
+ AF2 AMin3F2(AF2 x,AF2 y,AF2 z){AF2 yz=min(y,z);return min(x,yz);}
+ AF3 AMin3F3(AF3 x,AF3 y,AF3 z){AF3 yz=min(y,z);return min(x,yz);}
+ AF4 AMin3F4(AF4 x,AF4 y,AF4 z){AF4 yz=min(y,z);return min(x,yz);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Minimum of three values compared after conversion to the ASU* type, returned as AU*.
+ AU1 AMin3SU1(AU1 x,AU1 y,AU1 z){ASU1 yz=min(ASU1(y),ASU1(z));return AU1(min(ASU1(x),yz));}
+ AU2 AMin3SU2(AU2 x,AU2 y,AU2 z){ASU2 yz=min(ASU2(y),ASU2(z));return AU2(min(ASU2(x),yz));}
+ AU3 AMin3SU3(AU3 x,AU3 y,AU3 z){ASU3 yz=min(ASU3(y),ASU3(z));return AU3(min(ASU3(x),yz));}
+ AU4 AMin3SU4(AU4 x,AU4 y,AU4 z){ASU4 yz=min(ASU4(y),ASU4(z));return AU4(min(ASU4(x),yz));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Minimum of three AU* values: min(x,min(y,z)).
+ AU1 AMin3U1(AU1 x,AU1 y,AU1 z){AU1 yz=min(y,z);return min(x,yz);}
+ AU2 AMin3U2(AU2 x,AU2 y,AU2 z){AU2 yz=min(y,z);return min(x,yz);}
+ AU3 AMin3U3(AU3 x,AU3 y,AU3 z){AU3 yz=min(y,z);return min(x,yz);}
+ AU4 AMin3U4(AU4 x,AU4 y,AU4 z){AU4 yz=min(y,z);return min(x,yz);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Minimum of two values compared after conversion to the ASU* type, returned as AU*.
+ AU1 AMinSU1(AU1 a,AU1 b){ASU1 sa=ASU1(a);ASU1 sb=ASU1(b);return AU1(min(sa,sb));}
+ AU2 AMinSU2(AU2 a,AU2 b){ASU2 sa=ASU2(a);ASU2 sb=ASU2(b);return AU2(min(sa,sb));}
+ AU3 AMinSU3(AU3 a,AU3 b){ASU3 sa=ASU3(a);ASU3 sb=ASU3(b);return AU3(min(sa,sb));}
+ AU4 AMinSU4(AU4 a,AU4 b){ASU4 sa=ASU4(a);ASU4 sb=ASU4(b);return AU4(min(sa,sb));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Cosine of 'x' after scaling it by A_2PI.
+ AF1 ANCosF1(AF1 x){AF1 angle=x*AF1_(A_2PI);return cos(angle);}
+ AF2 ANCosF2(AF2 x){AF2 angle=x*AF2_(A_2PI);return cos(angle);}
+ AF3 ANCosF3(AF3 x){AF3 angle=x*AF3_(A_2PI);return cos(angle);}
+ AF4 ANCosF4(AF4 x){AF4 angle=x*AF4_(A_2PI);return cos(angle);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Sine of 'x' after scaling it by A_2PI.
+ AF1 ANSinF1(AF1 x){AF1 angle=x*AF1_(A_2PI);return sin(angle);}
+ AF2 ANSinF2(AF2 x){AF2 angle=x*AF2_(A_2PI);return sin(angle);}
+ AF3 ANSinF3(AF3 x){AF3 angle=x*AF3_(A_2PI);return sin(angle);}
+ AF4 ANSinF4(AF4 x){AF4 angle=x*AF4_(A_2PI);return sin(angle);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Portable names for the HLSL rcp() intrinsic.
+ AF1 ARcpF1(AF1 x){AF1 inv=rcp(x);return inv;}
+ AF2 ARcpF2(AF2 x){AF2 inv=rcp(x);return inv;}
+ AF3 ARcpF3(AF3 x){AF3 inv=rcp(x);return inv;}
+ AF4 ARcpF4(AF4 x){AF4 inv=rcp(x);return inv;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Portable names for the HLSL rsqrt() intrinsic.
+ AF1 ARsqF1(AF1 x){AF1 invRoot=rsqrt(x);return invRoot;}
+ AF2 ARsqF2(AF2 x){AF2 invRoot=rsqrt(x);return invRoot;}
+ AF3 ARsqF3(AF3 x){AF3 invRoot=rsqrt(x);return invRoot;}
+ AF4 ARsqF4(AF4 x){AF4 invRoot=rsqrt(x);return invRoot;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Portable names for the HLSL saturate() intrinsic.
+ AF1 ASatF1(AF1 x){AF1 sat=saturate(x);return sat;}
+ AF2 ASatF2(AF2 x){AF2 sat=saturate(x);return sat;}
+ AF3 ASatF3(AF3 x){AF3 sat=saturate(x);return sat;}
+ AF4 ASatF4(AF4 x){AF4 sat=saturate(x);return sat;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Right shift performed on the ASU* conversion of both operands, returned as AU*.
+ AU1 AShrSU1(AU1 a,AU1 b){ASU1 value=ASU1(a);ASU1 count=ASU1(b);return AU1(value>>count);}
+ AU2 AShrSU2(AU2 a,AU2 b){ASU2 value=ASU2(a);ASU2 count=ASU2(b);return AU2(value>>count);}
+ AU3 AShrSU3(AU3 a,AU3 b){ASU3 value=ASU3(a);ASU3 count=ASU3(b);return AU3(value>>count);}
+ AU4 AShrSU4(AU4 a,AU4 b){ASU4 value=ASU4(a);ASU4 count=ASU4(b);return AU4(value>>count);}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                          HLSL BYTE
+//==============================================================================================================================
+ #ifdef A_BYTE
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                          HLSL HALF
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Type aliases for the 16-bit path: AH* = half float, AW* = unsigned 16-bit, ASW* = signed 16-bit.
+  // With A_HLSL_6_2 the explicit 16-bit types are used; otherwise the min16* minimum-precision types.
+  #ifdef A_HLSL_6_2
+   #define AH1 float16_t
+   #define AH2 float16_t2
+   #define AH3 float16_t3
+   #define AH4 float16_t4
+//------------------------------------------------------------------------------------------------------------------------------
+   #define AW1 uint16_t
+   #define AW2 uint16_t2
+   #define AW3 uint16_t3
+   #define AW4 uint16_t4
+//------------------------------------------------------------------------------------------------------------------------------
+   #define ASW1 int16_t
+   #define ASW2 int16_t2
+   #define ASW3 int16_t3
+   #define ASW4 int16_t4
+  #else
+   // Fallback when A_HLSL_6_2 is not defined.
+   #define AH1 min16float
+   #define AH2 min16float2
+   #define AH3 min16float3
+   #define AH4 min16float4
+//------------------------------------------------------------------------------------------------------------------------------
+   #define AW1 min16uint
+   #define AW2 min16uint2
+   #define AW3 min16uint3
+   #define AW4 min16uint4
+//------------------------------------------------------------------------------------------------------------------------------
+   #define ASW1 min16int
+   #define ASW2 min16int2
+   #define ASW3 min16int3
+   #define ASW4 min16int4
+  #endif
+//==============================================================================================================================
+  // Need to use manual unpack to get optimal execution (don't use packed types in buffers directly).
+  // Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/
+  // Split a 32-bit word into its low and high 16-bit halves and widen them to a pair of halves / words.
+  AH2 AH2_AU1_x(AU1 x){AU2 halves=AU2(x&0xFFFF,x>>16);AF2 wide=f16tof32(halves);return AH2(wide);}
+  AH4 AH4_AU2_x(AU2 x){AH2 lo=AH2_AU1_x(x.x);AH2 hi=AH2_AU1_x(x.y);return AH4(lo,hi);}
+  AW2 AW2_AU1_x(AU1 x){return AW2(AU2(x&0xFFFF,x>>16));}
+  AW4 AW4_AU2_x(AU2 x){AW2 lo=AW2_AU1_x(x.x);AW2 hi=AW2_AU1_x(x.y);return AW4(lo,hi);}
+  // Macro front-ends that cast the argument to the expected packed type first.
+  #define AH2_AU1(x) AH2_AU1_x(AU1(x))
+  #define AH4_AU2(x) AH4_AU2_x(AU2(x))
+  #define AW2_AU1(x) AW2_AU1_x(AU1(x))
+  #define AW4_AU2(x) AW4_AU2_x(AU2(x))
+//------------------------------------------------------------------------------------------------------------------------------
+  // Pack two 16-bit values into one 32-bit word: '.x' in the low half, '.y' shifted into the high half.
+  AU1 AU1_AH2_x(AH2 x){AU1 lo=f32tof16(x.x);AU1 hi=f32tof16(x.y)<<16;return lo+hi;}
+  AU2 AU2_AH4_x(AH4 x){AU1 first=AU1_AH2_x(x.xy);AU1 second=AU1_AH2_x(x.zw);return AU2(first,second);}
+  AU1 AU1_AW2_x(AW2 x){AU1 lo=AU1(x.x);AU1 hi=AU1(x.y)<<16;return lo+hi;}
+  AU2 AU2_AW4_x(AW4 x){AU1 first=AU1_AW2_x(x.xy);AU1 second=AU1_AW2_x(x.zw);return AU2(first,second);}
+  // Macro front-ends that cast the argument to the expected 16-bit vector type first.
+  #define AU1_AH2(x) AU1_AH2_x(AH2(x))
+  #define AU2_AH4(x) AU2_AH4_x(AH4(x))
+  #define AU1_AW2(x) AU1_AW2_x(AW2(x))
+  #define AU2_AW4(x) AU2_AW4_x(AW4(x))
+//==============================================================================================================================
+  // Half float -> 16-bit unsigned conversion macros.
+  // Uses asuint16() when A_HLSL_6_2 is defined and A_NO_16_BIT_CAST is not;
+  // otherwise goes through f32tof16() one component at a time.
+  #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST)
+   #define AW1_AH1(x) asuint16(x)
+   #define AW2_AH2(x) asuint16(x)
+   #define AW3_AH3(x) asuint16(x)
+   #define AW4_AH4(x) asuint16(x)
+  #else
+   // Note the vector forms expand their argument once per component.
+   #define AW1_AH1(a) AW1(f32tof16(AF1(a)))
+   #define AW2_AH2(a) AW2(AW1_AH1((a).x),AW1_AH1((a).y))
+   #define AW3_AH3(a) AW3(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z))
+   #define AW4_AH4(a) AW4(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z),AW1_AH1((a).w))
+  #endif
+//------------------------------------------------------------------------------------------------------------------------------
+  // 16-bit unsigned -> half float conversion macros.
+  // Uses asfloat16() when A_HLSL_6_2 is defined and A_NO_16_BIT_CAST is not;
+  // otherwise goes through f16tof32() one component at a time.
+  #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST)
+   #define AH1_AW1(x) asfloat16(x)
+   #define AH2_AW2(x) asfloat16(x)
+   #define AH3_AW3(x) asfloat16(x)
+   #define AH4_AW4(x) asfloat16(x)
+  #else
+   // Note the vector forms expand their argument once per component.
+   #define AH1_AW1(a) AH1(f16tof32(AU1(a)))
+   #define AH2_AW2(a) AH2(AH1_AW1((a).x),AH1_AW1((a).y))
+   #define AH3_AW3(a) AH3(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z))
+   #define AH4_AW4(a) AH4(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z),AH1_AW1((a).w))
+  #endif
+//==============================================================================================================================
+  // Replicate one half scalar across every component of the result.
+  AH1 AH1_x(AH1 a){AH1 r=AH1(a);return r;}
+  AH2 AH2_x(AH1 a){AH2 r=AH2(a,a);return r;}
+  AH3 AH3_x(AH1 a){AH3 r=AH3(a,a,a);return r;}
+  AH4 AH4_x(AH1 a){AH4 r=AH4(a,a,a,a);return r;}
+  // Macro front-ends: cast the argument to AH1, then replicate.
+  #define AH1_(a) AH1_x(AH1(a))
+  #define AH2_(a) AH2_x(AH1(a))
+  #define AH3_(a) AH3_x(AH1(a))
+  #define AH4_(a) AH4_x(AH1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+  // Replicate one 16-bit unsigned scalar across every component of the result.
+  AW1 AW1_x(AW1 a){AW1 r=AW1(a);return r;}
+  AW2 AW2_x(AW1 a){AW2 r=AW2(a,a);return r;}
+  AW3 AW3_x(AW1 a){AW3 r=AW3(a,a,a);return r;}
+  AW4 AW4_x(AW1 a){AW4 r=AW4(a,a,a,a);return r;}
+  // Macro front-ends: cast the argument to AW1, then replicate.
+  #define AW1_(a) AW1_x(AW1(a))
+  #define AW2_(a) AW2_x(AW1(a))
+  #define AW3_(a) AW3_x(AW1(a))
+  #define AW4_(a) AW4_x(AW1(a))
+//==============================================================================================================================
+  // Absolute value taken on the ASW* (signed 16-bit) conversion, returned as AW*.
+  AW1 AAbsSW1(AW1 a){ASW1 s=ASW1(a);return AW1(abs(s));}
+  AW2 AAbsSW2(AW2 a){ASW2 s=ASW2(a);return AW2(abs(s));}
+  AW3 AAbsSW3(AW3 a){ASW3 s=ASW3(a);return AW3(abs(s));}
+  AW4 AAbsSW4(AW4 a){ASW4 s=ASW4(a);return AW4(abs(s));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Clamp 'x' to {n,m}: cap at 'm' first, then raise the result to at least 'n'.
+  AH1 AClampH1(AH1 x,AH1 n,AH1 m){AH1 capped=min(x,m);return max(n,capped);}
+  AH2 AClampH2(AH2 x,AH2 n,AH2 m){AH2 capped=min(x,m);return max(n,capped);}
+  AH3 AClampH3(AH3 x,AH3 n,AH3 m){AH3 capped=min(x,m);return max(n,capped);}
+  AH4 AClampH4(AH4 x,AH4 n,AH4 m){AH4 capped=min(x,m);return max(n,capped);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // V_FRACT_F16 (note DX frac() is different).
+  // Fractional part, computed explicitly as x-floor(x) rather than with frac().
+  AH1 AFractH1(AH1 x){AH1 whole=floor(x);return x-whole;}
+  AH2 AFractH2(AH2 x){AH2 whole=floor(x);return x-whole;}
+  AH3 AFractH3(AH3 x){AH3 whole=floor(x);return x-whole;}
+  AH4 AFractH4(AH4 x){AH4 whole=floor(x);return x-whole;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Portable names for the HLSL lerp() intrinsic on half types.
+  AH1 ALerpH1(AH1 x,AH1 y,AH1 a){AH1 blended=lerp(x,y,a);return blended;}
+  AH2 ALerpH2(AH2 x,AH2 y,AH2 a){AH2 blended=lerp(x,y,a);return blended;}
+  AH3 ALerpH3(AH3 x,AH3 y,AH3 a){AH3 blended=lerp(x,y,a);return blended;}
+  AH4 ALerpH4(AH4 x,AH4 y,AH4 a){AH4 blended=lerp(x,y,a);return blended;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Maximum of three halves: max(x,max(y,z)).
+  AH1 AMax3H1(AH1 x,AH1 y,AH1 z){AH1 yz=max(y,z);return max(x,yz);}
+  AH2 AMax3H2(AH2 x,AH2 y,AH2 z){AH2 yz=max(y,z);return max(x,yz);}
+  AH3 AMax3H3(AH3 x,AH3 y,AH3 z){AH3 yz=max(y,z);return max(x,yz);}
+  AH4 AMax3H4(AH4 x,AH4 y,AH4 z){AH4 yz=max(y,z);return max(x,yz);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Signed 16-bit maximum: both operands are converted to ASW* (signed 16-bit) before the compare
+  // and the result is returned as AW*. Converting through the 32-bit ASU* types instead would make
+  // every 16-bit input non-negative (0xFFFF -> 65535), turning this into an unsigned maximum.
+  // This matches the ASW* usage in AAbsSW* and AShrSW*.
+  AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASW1(a),ASW1(b)));}
+  AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASW2(a),ASW2(b)));}
+  AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASW3(a),ASW3(b)));}
+  AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASW4(a),ASW4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Minimum of three halves: min(x,min(y,z)).
+  AH1 AMin3H1(AH1 x,AH1 y,AH1 z){AH1 yz=min(y,z);return min(x,yz);}
+  AH2 AMin3H2(AH2 x,AH2 y,AH2 z){AH2 yz=min(y,z);return min(x,yz);}
+  AH3 AMin3H3(AH3 x,AH3 y,AH3 z){AH3 yz=min(y,z);return min(x,yz);}
+  AH4 AMin3H4(AH4 x,AH4 y,AH4 z){AH4 yz=min(y,z);return min(x,yz);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Signed 16-bit minimum: both operands are converted to ASW* (signed 16-bit) before the compare
+  // and the result is returned as AW*. Converting through the 32-bit ASU* types instead would make
+  // every 16-bit input non-negative (0xFFFF -> 65535), turning this into an unsigned minimum.
+  // This matches the ASW* usage in AAbsSW* and AShrSW*.
+  AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASW1(a),ASW1(b)));}
+  AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASW2(a),ASW2(b)));}
+  AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASW3(a),ASW3(b)));}
+  AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASW4(a),ASW4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Portable names for the HLSL rcp() intrinsic on half types.
+  AH1 ARcpH1(AH1 x){AH1 inv=rcp(x);return inv;}
+  AH2 ARcpH2(AH2 x){AH2 inv=rcp(x);return inv;}
+  AH3 ARcpH3(AH3 x){AH3 inv=rcp(x);return inv;}
+  AH4 ARcpH4(AH4 x){AH4 inv=rcp(x);return inv;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Portable names for the HLSL rsqrt() intrinsic on half types.
+  AH1 ARsqH1(AH1 x){AH1 invRoot=rsqrt(x);return invRoot;}
+  AH2 ARsqH2(AH2 x){AH2 invRoot=rsqrt(x);return invRoot;}
+  AH3 ARsqH3(AH3 x){AH3 invRoot=rsqrt(x);return invRoot;}
+  AH4 ARsqH4(AH4 x){AH4 invRoot=rsqrt(x);return invRoot;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Portable names for the HLSL saturate() intrinsic on half types.
+  AH1 ASatH1(AH1 x){AH1 sat=saturate(x);return sat;}
+  AH2 ASatH2(AH2 x){AH2 sat=saturate(x);return sat;}
+  AH3 ASatH3(AH3 x){AH3 sat=saturate(x);return sat;}
+  AH4 ASatH4(AH4 x){AH4 sat=saturate(x);return sat;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Right shift performed on the ASW* (signed 16-bit) conversion of both operands, returned as AW*.
+  AW1 AShrSW1(AW1 a,AW1 b){ASW1 value=ASW1(a);ASW1 count=ASW1(b);return AW1(value>>count);}
+  AW2 AShrSW2(AW2 a,AW2 b){ASW2 value=ASW2(a);ASW2 count=ASW2(b);return AW2(value>>count);}
+  AW3 AShrSW3(AW3 a,AW3 b){ASW3 value=ASW3(a);ASW3 count=ASW3(b);return AW3(value>>count);}
+  AW4 AShrSW4(AW4 a,AW4 b){ASW4 value=ASW4(a);ASW4 count=ASW4(b);return AW4(value>>count);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                         HLSL DOUBLE
+//==============================================================================================================================
+ #ifdef A_DUBL
+  // Type aliases for the double path: float64_t* when A_HLSL_6_2 is defined, plain double* otherwise.
+  #ifdef A_HLSL_6_2
+   #define AD1 float64_t
+   #define AD2 float64_t2
+   #define AD3 float64_t3
+   #define AD4 float64_t4
+  #else
+   #define AD1 double
+   #define AD2 double2
+   #define AD3 double3
+   #define AD4 double4
+  #endif
+//------------------------------------------------------------------------------------------------------------------------------
+  // Replicate one double scalar across every component of the result.
+  AD1 AD1_x(AD1 a){AD1 r=AD1(a);return r;}
+  AD2 AD2_x(AD1 a){AD2 r=AD2(a,a);return r;}
+  AD3 AD3_x(AD1 a){AD3 r=AD3(a,a,a);return r;}
+  AD4 AD4_x(AD1 a){AD4 r=AD4(a,a,a,a);return r;}
+  // Macro front-ends: cast the argument to AD1, then replicate.
+  #define AD1_(a) AD1_x(AD1(a))
+  #define AD2_(a) AD2_x(AD1(a))
+  #define AD3_(a) AD3_x(AD1(a))
+  #define AD4_(a) AD4_x(AD1(a))
+//==============================================================================================================================
+  // Fractional part, computed explicitly as a-floor(a).
+  AD1 AFractD1(AD1 a){AD1 whole=floor(a);return a-whole;}
+  AD2 AFractD2(AD2 a){AD2 whole=floor(a);return a-whole;}
+  AD3 AFractD3(AD3 a){AD3 whole=floor(a);return a-whole;}
+  AD4 AFractD4(AD4 a){AD4 whole=floor(a);return a-whole;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Portable names for the HLSL lerp() intrinsic on double types.
+  AD1 ALerpD1(AD1 x,AD1 y,AD1 a){AD1 blended=lerp(x,y,a);return blended;}
+  AD2 ALerpD2(AD2 x,AD2 y,AD2 a){AD2 blended=lerp(x,y,a);return blended;}
+  AD3 ALerpD3(AD3 x,AD3 y,AD3 a){AD3 blended=lerp(x,y,a);return blended;}
+  AD4 ALerpD4(AD4 x,AD4 y,AD4 a){AD4 blended=lerp(x,y,a);return blended;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Portable names for the HLSL rcp() intrinsic on double types.
+  AD1 ARcpD1(AD1 x){AD1 inv=rcp(x);return inv;}
+  AD2 ARcpD2(AD2 x){AD2 inv=rcp(x);return inv;}
+  AD3 ARcpD3(AD3 x){AD3 inv=rcp(x);return inv;}
+  AD4 ARcpD4(AD4 x){AD4 inv=rcp(x);return inv;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Portable names for the HLSL rsqrt() intrinsic on double types.
+  AD1 ARsqD1(AD1 x){AD1 invRoot=rsqrt(x);return invRoot;}
+  AD2 ARsqD2(AD2 x){AD2 invRoot=rsqrt(x);return invRoot;}
+  AD3 ARsqD3(AD3 x){AD3 invRoot=rsqrt(x);return invRoot;}
+  AD4 ARsqD4(AD4 x){AD4 invRoot=rsqrt(x);return invRoot;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Portable names for the HLSL saturate() intrinsic on double types.
+  AD1 ASatD1(AD1 x){AD1 sat=saturate(x);return sat;}
+  AD2 ASatD2(AD2 x){AD2 sat=saturate(x);return sat;}
+  AD3 ASatD3(AD3 x){AD3 sat=saturate(x);return sat;}
+  AD4 ASatD4(AD4 x){AD4 sat=saturate(x);return sat;}
+ #endif
+//==============================================================================================================================
+//                                                         HLSL WAVE
+//==============================================================================================================================
+ #ifdef A_WAVE
+  // Read 'v' from the lane whose index is this lane's index XOR 'x'.
+  // Where 'x' must be a compile time literal.
+  AF1 AWaveXorF1(AF1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AF2 AWaveXorF2(AF2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AF3 AWaveXorF3(AF3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AF4 AWaveXorF4(AF4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AU1 AWaveXorU1(AU1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  // Width-suffixed names, matching the F1..F4 naming used by the float variants.
+  AU2 AWaveXorU2(AU2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AU3 AWaveXorU3(AU3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AU4 AWaveXorU4(AU4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  // The vector variants were originally all declared under the name AWaveXorU1;
+  // those overloads are kept as forwards so existing callers keep compiling.
+  AU2 AWaveXorU1(AU2 v,AU1 x){return AWaveXorU2(v,x);}
+  AU3 AWaveXorU1(AU3 v,AU1 x){return AWaveXorU3(v,x);}
+  AU4 AWaveXorU1(AU4 v,AU1 x){return AWaveXorU4(v,x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  #ifdef A_HALF
+   // 16-bit variants: pack to 32-bit words, read from the lane at (index XOR 'x'), then unpack.
+   // Two-component values travel as one AU1; four-component values travel as an AU2.
+   AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(WaveReadLaneAt(AU1_AH2(v),WaveGetLaneIndex()^x));}
+   AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(WaveReadLaneAt(AU2_AH4(v),WaveGetLaneIndex()^x));}
+   AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(WaveReadLaneAt(AU1_AW2(v),WaveGetLaneIndex()^x));}
+   // Uses AU2_AW4/AW4_AU2 (the helpers defined for AW4), mirroring AWaveXorH4.
+   AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU2(WaveReadLaneAt(AU2_AW4(v),WaveGetLaneIndex()^x));}
+  #endif
+ #endif
+//==============================================================================================================================
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+//                                                          GPU COMMON
+//
+//
+//==============================================================================================================================
+#ifdef A_GPU
+ // Negative and positive infinity.
+ // Built by converting the 32-bit patterns 0x7f800000 (positive) and 0xff800000 (negative) via AF1_AU1.
+ #define A_INFP_F AF1_AU1(0x7f800000u)
+ #define A_INFN_F AF1_AU1(0xff800000u)
+//------------------------------------------------------------------------------------------------------------------------------
+ // Copy sign from 's' to positive 'd'.
+ // Isolate bit 31 of 's' and OR it into the bit pattern of 'd'.
+ AF1 ACpySgnF1(AF1 d,AF1 s){AU1 signBit=AU1_AF1(s)&AU1_(0x80000000u);return AF1_AU1(AU1_AF1(d)|signBit);}
+ AF2 ACpySgnF2(AF2 d,AF2 s){AU2 signBit=AU2_AF2(s)&AU2_(0x80000000u);return AF2_AU2(AU2_AF2(d)|signBit);}
+ AF3 ACpySgnF3(AF3 d,AF3 s){AU3 signBit=AU3_AF3(s)&AU3_(0x80000000u);return AF3_AU3(AU3_AF3(d)|signBit);}
+ AF4 ACpySgnF4(AF4 d,AF4 s){AU4 signBit=AU4_AF4(s)&AU4_(0x80000000u);return AF4_AU4(AU4_AF4(d)|signBit);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Single operation to return (useful to create a mask to use in lerp for branch free logic),
+ //  m=NaN := 0
+ //  m>=0  := 0
+ //  m<0   := 1
+ // Uses the following useful floating point logic,
+ //  saturate(+a*(-INF)==-INF) := 0
+ //  saturate( 0*(-INF)== NaN) := 0
+ //  saturate(-a*(-INF)==+INF) := 1
+ // Multiply by negative infinity, then saturate the product.
+ AF1 ASignedF1(AF1 m){AF1 scaled=m*AF1_(A_INFN_F);return ASatF1(scaled);}
+ AF2 ASignedF2(AF2 m){AF2 scaled=m*AF2_(A_INFN_F);return ASatF2(scaled);}
+ AF3 ASignedF3(AF3 m){AF3 scaled=m*AF3_(A_INFN_F);return ASatF3(scaled);}
+ AF4 ASignedF4(AF4 m){AF4 scaled=m*AF4_(A_INFN_F);return ASatF4(scaled);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Multiply by positive infinity, then saturate the product.
+ AF1 AGtZeroF1(AF1 m){AF1 scaled=m*AF1_(A_INFP_F);return ASatF1(scaled);}
+ AF2 AGtZeroF2(AF2 m){AF2 scaled=m*AF2_(A_INFP_F);return ASatF2(scaled);}
+ AF3 AGtZeroF3(AF3 m){AF3 scaled=m*AF3_(A_INFP_F);return ASatF3(scaled);}
+ AF4 AGtZeroF4(AF4 m){AF4 scaled=m*AF4_(A_INFP_F);return ASatF4(scaled);}
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Half-precision positive/negative infinity, built from the 16-bit patterns 0x7c00 and 0xfc00.
+  // With A_HLSL_6_2 the literal is cast to uint16_t before being handed to AH1_AW1.
+  #ifdef A_HLSL_6_2
+   #define A_INFP_H AH1_AW1((uint16_t)0x7c00u)
+   #define A_INFN_H AH1_AW1((uint16_t)0xfc00u)
+  #else
+   #define A_INFP_H AH1_AW1(0x7c00u)
+   #define A_INFN_H AH1_AW1(0xfc00u)
+  #endif
+
+//------------------------------------------------------------------------------------------------------------------------------
+  // Isolate bit 15 of 's' and OR it into the bit pattern of 'd'.
+  AH1 ACpySgnH1(AH1 d,AH1 s){AW1 signBit=AW1_AH1(s)&AW1_(0x8000u);return AH1_AW1(AW1_AH1(d)|signBit);}
+  AH2 ACpySgnH2(AH2 d,AH2 s){AW2 signBit=AW2_AH2(s)&AW2_(0x8000u);return AH2_AW2(AW2_AH2(d)|signBit);}
+  AH3 ACpySgnH3(AH3 d,AH3 s){AW3 signBit=AW3_AH3(s)&AW3_(0x8000u);return AH3_AW3(AW3_AH3(d)|signBit);}
+  AH4 ACpySgnH4(AH4 d,AH4 s){AW4 signBit=AW4_AH4(s)&AW4_(0x8000u);return AH4_AW4(AW4_AH4(d)|signBit);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Multiply by half-precision negative infinity, then saturate the product.
+  AH1 ASignedH1(AH1 m){AH1 scaled=m*AH1_(A_INFN_H);return ASatH1(scaled);}
+  AH2 ASignedH2(AH2 m){AH2 scaled=m*AH2_(A_INFN_H);return ASatH2(scaled);}
+  AH3 ASignedH3(AH3 m){AH3 scaled=m*AH3_(A_INFN_H);return ASatH3(scaled);}
+  AH4 ASignedH4(AH4 m){AH4 scaled=m*AH4_(A_INFN_H);return ASatH4(scaled);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Multiply by half-precision positive infinity, then saturate the product.
+  AH1 AGtZeroH1(AH1 m){AH1 scaled=m*AH1_(A_INFP_H);return ASatH1(scaled);}
+  AH2 AGtZeroH2(AH2 m){AH2 scaled=m*AH2_(A_INFP_H);return ASatH2(scaled);}
+  AH3 AGtZeroH3(AH3 m){AH3 scaled=m*AH3_(A_INFP_H);return ASatH3(scaled);}
+  AH4 AGtZeroH4(AH4 m){AH4 scaled=m*AH4_(A_INFP_H);return ASatH4(scaled);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                [FIS] FLOAT INTEGER SORTABLE
+//------------------------------------------------------------------------------------------------------------------------------
+// Float to integer sortable.
+//  - If sign bit=0, flip the sign bit (positives).
+//  - If sign bit=1, flip all bits     (negatives).
+// Integer sortable to float.
+//  - If sign bit=1, flip the sign bit (positives).
+//  - If sign bit=0, flip all bits     (negatives).
+// Has nice side effects.
+//  - Larger integers are more positive values.
+//  - Float zero is mapped to center of integers (so clear to integer zero is a nice default for atomic max usage).
+// Burns 3 ops for conversion {shift,or,xor}.
+//==============================================================================================================================
+ // Build the XOR mask from the signed shift of 'x' by 31 (inverted for the From direction),
+ // always including bit 31, then apply it to 'x'.
+ AU1 AFisToU1(AU1 x){AU1 flipMask=AShrSU1(x,AU1_(31))|AU1_(0x80000000);return x^flipMask;}
+ AU1 AFisFromU1(AU1 x){AU1 flipMask=(~AShrSU1(x,AU1_(31)))|AU1_(0x80000000);return x^flipMask;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Just adjust high 16-bit value (useful when upper part of 32-bit word is a 16-bit float value).
+ // Same XOR-mask construction as AFisToU1/AFisFromU1, but the signed shift is by 15 instead of 31.
+ // NOTE(review): a shift by 15 leaves the upper bits of 'x' in the low 17 bits of the mask, so the XOR
+ // appears to modify the low half as well — confirm whether the mask was meant to be limited to the high 16 bits.
+ AU1 AFisToHiU1(AU1 x){return x^(( AShrSU1(x,AU1_(15)))|AU1_(0x80000000));}
+ AU1 AFisFromHiU1(AU1 x){return x^((~AShrSU1(x,AU1_(15)))|AU1_(0x80000000));}
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_HALF
+  // 16-bit forms: build the XOR mask from the signed shift of 'x' by 15 (inverted for the From
+  // direction), always including bit 15, then apply it to 'x'.
+  AW1 AFisToW1(AW1 x){AW1 flipMask=AShrSW1(x,AW1_(15))|AW1_(0x8000);return x^flipMask;}
+  AW1 AFisFromW1(AW1 x){AW1 flipMask=(~AShrSW1(x,AW1_(15)))|AW1_(0x8000);return x^flipMask;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW2 AFisToW2(AW2 x){AW2 flipMask=AShrSW2(x,AW2_(15))|AW2_(0x8000);return x^flipMask;}
+  AW2 AFisFromW2(AW2 x){AW2 flipMask=(~AShrSW2(x,AW2_(15)))|AW2_(0x8000);return x^flipMask;}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                      [PERM] V_PERM_B32
+//------------------------------------------------------------------------------------------------------------------------------
+// Support for V_PERM_B32 started in the 3rd generation of GCN.
+//------------------------------------------------------------------------------------------------------------------------------
+// yyyyxxxx - The 'i' input.
+// 76543210
+// ========
+// HGFEDCBA - Naming on permutation.
+//------------------------------------------------------------------------------------------------------------------------------
+// TODO
+// ====
+//  - Make sure compiler optimizes this.
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Result byte 0 comes from one byte of i.x and result byte 2 from one byte of i.y; bytes 1 and 3 are zero.
+  AU1 APerm0E0A(AU2 i){AU1 lo=(i.x    )&0xffu;AU1 hi=(i.y<<16)&0xff0000u;return lo|hi;}
+  AU1 APerm0F0B(AU2 i){AU1 lo=(i.x>> 8)&0xffu;AU1 hi=(i.y<< 8)&0xff0000u;return lo|hi;}
+  AU1 APerm0G0C(AU2 i){AU1 lo=(i.x>>16)&0xffu;AU1 hi=(i.y    )&0xff0000u;return lo|hi;}
+  AU1 APerm0H0D(AU2 i){AU1 lo=(i.x>>24)&0xffu;AU1 hi=(i.y>> 8)&0xff0000u;return lo|hi;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Keep three bytes of i.y in place and insert one byte taken from i.x into the remaining slot.
+  AU1 APermHGFA(AU2 i){AU1 inserted=(i.x    )&0x000000ffu;AU1 kept=i.y&0xffffff00u;return inserted|kept;}
+  AU1 APermHGFC(AU2 i){AU1 inserted=(i.x>>16)&0x000000ffu;AU1 kept=i.y&0xffffff00u;return inserted|kept;}
+  AU1 APermHGAE(AU2 i){AU1 inserted=(i.x<< 8)&0x0000ff00u;AU1 kept=i.y&0xffff00ffu;return inserted|kept;}
+  AU1 APermHGCE(AU2 i){AU1 inserted=(i.x>> 8)&0x0000ff00u;AU1 kept=i.y&0xffff00ffu;return inserted|kept;}
+  AU1 APermHAFE(AU2 i){AU1 inserted=(i.x<<16)&0x00ff0000u;AU1 kept=i.y&0xff00ffffu;return inserted|kept;}
+  AU1 APermHCFE(AU2 i){AU1 inserted=(i.x    )&0x00ff0000u;AU1 kept=i.y&0xff00ffffu;return inserted|kept;}
+  AU1 APermAGFE(AU2 i){AU1 inserted=(i.x<<24)&0xff000000u;AU1 kept=i.y&0x00ffffffu;return inserted|kept;}
+  AU1 APermCGFE(AU2 i){AU1 inserted=(i.x<< 8)&0xff000000u;AU1 kept=i.y&0x00ffffffu;return inserted|kept;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Interleave bytes 0 and 2 of i.x with bytes 0 and 2 of i.y (i.y bytes move up to the odd slots).
+  AU1 APermGCEA(AU2 i){AU1 evenBytes=(i.x)&0x00ff00ffu;AU1 oddBytes=(i.y<<8)&0xff00ff00u;return evenBytes|oddBytes;}
+  // Bytes 0 and 2 of i.x land in result bytes 0 and 1; bytes 0 and 2 of i.y land in result bytes 2 and 3.
+  AU1 APermGECA(AU2 i){
+   AU1 a=(i.x)&0xffu;AU1 c=(i.x>>8)&0xff00u;
+   AU1 e=(i.y<<16)&0xff0000u;AU1 g=(i.y<<8)&0xff000000u;
+   return(a|c|e|g);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                               [BUC] BYTE UNSIGNED CONVERSION
+//------------------------------------------------------------------------------------------------------------------------------
+// Designed to use the optimal conversion, enables the scaling to possibly be factored into other computation.
+// Works on a range of {0 to A_BUC_<32,16>}, for <32-bit, and 16-bit> respectively.
+//------------------------------------------------------------------------------------------------------------------------------
+// OPCODE NOTES
+// ============
+// GCN does not do UNORM or SNORM for bytes in opcodes.
+//  - V_CVT_F32_UBYTE{0,1,2,3} - Unsigned byte to float.
+//  - V_CVT_PKACCUM_U8_F32 - Float to unsigned byte (does bit-field insert into 32-bit integer).
+// V_PERM_B32 does byte packing with ability to zero fill bytes as well.
+//  - Can pull out byte values from two sources, and zero fill upper 8-bits of packed hi and lo. 
+//------------------------------------------------------------------------------------------------------------------------------
+// BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U1() - Designed for V_CVT_F32_UBYTE* and V_CVT_PKACCUM_U8_F32 ops.
+// ====   =====
+//    0 : 0
+//    1 : 1
+//     ...
+//  255 : 255
+//      : 256 (just outside the encoding range)
+//------------------------------------------------------------------------------------------------------------------------------
+// BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32.
+// ====   =====
+//    0 : 0
+//    1 : 1/512
+//    2 : 1/256
+//     ...
+//   64 : 1/8
+//  128 : 1/4
+//  255 : 255/512
+//      : 1/2 (just outside the encoding range)
+//------------------------------------------------------------------------------------------------------------------------------
+// OPTIMAL IMPLEMENTATIONS ON AMD ARCHITECTURES
+// ============================================
+// r=ABuc0FromU1(i)
+//   V_CVT_F32_UBYTE0 r,i
+// --------------------------------------------
+// r=ABuc0ToU1(d,i)
+//   V_CVT_PKACCUM_U8_F32 r,i,0,d
+// --------------------------------------------
+// d=ABuc0FromU2(i)
+//   Where 'k0' is an SGPR with 0x0E0A
+//   Where 'k1' is an SGPR with {32768.0} packed into the lower 16-bits
+//   V_PERM_B32 d,i.x,i.y,k0
+//   V_PK_FMA_F16 d,d,k1.x,0
+// --------------------------------------------
+// r=ABuc0ToU2(d,i)
+//   Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits
+//   Where 'k1' is an SGPR with 0x????
+//   Where 'k2' is an SGPR with 0x????
+//   V_PK_FMA_F16 i,i,k0.x,0
+//   V_PERM_B32 r.x,i,i,k1
+//   V_PERM_B32 r.y,i,i,k2
+//==============================================================================================================================
+ // Peak range for 32-bit and 16-bit operations.
+ // A_BUC_32: largest value of the 32-bit (U1) byte encoding, where byte 255 decodes to 255.0.
+ // A_BUC_16: largest value of the 16-bit (U2) byte encoding, where byte 255 decodes to 255/512.
+ #define A_BUC_32 (255.0)
+ #define A_BUC_16 (255.0/512.0)
+//==============================================================================================================================
+ #if 1
+  // Designed to be one V_CVT_PKACCUM_U8_F32.
+  // The extra min is required to pattern match to V_CVT_PKACCUM_U8_F32.
+  // ABuc{0,1,2,3}ToU1(d,i): converts float 'i' to an unsigned integer, clamps it to 255, and writes it into byte
+  // {0,1,2,3} of 'd'. The other three bytes of 'd' are returned unchanged.
+  // NOTE(review): there is no lower clamp before AU1(i), so the result for negative 'i' depends on the target's
+  // float-to-unsigned conversion - confirm callers only pass non-negative values.
+  AU1 ABuc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i),255u)    )&(0x000000ffu));}
+  AU1 ABuc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i),255u)<< 8)&(0x0000ff00u));}
+  AU1 ABuc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i),255u)<<16)&(0x00ff0000u));}
+  AU1 ABuc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i),255u)<<24)&(0xff000000u));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Designed to be one V_CVT_F32_UBYTE*.
+  // ABuc{0,1,2,3}FromU1(i): extracts byte {0,1,2,3} of 'i' and returns it as a float in {0 to 255}.
+  AF1 ABuc0FromU1(AU1 i){return AF1((i    )&255u);}
+  AF1 ABuc1FromU1(AU1 i){return AF1((i>> 8)&255u);}
+  AF1 ABuc2FromU1(AU1 i){return AF1((i>>16)&255u);}
+  AF1 ABuc3FromU1(AU1 i){return AF1((i>>24)&255u);}
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}.
+  // Both inputs are scaled by 1/32768 before their bit patterns are byte-permuted with APermGCEA().
+  // NOTE(review): the scale presumably pushes each value into the FP16 denormal range so that the low byte of its bit
+  // pattern is the encoded byte (the "16-bit denormal tricks" of the section header) - confirm against APerm*().
+  AW2 ABuc01ToW2(AH2 x,AH2 y){x*=AH2_(1.0/32768.0);y*=AH2_(1.0/32768.0);
+   return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Designed for 3 ops to do SOA to AOS and conversion.
+  // ABuc{0,1,2,3}ToU2(d,i): 'b' is the 32-bit pattern of the two scaled halves of 'i'. One APerm*() call per output
+  // word then combines 'b' with d.x and with d.y.
+  // NOTE(review): presumably i.x lands in byte {0,1,2,3} of d.x and i.y in the same byte of d.y, with the other bytes
+  // of 'd' preserved - the APerm*() definitions are outside this view, confirm there.
+  AU2 ABuc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));}
+  AU2 ABuc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));}
+  AU2 ABuc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));}
+  AU2 ABuc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Designed for 2 ops to do both AOS to SOA, and conversion.
+  // ABuc{0,1,2,3}FromU2(i): the word returned by APerm0?0?() is reinterpreted as two halves and multiplied by 32768,
+  // the inverse of the 1/32768 scale applied by the ToU2 functions.
+  AH2 ABuc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0);}
+  AH2 ABuc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0);}
+  AH2 ABuc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0);}
+  AH2 ABuc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                 [BSC] BYTE SIGNED CONVERSION
+//------------------------------------------------------------------------------------------------------------------------------
+// Similar to [BUC].
+// Works on a range of {-/+ A_BSC_<32,16>}, for <32-bit, and 16-bit> respectively.
+//------------------------------------------------------------------------------------------------------------------------------
+// ENCODING (without zero-based encoding)
+// ========
+//   0 = unused (can be used to mean something else)
+//   1 = lowest value 
+// 128 = exact zero center (stored as 0 in the zero-based encoding)
+// 255 = highest value
+//------------------------------------------------------------------------------------------------------------------------------
+// Zero-based [Zb] flips the MSB bit of the byte (making 128 "exact zero" actually zero).
+// This is useful if there is a desire for cleared values to decode as zero.
+//------------------------------------------------------------------------------------------------------------------------------
+// BYTE : FLOAT - ABsc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32.
+// ====   =====
+//    0 : -127/512 (unused)
+//    1 : -126/512
+//    2 : -125/512
+//     ...
+//  128 : 0 
+//     ... 
+//  255 : 127/512
+//      : 1/4 (just outside the encoding range)
+//==============================================================================================================================
+ // Peak range for 32-bit and 16-bit operations.
+ // A_BSC_32: magnitude limit of the 32-bit (U1) signed byte encoding, where bytes 1 and 255 decode to -127 and +127.
+ // A_BSC_16: magnitude limit of the 16-bit (U2) signed byte encoding, where byte 255 decodes to 127/512.
+ #define A_BSC_32 (127.0)
+ #define A_BSC_16 (127.0/512.0)
+//==============================================================================================================================
+ #if 1
+  // ABsc{0,1,2,3}ToU1(d,i): biases float 'i' by +128, converts to unsigned, clamps to 255, and writes the result into
+  // byte {0,1,2,3} of 'd'. The other three bytes of 'd' are returned unchanged.
+  // NOTE(review): there is no lower clamp, so inputs below -128 reach AU1() as negative floats - confirm callers
+  // stay inside {-/+ A_BSC_32}.
+  AU1 ABsc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i+128.0),255u)    )&(0x000000ffu));}
+  AU1 ABsc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i+128.0),255u)<< 8)&(0x0000ff00u));}
+  AU1 ABsc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i+128.0),255u)<<16)&(0x00ff0000u));}
+  AU1 ABsc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i+128.0),255u)<<24)&(0xff000000u));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Zero-based variants: same insert as above, then the final XOR flips the MSB of the written byte so that an
+  // encoded zero is stored as 0x00.
+  // NOTE(review): these truncate 'i' before adding the bias, whereas ABsc*ToU1() adds the bias first; the two give
+  // different bytes for negative fractional inputs (e.g. -0.5) - confirm which rounding is intended.
+  AU1 ABsc0ToZbU1(AU1 d,AF1 i){return ((d&0xffffff00u)|((min(AU1(trunc(i)+128.0),255u)    )&(0x000000ffu)))^0x00000080u;}
+  AU1 ABsc1ToZbU1(AU1 d,AF1 i){return ((d&0xffff00ffu)|((min(AU1(trunc(i)+128.0),255u)<< 8)&(0x0000ff00u)))^0x00008000u;}
+  AU1 ABsc2ToZbU1(AU1 d,AF1 i){return ((d&0xff00ffffu)|((min(AU1(trunc(i)+128.0),255u)<<16)&(0x00ff0000u)))^0x00800000u;}
+  AU1 ABsc3ToZbU1(AU1 d,AF1 i){return ((d&0x00ffffffu)|((min(AU1(trunc(i)+128.0),255u)<<24)&(0xff000000u)))^0x80000000u;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // ABsc{0,1,2,3}FromU1(i): extracts byte {0,1,2,3} of 'i', converts it to float and removes the +128 bias.
+  AF1 ABsc0FromU1(AU1 i){return AF1((i    )&255u)-128.0;}
+  AF1 ABsc1FromU1(AU1 i){return AF1((i>> 8)&255u)-128.0;}
+  AF1 ABsc2FromU1(AU1 i){return AF1((i>>16)&255u)-128.0;}
+  AF1 ABsc3FromU1(AU1 i){return AF1((i>>24)&255u)-128.0;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Zero-based decode: the extracted byte has its MSB flipped back before the conversion and bias removal.
+  AF1 ABsc0FromZbU1(AU1 i){return AF1(((i    )&255u)^0x80u)-128.0;}
+  AF1 ABsc1FromZbU1(AU1 i){return AF1(((i>> 8)&255u)^0x80u)-128.0;}
+  AF1 ABsc2FromZbU1(AU1 i){return AF1(((i>>16)&255u)^0x80u)-128.0;}
+  AF1 ABsc3FromZbU1(AU1 i){return AF1(((i>>24)&255u)^0x80u)-128.0;}
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}.
+  // Each input is scaled by 1/32768 and offset by 0.25/32768 before the byte permute; 0.25 equals 128/512, i.e. the
+  // "exact zero" byte 128 in the encoding table above.
+  AW2 ABsc01ToW2(AH2 x,AH2 y){x=x*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);y=y*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);
+   return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // ABsc{0,1,2,3}ToU2(d,i): 'b' is the 32-bit pattern of the two scaled and biased halves of 'i'. One APerm*() call
+  // per output word then combines 'b' with d.x and with d.y.
+  // NOTE(review): the byte placement is decided by the APerm*() helpers, which are outside this view - confirm there.
+  AU2 ABsc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));}
+  AU2 ABsc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));}
+  AU2 ABsc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));}
+  AU2 ABsc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Zero-based variants: identical to ABsc*ToU2() except that 'b' is XORed with 0x00800080, which flips bit 7 in each
+  // 16-bit half of the word before the permute.
+  AU2 ABsc0ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
+   return AU2(APermHGFA(AU2(d.x,b)),APermHGFC(AU2(d.y,b)));}
+  AU2 ABsc1ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
+   return AU2(APermHGAE(AU2(d.x,b)),APermHGCE(AU2(d.y,b)));}
+  AU2 ABsc2ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
+   return AU2(APermHAFE(AU2(d.x,b)),APermHCFE(AU2(d.y,b)));}
+  AU2 ABsc3ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
+   return AU2(APermAGFE(AU2(d.x,b)),APermCGFE(AU2(d.y,b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // ABsc{0,1,2,3}FromU2(i): the permuted word is reinterpreted as two halves, scaled by 32768 and the 0.25 bias is
+  // removed, undoing the scale and offset applied by ABsc*ToU2().
+  AH2 ABsc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0)-AH2_(0.25);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Zero-based decode: bit 7 of each 16-bit half is flipped back (XOR 0x00800080) before the scale and bias removal.
+  AH2 ABsc0FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc1FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc2FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc3FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     HALF APPROXIMATIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// These support only positive inputs.
+// Did not see value yet in specialization for range.
+// Using quick testing, ended up mostly getting the same "best" approximation for various ranges.
+// With hardware that can co-execute transcendentals, the value in approximations could be less than expected.
+// However from a latency perspective, if execution of a transcendental is 4 clk, with no packed support, -> 8 clk total.
+// And co-execution would require a compiler interleaving a lot of independent work for packed usage.
+//------------------------------------------------------------------------------------------------------------------------------
+// The one Newton Raphson iteration form of rsq() was skipped (requires 6 ops total).
+// Same with sqrt(), as this could be x*rsq() (7 ops).
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Minimize squared error across full positive range, 2 ops.
+  // The 0x1de2 based approximation maps {0 to 1} input to < 1 output.
+  // Works on the raw 16-bit pattern: shift right by one, then add the constant.
+  AH1 APrxLoSqrtH1(AH1 a){return AH1_AW1((AW1_AH1(a)>>AW1_(1))+AW1_(0x1de2));}
+  AH2 APrxLoSqrtH2(AH2 a){return AH2_AW2((AW2_AH2(a)>>AW2_(1))+AW2_(0x1de2));}
+  AH3 APrxLoSqrtH3(AH3 a){return AH3_AW3((AW3_AH3(a)>>AW3_(1))+AW3_(0x1de2));}
+  AH4 APrxLoSqrtH4(AH4 a){return AH4_AW4((AW4_AH4(a)>>AW4_(1))+AW4_(0x1de2));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Lower precision estimation, 1 op.
+  // Minimize squared error across {smallest normal to 16384.0}.
+  // Works on the raw 16-bit pattern: constant minus the input bits.
+  AH1 APrxLoRcpH1(AH1 a){return AH1_AW1(AW1_(0x7784)-AW1_AH1(a));}
+  AH2 APrxLoRcpH2(AH2 a){return AH2_AW2(AW2_(0x7784)-AW2_AH2(a));}
+  AH3 APrxLoRcpH3(AH3 a){return AH3_AW3(AW3_(0x7784)-AW3_AH3(a));}
+  AH4 APrxLoRcpH4(AH4 a){return AH4_AW4(AW4_(0x7784)-AW4_AH4(a));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Medium precision estimation, one Newton Raphson iteration, 3 ops.
+  // 'b' is the bit-trick first guess (its constant 0x778d differs from the 0x7784 used by APrxLoRcp*);
+  // b*(2-b*a) is the refinement step.
+  AH1 APrxMedRcpH1(AH1 a){AH1 b=AH1_AW1(AW1_(0x778d)-AW1_AH1(a));return b*(-b*a+AH1_(2.0));}
+  AH2 APrxMedRcpH2(AH2 a){AH2 b=AH2_AW2(AW2_(0x778d)-AW2_AH2(a));return b*(-b*a+AH2_(2.0));}
+  AH3 APrxMedRcpH3(AH3 a){AH3 b=AH3_AW3(AW3_(0x778d)-AW3_AH3(a));return b*(-b*a+AH3_(2.0));}
+  AH4 APrxMedRcpH4(AH4 a){AH4 b=AH4_AW4(AW4_(0x778d)-AW4_AH4(a));return b*(-b*a+AH4_(2.0));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Minimize squared error across {smallest normal to 16384.0}, 2 ops.
+  // Works on the raw 16-bit pattern: constant minus the input bits shifted right by one.
+  AH1 APrxLoRsqH1(AH1 a){return AH1_AW1(AW1_(0x59a3)-(AW1_AH1(a)>>AW1_(1)));}
+  AH2 APrxLoRsqH2(AH2 a){return AH2_AW2(AW2_(0x59a3)-(AW2_AH2(a)>>AW2_(1)));}
+  AH3 APrxLoRsqH3(AH3 a){return AH3_AW3(AW3_(0x59a3)-(AW3_AH3(a)>>AW3_(1)));}
+  AH4 APrxLoRsqH4(AH4 a){return AH4_AW4(AW4_(0x59a3)-(AW4_AH4(a)>>AW4_(1)));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                    FLOAT APPROXIMATIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// Michal Drobot has an excellent presentation on these: "Low Level Optimizations For GCN",
+//  - Idea dates back to SGI, then to Quake 3, etc.
+//  - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf
+//     - sqrt(x)=rsqrt(x)*x
+//     - rcp(x)=rsqrt(x)*rsqrt(x) for positive x
+//  - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h
+//------------------------------------------------------------------------------------------------------------------------------
+// These below are from perhaps less complete searching for optimal.
+// Used FP16 normal range for testing with +4096 32-bit step size for sampling error.
+// So these match up well with the half approximations.
+//==============================================================================================================================
+ // Every routine works on the raw 32-bit pattern of the float: an integer shift/add/subtract against a tuned constant.
+ // The MedRcp forms then refine the bit-trick estimate with one est*(2-est*a) step.
+ AF1 APrxLoSqrtF1(AF1 a){AU1 halved=AU1_AF1(a)>>AU1_(1);return AF1_AU1(halved+AU1_(0x1fbc4639));}
+ AF1 APrxLoRcpF1(AF1 a){AU1 guess=AU1_(0x7ef07ebb)-AU1_AF1(a);return AF1_AU1(guess);}
+ AF1 APrxMedRcpF1(AF1 a){AF1 est=AF1_AU1(AU1_(0x7ef19fff)-AU1_AF1(a));AF1 corr=-est*a+AF1_(2.0);return est*corr;}
+ AF1 APrxLoRsqF1(AF1 a){AU1 halved=AU1_AF1(a)>>AU1_(1);return AF1_AU1(AU1_(0x5f347d74)-halved);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 APrxLoSqrtF2(AF2 a){AU2 halved=AU2_AF2(a)>>AU2_(1);return AF2_AU2(halved+AU2_(0x1fbc4639));}
+ AF2 APrxLoRcpF2(AF2 a){AU2 guess=AU2_(0x7ef07ebb)-AU2_AF2(a);return AF2_AU2(guess);}
+ AF2 APrxMedRcpF2(AF2 a){AF2 est=AF2_AU2(AU2_(0x7ef19fff)-AU2_AF2(a));AF2 corr=-est*a+AF2_(2.0);return est*corr;}
+ AF2 APrxLoRsqF2(AF2 a){AU2 halved=AU2_AF2(a)>>AU2_(1);return AF2_AU2(AU2_(0x5f347d74)-halved);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF3 APrxLoSqrtF3(AF3 a){AU3 halved=AU3_AF3(a)>>AU3_(1);return AF3_AU3(halved+AU3_(0x1fbc4639));}
+ AF3 APrxLoRcpF3(AF3 a){AU3 guess=AU3_(0x7ef07ebb)-AU3_AF3(a);return AF3_AU3(guess);}
+ AF3 APrxMedRcpF3(AF3 a){AF3 est=AF3_AU3(AU3_(0x7ef19fff)-AU3_AF3(a));AF3 corr=-est*a+AF3_(2.0);return est*corr;}
+ AF3 APrxLoRsqF3(AF3 a){AU3 halved=AU3_AF3(a)>>AU3_(1);return AF3_AU3(AU3_(0x5f347d74)-halved);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF4 APrxLoSqrtF4(AF4 a){AU4 halved=AU4_AF4(a)>>AU4_(1);return AF4_AU4(halved+AU4_(0x1fbc4639));}
+ AF4 APrxLoRcpF4(AF4 a){AU4 guess=AU4_(0x7ef07ebb)-AU4_AF4(a);return AF4_AU4(guess);}
+ AF4 APrxMedRcpF4(AF4 a){AF4 est=AF4_AU4(AU4_(0x7ef19fff)-AU4_AF4(a));AF4 corr=-est*a+AF4_(2.0);return est*corr;}
+ AF4 APrxLoRsqF4(AF4 a){AU4 halved=AU4_AF4(a)>>AU4_(1);return AF4_AU4(AU4_(0x5f347d74)-halved);}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                    PQ APPROXIMATIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// PQ is very close to x^(1/8). The functions below use the fast float approximation method to do
+// PQ<~>Gamma2 (4th power and fast 4th root) and PQ<~>Linear (8th power and fast 8th root). Maximum error is ~0.2%.
+//==============================================================================================================================
+// Helpers
+ // Quart(a) returns a^4 and Oct(a) returns a^8, both built by repeated squaring.
+ AF1 Quart(AF1 a) { AF1 sq = a * a; return sq * sq; }
+ AF1 Oct(AF1 a) { AF1 sq = a * a; AF1 qd = sq * sq; return qd * qd; }
+ AF2 Quart(AF2 a) { AF2 sq = a * a; return sq * sq; }
+ AF2 Oct(AF2 a) { AF2 sq = a * a; AF2 qd = sq * sq; return qd * qd; }
+ AF3 Quart(AF3 a) { AF3 sq = a * a; return sq * sq; }
+ AF3 Oct(AF3 a) { AF3 sq = a * a; AF3 qd = sq * sq; return qd * qd; }
+ AF4 Quart(AF4 a) { AF4 sq = a * a; return sq * sq; }
+ AF4 Oct(AF4 a) { AF4 sq = a * a; AF4 qd = sq * sq; return qd * qd; }
+ //------------------------------------------------------------------------------------------------------------------------------
+ // Scalar PQ conversions. Lo: bit trick only. Med: the Lo result refined by one Newton-Raphson step. High: native sqrt().
+ AF1 APrxPQToGamma2(AF1 a) { AF1 g2 = Quart(a); return g2; }
+ AF1 APrxPQToLinear(AF1 a) { AF1 lin = Oct(a); return lin; }
+ AF1 APrxLoGamma2ToPQ(AF1 a) { AU1 q = AU1_AF1(a) >> AU1_(2); return AF1_AU1(q + AU1_(0x2F9A4E46)); }
+ AF1 APrxMedGamma2ToPQ(AF1 a) { AF1 seed = APrxLoGamma2ToPQ(a); AF1 seed4 = Quart(seed); AF1 err = seed4 - a; return seed - seed * err / (AF1_(4.0) * seed4); }
+ AF1 APrxHighGamma2ToPQ(AF1 a) { AF1 r = sqrt(a); return sqrt(r); }
+ AF1 APrxLoLinearToPQ(AF1 a) { AU1 e = AU1_AF1(a) >> AU1_(3); return AF1_AU1(e + AU1_(0x378D8723)); }
+ AF1 APrxMedLinearToPQ(AF1 a) { AF1 seed = APrxLoLinearToPQ(a); AF1 seed8 = Oct(seed); AF1 err = seed8 - a; return seed - seed * err / (AF1_(8.0) * seed8); }
+ AF1 APrxHighLinearToPQ(AF1 a) { AF1 r = sqrt(a); r = sqrt(r); return sqrt(r); }
+ //------------------------------------------------------------------------------------------------------------------------------
+ // Two-component PQ conversions; the scalar AF1_() constants in the Med forms broadcast across both lanes.
+ AF2 APrxPQToGamma2(AF2 a) { AF2 g2 = Quart(a); return g2; }
+ AF2 APrxPQToLinear(AF2 a) { AF2 lin = Oct(a); return lin; }
+ AF2 APrxLoGamma2ToPQ(AF2 a) { AU2 q = AU2_AF2(a) >> AU2_(2); return AF2_AU2(q + AU2_(0x2F9A4E46)); }
+ AF2 APrxMedGamma2ToPQ(AF2 a) { AF2 seed = APrxLoGamma2ToPQ(a); AF2 seed4 = Quart(seed); AF2 err = seed4 - a; return seed - seed * err / (AF1_(4.0) * seed4); }
+ AF2 APrxHighGamma2ToPQ(AF2 a) { AF2 r = sqrt(a); return sqrt(r); }
+ AF2 APrxLoLinearToPQ(AF2 a) { AU2 e = AU2_AF2(a) >> AU2_(3); return AF2_AU2(e + AU2_(0x378D8723)); }
+ AF2 APrxMedLinearToPQ(AF2 a) { AF2 seed = APrxLoLinearToPQ(a); AF2 seed8 = Oct(seed); AF2 err = seed8 - a; return seed - seed * err / (AF1_(8.0) * seed8); }
+ AF2 APrxHighLinearToPQ(AF2 a) { AF2 r = sqrt(a); r = sqrt(r); return sqrt(r); }
+ //------------------------------------------------------------------------------------------------------------------------------
+ // Three-component PQ conversions; the scalar AF1_() constants in the Med forms broadcast across all lanes.
+ AF3 APrxPQToGamma2(AF3 a) { AF3 g2 = Quart(a); return g2; }
+ AF3 APrxPQToLinear(AF3 a) { AF3 lin = Oct(a); return lin; }
+ AF3 APrxLoGamma2ToPQ(AF3 a) { AU3 q = AU3_AF3(a) >> AU3_(2); return AF3_AU3(q + AU3_(0x2F9A4E46)); }
+ AF3 APrxMedGamma2ToPQ(AF3 a) { AF3 seed = APrxLoGamma2ToPQ(a); AF3 seed4 = Quart(seed); AF3 err = seed4 - a; return seed - seed * err / (AF1_(4.0) * seed4); }
+ AF3 APrxHighGamma2ToPQ(AF3 a) { AF3 r = sqrt(a); return sqrt(r); }
+ AF3 APrxLoLinearToPQ(AF3 a) { AU3 e = AU3_AF3(a) >> AU3_(3); return AF3_AU3(e + AU3_(0x378D8723)); }
+ AF3 APrxMedLinearToPQ(AF3 a) { AF3 seed = APrxLoLinearToPQ(a); AF3 seed8 = Oct(seed); AF3 err = seed8 - a; return seed - seed * err / (AF1_(8.0) * seed8); }
+ AF3 APrxHighLinearToPQ(AF3 a) { AF3 r = sqrt(a); r = sqrt(r); return sqrt(r); }
+ //------------------------------------------------------------------------------------------------------------------------------
+ // Four-component PQ conversions; the scalar AF1_() constants in the Med forms broadcast across all lanes.
+ AF4 APrxPQToGamma2(AF4 a) { AF4 g2 = Quart(a); return g2; }
+ AF4 APrxPQToLinear(AF4 a) { AF4 lin = Oct(a); return lin; }
+ AF4 APrxLoGamma2ToPQ(AF4 a) { AU4 q = AU4_AF4(a) >> AU4_(2); return AF4_AU4(q + AU4_(0x2F9A4E46)); }
+ AF4 APrxMedGamma2ToPQ(AF4 a) { AF4 seed = APrxLoGamma2ToPQ(a); AF4 seed4 = Quart(seed); AF4 err = seed4 - a; return seed - seed * err / (AF1_(4.0) * seed4); }
+ AF4 APrxHighGamma2ToPQ(AF4 a) { AF4 r = sqrt(a); return sqrt(r); }
+ AF4 APrxLoLinearToPQ(AF4 a) { AU4 e = AU4_AF4(a) >> AU4_(3); return AF4_AU4(e + AU4_(0x378D8723)); }
+ AF4 APrxMedLinearToPQ(AF4 a) { AF4 seed = APrxLoLinearToPQ(a); AF4 seed8 = Oct(seed); AF4 err = seed8 - a; return seed - seed * err / (AF1_(8.0) * seed8); }
+ AF4 APrxHighLinearToPQ(AF4 a) { AF4 r = sqrt(a); r = sqrt(r); return sqrt(r); }
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                    PARABOLIC SIN & COS
+//------------------------------------------------------------------------------------------------------------------------------
+// Approximate answers to transcendental questions.
+//------------------------------------------------------------------------------------------------------------------------------
+//==============================================================================================================================
+ #if 1
+  // Valid input range is {-1 to 1} representing {0 to 2 pi}.
+  // Output range is {-1/4 to 1/4} representing {-1 to 1}.
+  AF1 APSinF1(AF1 x){AF1 mag=abs(x);return x*mag-x;} // MAD.
+  AF2 APSinF2(AF2 x){AF2 mag=abs(x);return x*mag-x;}
+  // Cosine reuses the sine parabola: the input is offset and wrapped with fract(), then remapped to {-1 to 1}.
+  AF1 APCosF1(AF1 x){AF1 wrapped=AFractF1(x*AF1_(0.5)+AF1_(0.75));AF1 phase=wrapped*AF1_(2.0)-AF1_(1.0);return APSinF1(phase);} // 3x MAD, FRACT
+  AF2 APCosF2(AF2 x){AF2 wrapped=AFractF2(x*AF2_(0.5)+AF2_(0.75));AF2 phase=wrapped*AF2_(2.0)-AF2_(1.0);return APSinF2(phase);}
+  // Returns {sin,cos} in .xy for one scalar input by evaluating the packed sine on {x, shifted x}.
+  AF2 APSinCosF1(AF1 x){AF1 wrapped=AFractF1(x*AF1_(0.5)+AF1_(0.75));AF1 phase=wrapped*AF1_(2.0)-AF1_(1.0);AF2 both=AF2(x,phase);return APSinF2(both);}
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_HALF
+  // For a packed {sin,cos} pair,
+  //  - Native takes 16 clocks and 4 issue slots (no packed transcendentals).
+  //  - Parabolic takes 8 clocks and 8 issue slots (only fract is non-packed).
+  AH1 APSinH1(AH1 x){AH1 mag=abs(x);return x*mag-x;}
+  AH2 APSinH2(AH2 x){AH2 mag=abs(x);return x*mag-x;} // AND,FMA
+  // Cosine reuses the sine parabola: the input is offset and wrapped with fract(), then remapped to {-1 to 1}.
+  AH1 APCosH1(AH1 x){AH1 wrapped=AFractH1(x*AH1_(0.5)+AH1_(0.75));AH1 phase=wrapped*AH1_(2.0)-AH1_(1.0);return APSinH1(phase);}
+  AH2 APCosH2(AH2 x){AH2 wrapped=AFractH2(x*AH2_(0.5)+AH2_(0.75));AH2 phase=wrapped*AH2_(2.0)-AH2_(1.0);return APSinH2(phase);} // 3x FMA, 2xFRACT, AND
+  // Returns {sin,cos} in .xy for one scalar input by evaluating the packed sine on {x, shifted x}.
+  AH2 APSinCosH1(AH1 x){AH1 wrapped=AFractH1(x*AH1_(0.5)+AH1_(0.75));AH1 phase=wrapped*AH1_(2.0)-AH1_(1.0);AH2 both=AH2(x,phase);return APSinH2(both);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     [ZOL] ZERO ONE LOGIC
+//------------------------------------------------------------------------------------------------------------------------------
+// Conditional free logic designed for easy 16-bit packing, and backwards porting to 32-bit.
+//------------------------------------------------------------------------------------------------------------------------------
+// 0 := false
+// 1 := true
+//------------------------------------------------------------------------------------------------------------------------------
+// AndNot(x,y)   -> !(x&y) .... One op.
+// AndOr(x,y,z)  -> (x&y)|z ... One op.
+// GtZero(x)     -> x>0.0 ..... One op.
+// Sel(x,y,z)    -> x?y:z ..... Two ops, has no precision loss.
+// Signed(x)     -> x<0.0 ..... One op.
+// ZeroPass(x,y) -> x?0:y ..... Two ops, 'y' is a pass through safe for aliasing as integer.
+//------------------------------------------------------------------------------------------------------------------------------
+// OPTIMIZATION NOTES
+// ==================
+// - On Vega to use 2 constants in a packed op, pass in as one AW2 or one AH2 'k.xy' and use as 'k.xx' and 'k.yy'.
+//   For example 'a.xy*k.xx+k.yy'.
+//==============================================================================================================================
+ #if 1
+  // Integer zero/one logic. With operands restricted to 0 and 1: min() is AND, XOR with 1 is NOT, max() is OR.
+  AU1 AZolAndU1(AU1 x,AU1 y){AU1 both=min(x,y);return both;}
+  AU2 AZolAndU2(AU2 x,AU2 y){AU2 both=min(x,y);return both;}
+  AU3 AZolAndU3(AU3 x,AU3 y){AU3 both=min(x,y);return both;}
+  AU4 AZolAndU4(AU4 x,AU4 y){AU4 both=min(x,y);return both;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AU1 AZolNotU1(AU1 x){AU1 one=AU1_(1);return x^one;}
+  AU2 AZolNotU2(AU2 x){AU2 one=AU2_(1);return x^one;}
+  AU3 AZolNotU3(AU3 x){AU3 one=AU3_(1);return x^one;}
+  AU4 AZolNotU4(AU4 x){AU4 one=AU4_(1);return x^one;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AU1 AZolOrU1(AU1 x,AU1 y){AU1 either=max(x,y);return either;}
+  AU2 AZolOrU2(AU2 x,AU2 y){AU2 either=max(x,y);return either;}
+  AU3 AZolOrU3(AU3 x,AU3 y){AU3 either=max(x,y);return either;}
+  AU4 AZolOrU4(AU4 x,AU4 y){AU4 either=max(x,y);return either;}
+//==============================================================================================================================
+  // Float 0.0/1.0 to integer 0/1 by value conversion.
+  AU1 AZolF1ToU1(AF1 x){AU1 asInt=AU1(x);return asInt;}
+  AU2 AZolF2ToU2(AF2 x){AU2 asInt=AU2(x);return asInt;}
+  AU3 AZolF3ToU3(AF3 x){AU3 asInt=AU3(x);return asInt;}
+  AU4 AZolF4ToU4(AF4 x){AU4 asInt=AU4(x);return asInt;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // 2 ops, denormals don't work in 32-bit on PC (and if they are enabled, OMOD is disabled).
+  // Logical NOT folded into the conversion: 1.0-x is formed first, then converted to integer.
+  AU1 AZolNotF1ToU1(AF1 x){AF1 inverted=AF1_(1.0)-x;return AU1(inverted);}
+  AU2 AZolNotF2ToU2(AF2 x){AF2 inverted=AF2_(1.0)-x;return AU2(inverted);}
+  AU3 AZolNotF3ToU3(AF3 x){AF3 inverted=AF3_(1.0)-x;return AU3(inverted);}
+  AU4 AZolNotF4ToU4(AF4 x){AF4 inverted=AF4_(1.0)-x;return AU4(inverted);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Integer 0/1 to float 0.0/1.0 by value conversion.
+  AF1 AZolU1ToF1(AU1 x){AF1 asFloat=AF1(x);return asFloat;}
+  AF2 AZolU2ToF2(AU2 x){AF2 asFloat=AF2(x);return asFloat;}
+  AF3 AZolU3ToF3(AU3 x){AF3 asFloat=AF3(x);return asFloat;}
+  AF4 AZolU4ToF4(AU4 x){AF4 asFloat=AF4(x);return asFloat;}
+//==============================================================================================================================
+  // Float AND: with operands restricted to 0.0 and 1.0 the minimum is 1.0 only when both are 1.0.
+  AF1 AZolAndF1(AF1 x,AF1 y){AF1 both=min(x,y);return both;}
+  AF2 AZolAndF2(AF2 x,AF2 y){AF2 both=min(x,y);return both;}
+  AF3 AZolAndF3(AF3 x,AF3 y){AF3 both=min(x,y);return both;}
+  AF4 AZolAndF4(AF4 x,AF4 y){AF4 both=min(x,y);return both;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // AndNot(x,y) -> !(x&y), evaluated as 1-x*y in one multiply-add.
+  // The 'ASol' prefix appears to be a misspelling of the 'AZol' prefix used by every other function in this section.
+  // The original names are kept so existing callers still compile; AZolAndNotF* are the consistently named forms.
+  AF1 ASolAndNotF1(AF1 x,AF1 y){return (-x)*y+AF1_(1.0);}
+  AF2 ASolAndNotF2(AF2 x,AF2 y){return (-x)*y+AF2_(1.0);}
+  AF3 ASolAndNotF3(AF3 x,AF3 y){return (-x)*y+AF3_(1.0);}
+  AF4 ASolAndNotF4(AF4 x,AF4 y){return (-x)*y+AF4_(1.0);}
+  AF1 AZolAndNotF1(AF1 x,AF1 y){return ASolAndNotF1(x,y);}
+  AF2 AZolAndNotF2(AF2 x,AF2 y){return ASolAndNotF2(x,y);}
+  AF3 AZolAndNotF3(AF3 x,AF3 y){return ASolAndNotF3(x,y);}
+  AF4 AZolAndNotF4(AF4 x,AF4 y){return ASolAndNotF4(x,y);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // AndOr(x,y,z) -> (x&y)|z: one multiply-add, saturated to {0 to 1}.
+  AF1 AZolAndOrF1(AF1 x,AF1 y,AF1 z){AF1 mad=x*y+z;return ASatF1(mad);}
+  AF2 AZolAndOrF2(AF2 x,AF2 y,AF2 z){AF2 mad=x*y+z;return ASatF2(mad);}
+  AF3 AZolAndOrF3(AF3 x,AF3 y,AF3 z){AF3 mad=x*y+z;return ASatF3(mad);}
+  AF4 AZolAndOrF4(AF4 x,AF4 y,AF4 z){AF4 mad=x*y+z;return ASatF4(mad);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // GtZero(x) -> x>0.0: the input is multiplied by A_INFP_F and the product saturated.
+  AF1 AZolGtZeroF1(AF1 x){AF1 blown=x*AF1_(A_INFP_F);return ASatF1(blown);}
+  AF2 AZolGtZeroF2(AF2 x){AF2 blown=x*AF2_(A_INFP_F);return ASatF2(blown);}
+  AF3 AZolGtZeroF3(AF3 x){AF3 blown=x*AF3_(A_INFP_F);return ASatF3(blown);}
+  AF4 AZolGtZeroF4(AF4 x){AF4 blown=x*AF4_(A_INFP_F);return ASatF4(blown);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Not(x) -> 1.0-x.
+  AF1 AZolNotF1(AF1 x){AF1 one=AF1_(1.0);return one-x;}
+  AF2 AZolNotF2(AF2 x){AF2 one=AF2_(1.0);return one-x;}
+  AF3 AZolNotF3(AF3 x){AF3 one=AF3_(1.0);return one-x;}
+  AF4 AZolNotF4(AF4 x){AF4 one=AF4_(1.0);return one-x;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Or(x,y) -> max(x,y).
+  AF1 AZolOrF1(AF1 x,AF1 y){AF1 either=max(x,y);return either;}
+  AF2 AZolOrF2(AF2 x,AF2 y){AF2 either=max(x,y);return either;}
+  AF3 AZolOrF3(AF3 x,AF3 y){AF3 either=max(x,y);return either;}
+  AF4 AZolOrF4(AF4 x,AF4 y){AF4 either=max(x,y);return either;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Sel(x,y,z) -> x?y:z as two multiply-adds: first the 'z' share (z-x*z), then the 'y' share (x*y) added on top.
+  AF1 AZolSelF1(AF1 x,AF1 y,AF1 z){AF1 zShare=(-x)*z+z;AF1 picked=x*y+zShare;return picked;}
+  AF2 AZolSelF2(AF2 x,AF2 y,AF2 z){AF2 zShare=(-x)*z+z;AF2 picked=x*y+zShare;return picked;}
+  AF3 AZolSelF3(AF3 x,AF3 y,AF3 z){AF3 zShare=(-x)*z+z;AF3 picked=x*y+zShare;return picked;}
+  AF4 AZolSelF4(AF4 x,AF4 y,AF4 z){AF4 zShare=(-x)*z+z;AF4 picked=x*y+zShare;return picked;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Signed(x) -> x<0.0: the input is multiplied by A_INFN_F and the product saturated.
+  AF1 AZolSignedF1(AF1 x){AF1 blown=x*AF1_(A_INFN_F);return ASatF1(blown);}
+  AF2 AZolSignedF2(AF2 x){AF2 blown=x*AF2_(A_INFN_F);return ASatF2(blown);}
+  AF3 AZolSignedF3(AF3 x){AF3 blown=x*AF3_(A_INFN_F);return ASatF3(blown);}
+  AF4 AZolSignedF4(AF4 x){AF4 blown=x*AF4_(A_INFN_F);return ASatF4(blown);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // ZeroPass(x,y) -> x?0:y. Both the test and the pass-through work on raw bit patterns, so 'y' may alias integer data.
+  // Any non-zero bit pattern in 'x' counts as true.
+  AF1 AZolZeroPassF1(AF1 x,AF1 y){return AF1_AU1((AU1_AF1(x)!=AU1_(0))?AU1_(0):AU1_AF1(y));}
+  // The vector forms select per component by calling the scalar form on each lane. A 'vector!=vector' condition is
+  // not portable: in GLSL it yields one bool for the whole vector, so a single non-zero lane of 'x' would zero
+  // every lane of the result.
+  AF2 AZolZeroPassF2(AF2 x,AF2 y){return AF2(AZolZeroPassF1(x.x,y.x),AZolZeroPassF1(x.y,y.y));}
+  AF3 AZolZeroPassF3(AF3 x,AF3 y){return AF3(AZolZeroPassF1(x.x,y.x),AZolZeroPassF1(x.y,y.y),AZolZeroPassF1(x.z,y.z));}
+  AF4 AZolZeroPassF4(AF4 x,AF4 y){return AF4(AZolZeroPassF1(x.x,y.x),AZolZeroPassF1(x.y,y.y),AZolZeroPassF1(x.z,y.z),AZolZeroPassF1(x.w,y.w));}
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+  // 16-bit integer zero/one logic. With operands restricted to 0 and 1: min() is AND, XOR with 1 is NOT, max() is OR.
+  AW1 AZolAndW1(AW1 x,AW1 y){AW1 both=min(x,y);return both;}
+  AW2 AZolAndW2(AW2 x,AW2 y){AW2 both=min(x,y);return both;}
+  AW3 AZolAndW3(AW3 x,AW3 y){AW3 both=min(x,y);return both;}
+  AW4 AZolAndW4(AW4 x,AW4 y){AW4 both=min(x,y);return both;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AZolNotW1(AW1 x){AW1 one=AW1_(1);return x^one;}
+  AW2 AZolNotW2(AW2 x){AW2 one=AW2_(1);return x^one;}
+  AW3 AZolNotW3(AW3 x){AW3 one=AW3_(1);return x^one;}
+  AW4 AZolNotW4(AW4 x){AW4 one=AW4_(1);return x^one;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AZolOrW1(AW1 x,AW1 y){AW1 either=max(x,y);return either;}
+  AW2 AZolOrW2(AW2 x,AW2 y){AW2 either=max(x,y);return either;}
+  AW3 AZolOrW3(AW3 x,AW3 y){AW3 either=max(x,y);return either;}
+  AW4 AZolOrW4(AW4 x,AW4 y){AW4 either=max(x,y);return either;}
+//==============================================================================================================================
+  // Uses denormal trick: scale by the half whose bits are integer 1, then read the product back through AW*_AH*() (presumably bit casts - confirm).
+  AW1 AZolH1ToW1(AH1 x){return AW1_AH1(x*AH1_AW1(AW1_(1)));} // Intended for x of 0.0 or 1.0, giving integer 0 or 1.
+  AW2 AZolH2ToW2(AH2 x){return AW2_AH2(x*AH2_AW2(AW2_(1)));} // Per-component form of AZolH1ToW1.
+  AW3 AZolH3ToW3(AH3 x){return AW3_AH3(x*AH3_AW3(AW3_(1)));} // Per-component form of AZolH1ToW1.
+  AW4 AZolH4ToW4(AH4 x){return AW4_AH4(x*AH4_AW4(AW4_(1)));} // Per-component form of AZolH1ToW1.
+//------------------------------------------------------------------------------------------------------------------------------
+  // AMD arch lacks a packed conversion opcode. Integer 0/1 -> half 0.0/1.0: multiply by the integer image of 1.0 and convert back via AH*_AW*().
+  AH1 AZolW1ToH1(AW1 x){return AH1_AW1(x*AW1_AH1(AH1_(1.0)));}
+  AH2 AZolW2ToH2(AW2 x){return AH2_AW2(x*AW2_AH2(AH2_(1.0)));}
+  AH3 AZolW1ToH3(AW3 x){return AH3_AW3(x*AW3_AH3(AH3_(1.0)));} AH3 AZolW3ToH3(AW3 x){return AZolW1ToH3(x);} // Correctly numbered name; AZolW1ToH3 kept for callers.
+  AH4 AZolW2ToH4(AW4 x){return AH4_AW4(x*AW4_AH4(AH4_(1.0)));} AH4 AZolW4ToH4(AW4 x){return AZolW2ToH4(x);} // Correctly numbered name; AZolW2ToH4 kept for callers.
+//==============================================================================================================================
+  AH1 AZolAndH1(AH1 x,AH1 y){return min(x,y);} // AND: with x,y each 0.0 or 1.0, min() is 1.0 only when both are 1.0.
+  AH2 AZolAndH2(AH2 x,AH2 y){return min(x,y);} // Same as AZolAndH1, applied per component.
+  AH3 AZolAndH3(AH3 x,AH3 y){return min(x,y);} // Same as AZolAndH1, applied per component.
+  AH4 AZolAndH4(AH4 x,AH4 y){return min(x,y);} // Same as AZolAndH1, applied per component.
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ASolAndNotH1(AH1 x,AH1 y){return (-x)*y+AH1_(1.0);} AH1 AZolAndNotH1(AH1 x,AH1 y){return ASolAndNotH1(x,y);} // Returns 1-x*y. 'ASol' spelling kept for callers; AZol* alias matches the family.
+  AH2 ASolAndNotH2(AH2 x,AH2 y){return (-x)*y+AH2_(1.0);} AH2 AZolAndNotH2(AH2 x,AH2 y){return ASolAndNotH2(x,y);} // Per-component form.
+  AH3 ASolAndNotH3(AH3 x,AH3 y){return (-x)*y+AH3_(1.0);} AH3 AZolAndNotH3(AH3 x,AH3 y){return ASolAndNotH3(x,y);} // Per-component form.
+  AH4 ASolAndNotH4(AH4 x,AH4 y){return (-x)*y+AH4_(1.0);} AH4 AZolAndNotH4(AH4 x,AH4 y){return ASolAndNotH4(x,y);} // Per-component form.
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AZolAndOrH1(AH1 x,AH1 y,AH1 z){return ASatH1(x*y+z);} // (x AND y) OR z for 0.0/1.0 inputs: x*y+z passed through ASatH1 (presumably saturate to {0..1}).
+  AH2 AZolAndOrH2(AH2 x,AH2 y,AH2 z){return ASatH2(x*y+z);} // Per-component form of AZolAndOrH1.
+  AH3 AZolAndOrH3(AH3 x,AH3 y,AH3 z){return ASatH3(x*y+z);} // Per-component form of AZolAndOrH1.
+  AH4 AZolAndOrH4(AH4 x,AH4 y,AH4 z){return ASatH4(x*y+z);} // Per-component form of AZolAndOrH1.
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AZolGtZeroH1(AH1 x){return ASatH1(x*AH1_(A_INFP_H));} // Presumably A_INFP_H is +INF: x>0 -> 1.0, x<0 -> 0.0 after saturation.
+  AH2 AZolGtZeroH2(AH2 x){return ASatH2(x*AH2_(A_INFP_H));} // x==0 gives 0*INF=NaN; result then depends on ASat*'s NaN handling.
+  AH3 AZolGtZeroH3(AH3 x){return ASatH3(x*AH3_(A_INFP_H));} // Per-component form of AZolGtZeroH1.
+  AH4 AZolGtZeroH4(AH4 x){return ASatH4(x*AH4_(A_INFP_H));} // Per-component form of AZolGtZeroH1.
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AZolNotH1(AH1 x){return AH1_(1.0)-x;} // NOT: 1-x maps 0.0<->1.0.
+  AH2 AZolNotH2(AH2 x){return AH2_(1.0)-x;} // Per-component form of AZolNotH1.
+  AH3 AZolNotH3(AH3 x){return AH3_(1.0)-x;} // Per-component form of AZolNotH1.
+  AH4 AZolNotH4(AH4 x){return AH4_(1.0)-x;} // Per-component form of AZolNotH1.
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AZolOrH1(AH1 x,AH1 y){return max(x,y);} // OR: with x,y each 0.0 or 1.0, max() is 1.0 when either is 1.0.
+  AH2 AZolOrH2(AH2 x,AH2 y){return max(x,y);} // Per-component form of AZolOrH1.
+  AH3 AZolOrH3(AH3 x,AH3 y){return max(x,y);} // Per-component form of AZolOrH1.
+  AH4 AZolOrH4(AH4 x,AH4 y){return max(x,y);} // Per-component form of AZolOrH1.
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AZolSelH1(AH1 x,AH1 y,AH1 z){return x*y+((-x)*z+z);} // Select: x=1.0 gives y, x=0.0 gives z (for finite y,z); branch free.
+  AH2 AZolSelH2(AH2 x,AH2 y,AH2 z){return x*y+((-x)*z+z);} // Per-component form of AZolSelH1.
+  AH3 AZolSelH3(AH3 x,AH3 y,AH3 z){return x*y+((-x)*z+z);} // Per-component form of AZolSelH1.
+  AH4 AZolSelH4(AH4 x,AH4 y,AH4 z){return x*y+((-x)*z+z);} // Per-component form of AZolSelH1.
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AZolSignedH1(AH1 x){return ASatH1(x*AH1_(A_INFN_H));} // Presumably A_INFN_H is -INF and ASatH1 saturates to {0..1}: x<0 -> 1.0, x>0 -> 0.0.
+  AH2 AZolSignedH2(AH2 x){return ASatH2(x*AH2_(A_INFN_H));} // x==0 gives 0*INF=NaN; result then depends on ASat*'s NaN handling.
+  AH3 AZolSignedH3(AH3 x){return ASatH3(x*AH3_(A_INFN_H));} // Per-component form of AZolSignedH1.
+  AH4 AZolSignedH4(AH4 x){return ASatH4(x*AH4_(A_INFN_H));} // Per-component form of AZolSignedH1.
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                      COLOR CONVERSIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// These are all linear to/from some other space (where 'linear' has been shortened out of the function name).
+// So 'ToGamma' is 'LinearToGamma', and 'FromGamma' is 'LinearFromGamma'.
+// These are branch free implementations.
+// The AToSrgbF1() function is useful for stores for compute shaders for GPUs without hardware linear->sRGB store conversion.
+//------------------------------------------------------------------------------------------------------------------------------
+// TRANSFER FUNCTIONS
+// ==================
+// 709 ..... Rec709 used for some HDTVs
+// Gamma ... Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native
+// Pq ...... PQ native for HDR10
+// Srgb .... The sRGB output, typical of PC displays, useful for 10-bit output, or storing to 8-bit UNORM without SRGB type
+// Two ..... Gamma 2.0, fastest conversion (useful for intermediate pass approximations)
+// Three ... Gamma 3.0, less fast, but good for HDR.
+//------------------------------------------------------------------------------------------------------------------------------
+// KEEPING TO SPEC
+// ===============
+// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times.
+//  (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range).
+//  (b.) For 8-bit  709, steps {0 to 20.7} are in the linear region (8% of the encoding range).
+// Also there is a slight step in the transition regions.
+// Precision of the coefficients in the spec being the likely cause.
+// Main usage case of the sRGB code is to do the linear->sRGB conversion in a compute shader before store.
+// This is to work around lack of hardware (typically only ROP does the conversion for free).
+// To "correct" the linear segment, would be to introduce error, because hardware decode of sRGB->linear is fixed (and free).
+// So this header keeps with the spec.
+// For linear->sRGB transforms, the linear segment in some respects reduces error, because rounding in that region is linear.
+// Rounding in the curved region in hardware (and fast software code) introduces error due to rounding in non-linear.
+//------------------------------------------------------------------------------------------------------------------------------
+// FOR PQ
+// ======
+// Both input and output are {0.0-1.0}, where output 1.0 represents 10000.0 cd/m^2.
+// All constants are only specified to FP32 precision.
+// External PQ source reference,
+//  - https://github.com/ampas/aces-dev/blob/master/transforms/ctl/utilities/ACESlib.Utilities_Color.a1.0.1.ctl
+//------------------------------------------------------------------------------------------------------------------------------
+// PACKED VERSIONS
+// ===============
+// These are the A*H2() functions.
+// There are no PQ functions as FP16 seemed to not have enough precision for the conversion.
+// The remaining functions are "good enough" for 8-bit, and maybe 10-bit if not concerned about a few 1-bit errors.
+// Precision is lowest in the 709 conversion, higher in sRGB, higher still in Two and Gamma (when using 2.2 at least).
+//------------------------------------------------------------------------------------------------------------------------------
+// NOTES
+// =====
+// Could be faster for PQ conversions to be in ALU or a texture lookup depending on usage case.
+//==============================================================================================================================
+ #if 1
+  AF1 ATo709F1(AF1 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);AF1 l=c*j.y  ;AF1 p=pow(c,j.z  )*k.x  +k.y  ;
+   return max(min(j.x  ,l),min(max(j.x  ,l),p));} // Linear -> Rec.709: median of {knee 0.081, linear segment l, power curve p}.
+  AF2 ATo709F2(AF2 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);AF2 l=c*j.yy ;AF2 p=pow(c,j.zz )*k.xx +k.yy ;
+   return max(min(j.xx ,l),min(max(j.xx ,l),p));} // Explicit median: clamp(x,lo,hi) is undefined in GLSL when lo>hi, and l>p occurs here.
+  AF3 ATo709F3(AF3 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);AF3 l=c*j.yyy;AF3 p=pow(c,j.zzz)*k.xxx+k.yyy;
+   return max(min(j.xxx,l),min(max(j.xxx,l),p));} // Per-component form of ATo709F1.
+//------------------------------------------------------------------------------------------------------------------------------
+  // Note 'rcpX' is '1/x', where the 'x' is what would be used in AFromGamma().
+  AF1 AToGammaF1(AF1 c,AF1 rcpX){return pow(c,AF1_(rcpX));} // Returns c^rcpX.
+  AF2 AToGammaF2(AF2 c,AF1 rcpX){return pow(c,AF2_(rcpX));} // Scalar rcpX goes through AF2_() (presumably a broadcast) and is applied per component.
+  AF3 AToGammaF3(AF3 c,AF1 rcpX){return pow(c,AF3_(rcpX));} // Scalar rcpX goes through AF3_() (presumably a broadcast) and is applied per component.
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AToPqF1(AF1 x){AF1 p=pow(x,AF1_(0.159302)); // Linear -> PQ: ((0.835938+18.8516*p)/(1+18.6875*p))^78.8438 with p=x^0.159302.
+   return pow((AF1_(0.835938)+AF1_(18.8516)*p)/(AF1_(1.0)+AF1_(18.6875)*p),AF1_(78.8438));}
+  AF2 AToPqF1(AF2 x){AF2 p=pow(x,AF2_(0.159302)); // Two-component overload; the 'F1' name is kept for existing callers.
+   return pow((AF2_(0.835938)+AF2_(18.8516)*p)/(AF2_(1.0)+AF2_(18.6875)*p),AF2_(78.8438));} AF2 AToPqF2(AF2 x){return AToPqF1(x);}
+  AF3 AToPqF1(AF3 x){AF3 p=pow(x,AF3_(0.159302)); // Three-component overload; the 'F1' name is kept for existing callers.
+   return pow((AF3_(0.835938)+AF3_(18.8516)*p)/(AF3_(1.0)+AF3_(18.6875)*p),AF3_(78.8438));} AF3 AToPqF3(AF3 x){return AToPqF1(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AToSrgbF1(AF1 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);AF1 l=c*j.y  ;AF1 p=pow(c,j.z  )*k.x  +k.y  ;
+   return max(min(j.x  ,l),min(max(j.x  ,l),p));} // Linear -> sRGB: median of {knee, linear segment l, power curve p}.
+  AF2 AToSrgbF2(AF2 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);AF2 l=c*j.yy ;AF2 p=pow(c,j.zz )*k.xx +k.yy ;
+   return max(min(j.xx ,l),min(max(j.xx ,l),p));} // Explicit median: clamp(x,lo,hi) is undefined in GLSL when lo>hi, and l>p occurs here.
+  AF3 AToSrgbF3(AF3 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);AF3 l=c*j.yyy;AF3 p=pow(c,j.zzz)*k.xxx+k.yyy;
+   return max(min(j.xxx,l),min(max(j.xxx,l),p));} // Per-component form of AToSrgbF1.
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AToTwoF1(AF1 c){return sqrt(c);} // Linear -> gamma 2.0 encoding: c^(1/2).
+  AF2 AToTwoF2(AF2 c){return sqrt(c);} // Per-component form of AToTwoF1.
+  AF3 AToTwoF3(AF3 c){return sqrt(c);} // Per-component form of AToTwoF1.
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AToThreeF1(AF1 c){return pow(c,AF1_(1.0/3.0));} // Linear -> gamma 3.0 encoding: c^(1/3).
+  AF2 AToThreeF2(AF2 c){return pow(c,AF2_(1.0/3.0));} // Per-component form of AToThreeF1.
+  AF3 AToThreeF3(AF3 c){return pow(c,AF3_(1.0/3.0));} // Per-component form of AToThreeF1.
+ #endif
+//==============================================================================================================================
+ #if 1
+  // Unfortunately median won't work here, so select on the encoded-space knee 0.081 (= 0.018*4.5), since 'c' is the encoded value.
+  AF1 AFrom709F1(AF1 c){AF3 j=AF3(0.081,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
+   return AZolSelF1(AZolSignedF1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));} // c<0.081 ? c/4.5 : ((c+0.099)/1.099)^(1/0.45)
+  AF2 AFrom709F2(AF2 c){AF3 j=AF3(0.081,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
+   return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} // Per-component form of AFrom709F1.
+  AF3 AFrom709F3(AF3 c){AF3 j=AF3(0.081,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
+   return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));} // Per-component form of AFrom709F1.
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AFromGammaF1(AF1 c,AF1 x){return pow(c,AF1_(x));} // Returns c^x.
+  AF2 AFromGammaF2(AF2 c,AF1 x){return pow(c,AF2_(x));} // Scalar x goes through AF2_() (presumably a broadcast) and is applied per component.
+  AF3 AFromGammaF3(AF3 c,AF1 x){return pow(c,AF3_(x));} // Scalar x goes through AF3_() (presumably a broadcast) and is applied per component.
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AFromPqF1(AF1 x){AF1 p=pow(x,AF1_(0.0126833)); // PQ -> linear: (sat(p-0.835938)/(18.8516-18.6875*p))^6.27739 with p=x^0.0126833.
+   return pow(ASatF1(p-AF1_(0.835938))/(AF1_(18.8516)-AF1_(18.6875)*p),AF1_(6.27739));}
+  AF2 AFromPqF1(AF2 x){AF2 p=pow(x,AF2_(0.0126833)); // Two-component overload; the 'F1' name is kept for existing callers.
+   return pow(ASatF2(p-AF2_(0.835938))/(AF2_(18.8516)-AF2_(18.6875)*p),AF2_(6.27739));} AF2 AFromPqF2(AF2 x){return AFromPqF1(x);}
+  AF3 AFromPqF1(AF3 x){AF3 p=pow(x,AF3_(0.0126833)); // Three-component overload; the 'F1' name is kept for existing callers.
+   return pow(ASatF3(p-AF3_(0.835938))/(AF3_(18.8516)-AF3_(18.6875)*p),AF3_(6.27739));} AF3 AFromPqF3(AF3 x){return AFromPqF1(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Unfortunately median won't work here, so select on the encoded-space knee 0.04045 (~= 0.0031308*12.92), since 'c' is the encoded value.
+  AF1 AFromSrgbF1(AF1 c){AF3 j=AF3(0.04045,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
+   return AZolSelF1(AZolSignedF1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));} // c<0.04045 ? c/12.92 : ((c+0.055)/1.055)^2.4
+  AF2 AFromSrgbF2(AF2 c){AF3 j=AF3(0.04045,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
+   return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} // Per-component form of AFromSrgbF1.
+  AF3 AFromSrgbF3(AF3 c){AF3 j=AF3(0.04045,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
+   return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));} // Per-component form of AFromSrgbF1.
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AFromTwoF1(AF1 c){return c*c;} // Gamma 2.0 encoding -> linear: c^2.
+  AF2 AFromTwoF2(AF2 c){return c*c;} // Per-component form of AFromTwoF1.
+  AF3 AFromTwoF3(AF3 c){return c*c;} // Per-component form of AFromTwoF1.
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AFromThreeF1(AF1 c){return c*c*c;} // Gamma 3.0 encoding -> linear: c^3.
+  AF2 AFromThreeF2(AF2 c){return c*c*c;} // Per-component form of AFromThreeF1.
+  AF3 AFromThreeF3(AF3 c){return c*c*c;} // Per-component form of AFromThreeF1.
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+  AH1 ATo709H1(AH1 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);AH1 l=c*j.y  ;AH1 p=pow(c,j.z  )*k.x  +k.y  ;
+   return max(min(j.x  ,l),min(max(j.x  ,l),p));} // Linear -> Rec.709 (half): median of {knee 0.081, linear segment l, power curve p}.
+  AH2 ATo709H2(AH2 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);AH2 l=c*j.yy ;AH2 p=pow(c,j.zz )*k.xx +k.yy ;
+   return max(min(j.xx ,l),min(max(j.xx ,l),p));} // Explicit median: clamp(x,lo,hi) is undefined in GLSL when lo>hi, and l>p occurs here.
+  AH3 ATo709H3(AH3 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);AH3 l=c*j.yyy;AH3 p=pow(c,j.zzz)*k.xxx+k.yyy;
+   return max(min(j.xxx,l),min(max(j.xxx,l),p));} // Per-component form of ATo709H1.
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AToGammaH1(AH1 c,AH1 rcpX){return pow(c,AH1_(rcpX));} // Returns c^rcpX; rcpX is the reciprocal of the exponent passed to AFromGammaH1().
+  AH2 AToGammaH2(AH2 c,AH1 rcpX){return pow(c,AH2_(rcpX));} // Scalar rcpX goes through AH2_() (presumably a broadcast) and is applied per component.
+  AH3 AToGammaH3(AH3 c,AH1 rcpX){return pow(c,AH3_(rcpX));} // Scalar rcpX goes through AH3_() (presumably a broadcast) and is applied per component.
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AToSrgbH1(AH1 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);AH1 l=c*j.y  ;AH1 p=pow(c,j.z  )*k.x  +k.y  ;
+   return max(min(j.x  ,l),min(max(j.x  ,l),p));} // Linear -> sRGB (half): median of {knee, linear segment l, power curve p}.
+  AH2 AToSrgbH2(AH2 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);AH2 l=c*j.yy ;AH2 p=pow(c,j.zz )*k.xx +k.yy ;
+   return max(min(j.xx ,l),min(max(j.xx ,l),p));} // Explicit median: clamp(x,lo,hi) is undefined in GLSL when lo>hi, and l>p occurs here.
+  AH3 AToSrgbH3(AH3 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);AH3 l=c*j.yyy;AH3 p=pow(c,j.zzz)*k.xxx+k.yyy;
+   return max(min(j.xxx,l),min(max(j.xxx,l),p));} // Per-component form of AToSrgbH1.
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AToTwoH1(AH1 c){return sqrt(c);} // Linear -> gamma 2.0 encoding (half): c^(1/2).
+  AH2 AToTwoH2(AH2 c){return sqrt(c);} // Per-component form of AToTwoH1.
+  AH3 AToTwoH3(AH3 c){return sqrt(c);} // Per-component form of AToTwoH1.
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AToThreeF1(AH1 c){return pow(c,AH1_(1.0/3.0));} AH1 AToThreeH1(AH1 c){return pow(c,AH1_(1.0/3.0));} // c^(1/3) in half; 'F'-suffixed overload kept for callers, 'H' name matches siblings.
+  AH2 AToThreeF2(AH2 c){return pow(c,AH2_(1.0/3.0));} AH2 AToThreeH2(AH2 c){return pow(c,AH2_(1.0/3.0));} // Per-component form.
+  AH3 AToThreeF3(AH3 c){return pow(c,AH3_(1.0/3.0));} AH3 AToThreeH3(AH3 c){return pow(c,AH3_(1.0/3.0));} // Per-component form.
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+  AH1 AFrom709H1(AH1 c){AH3 j=AH3(0.081,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099); // j.x is the encoded-space knee ('c' is encoded).
+   return AZolSelH1(AZolSignedH1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));} // c<0.081 ? c/4.5 : ((c+0.099)/1.099)^(1/0.45)
+  AH2 AFrom709H2(AH2 c){AH3 j=AH3(0.081,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
+   return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} // Per-component form of AFrom709H1.
+  AH3 AFrom709H3(AH3 c){AH3 j=AH3(0.081,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
+   return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));} // Per-component form of AFrom709H1.
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AFromGammaH1(AH1 c,AH1 x){return pow(c,AH1_(x));} // Returns c^x.
+  AH2 AFromGammaH2(AH2 c,AH1 x){return pow(c,AH2_(x));} // Scalar x goes through AH2_() (presumably a broadcast) and is applied per component.
+  AH3 AFromGammaH3(AH3 c,AH1 x){return pow(c,AH3_(x));} // Scalar x goes through AH3_() (presumably a broadcast) and is applied per component.
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AHromSrgbF1(AH1 c){AH3 j=AH3(0.04045,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055); // sRGB -> linear (half); j.x is the encoded-space knee.
+   return AZolSelH1(AZolSignedH1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));} AH1 AFromSrgbH1(AH1 c){return AHromSrgbF1(c);} // Correctly spelled alias.
+  AH2 AHromSrgbF2(AH2 c){AH3 j=AH3(0.04045,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055); // Misspelled 'AHrom...F' names are kept for existing callers.
+   return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} AH2 AFromSrgbH2(AH2 c){return AHromSrgbF2(c);} // Correctly spelled alias.
+  AH3 AHromSrgbF3(AH3 c){AH3 j=AH3(0.04045,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
+   return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));} AH3 AFromSrgbH3(AH3 c){return AHromSrgbF3(c);} // Correctly spelled alias.
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AFromTwoH1(AH1 c){return c*c;} // Gamma 2.0 encoding -> linear (half): c^2.
+  AH2 AFromTwoH2(AH2 c){return c*c;} // Per-component form of AFromTwoH1.
+  AH3 AFromTwoH3(AH3 c){return c*c;} // Per-component form of AFromTwoH1.
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AFromThreeH1(AH1 c){return c*c*c;} // Gamma 3.0 encoding -> linear (half): c^3.
+  AH2 AFromThreeH2(AH2 c){return c*c*c;} // Per-component form of AFromThreeH1.
+  AH3 AFromThreeH3(AH3 c){return c*c*c;} // Per-component form of AFromThreeH1.
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                          CS REMAP
+//==============================================================================================================================
+ // Simple remap 64x1 to 8x8 with rotated 2x2 pixel quads in quad linear.
+ //  543210   <- bit positions of the input 'a'; the rows below mark which bits feed the returned .x and .y
+ //  ======
+ //  ..xxx.
+ //  yy...y
+ AU2 ARmp8x8(AU1 a){return AU2(ABfe(a,1u,3u),ABfiM(ABfe(a,3u,3u),a,1u));} // Presumably ABfe = bitfield extract, ABfiM = masked bitfield insert (defined elsewhere) - confirm.
+//==============================================================================================================================
+ // More complex remap 64x1 to 8x8 which is necessary for 2D wave reductions.
+ //  543210   <- bit positions of the input 'a'; the rows below mark which bits feed the returned .x and .y
+ //  ======
+ //  .xx..x
+ //  y..yy.
+ // Details,
+ //  LANE TO 8x8 MAPPING
+ //  ===================
+ //  00 01 08 09 10 11 18 19 
+ //  02 03 0a 0b 12 13 1a 1b
+ //  04 05 0c 0d 14 15 1c 1d
+ //  06 07 0e 0f 16 17 1e 1f 
+ //  20 21 28 29 30 31 38 39 
+ //  22 23 2a 2b 32 33 3a 3b
+ //  24 25 2c 2d 34 35 3c 3d
+ //  26 27 2e 2f 36 37 3e 3f 
+ AU2 ARmpRed8x8(AU1 a){return AU2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u));} // Table entries are hex lane indices; presumably ABfe/ABfiM are bitfield extract/insert - confirm.
+//==============================================================================================================================
+ #ifdef A_HALF
+  AW2 ARmp8x8H(AU1 a){return AW2(ABfe(a,1u,3u),ABfiM(ABfe(a,3u,3u),a,1u));} // Same expression as ARmp8x8, with the result constructed as AW2.
+  AW2 ARmpRed8x8H(AU1 a){return AW2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u));} // Same expression as ARmpRed8x8, with the result constructed as AW2.
+ #endif
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                                          REFERENCE
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// IEEE FLOAT RULES
+// ================
+//  - saturate(NaN)=0, saturate(-INF)=0, saturate(+INF)=1
+//  - {+/-}0 * {+/-}INF = NaN
+//  - -INF + (+INF) = NaN
+//  - {+/-}0 / {+/-}0 = NaN
+//  - {+/-}INF / {+/-}INF = NaN
+//  - a<(-0) := sqrt(a) = NaN (a=-0.0 won't NaN)
+//  - 0 == -0
+//  - 4/0 = +INF
+//  - 4/-0 = -INF
+//  - 4+INF = +INF
+//  - 4-INF = -INF
+//  - 4*(+INF) = +INF
+//  - 4*(-INF) = -INF
+//  - -4*(+INF) = -INF
+//  - sqrt(+INF) = +INF
+//------------------------------------------------------------------------------------------------------------------------------
+// FP16 ENCODING
+// =============
+// fedcba9876543210
+// ----------------
+// ......mmmmmmmmmm  10-bit mantissa (encodes 11-bit 0.5 to 1.0 except for denormals)
+// .eeeee..........  5-bit exponent
+// .00000..........  denormals
+// .00001..........  -14 exponent
+// .11110..........   15 exponent
+// .111110000000000  infinity
+// .11111nnnnnnnnnn  NaN with n!=0
+// s...............  sign
+//------------------------------------------------------------------------------------------------------------------------------
+// FP16/INT16 ALIASING DENORMAL
+// ============================
+// 11-bit unsigned integers alias with half float denormal/normal values,
+//     1 = 2^(-24) = 1/16777216 ....................... first denormal value
+//     2 = 2^(-23)
+//   ...
+//  1023 = 2^(-14)*(1-2^(-10)) = 2^(-14)*(1-1/1024) ... last denormal value
+//  1024 = 2^(-14) = 1/16384 .......................... first normal value that still maps to integers
+//  2047 .............................................. last normal value that still maps to integers 
+// Scaling limits,
+//  2^15 = 32768 ...................................... largest power of 2 scaling
+// Largest pow2 conversion mapping is at *32768,
+//     1 : 2^(-9) = 1/512
+//     2 : 1/256
+//     4 : 1/128
+//     8 : 1/64
+//    16 : 1/32
+//    32 : 1/16
+//    64 : 1/8
+//   128 : 1/4
+//   256 : 1/2
+//   512 : 1
+//  1024 : 2
+//  2047 : a little less than 4
+//==============================================================================================================================
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+//                                                     GPU/CPU PORTABILITY
+//
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// This is the GPU implementation.
+// See the CPU implementation for docs.
+//==============================================================================================================================
+#ifdef A_GPU
+ #define A_TRUE true // Boolean literals for the GPU path.
+ #define A_FALSE false
+ #define A_STATIC // Expands to nothing on the GPU path.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                     VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
+//==============================================================================================================================
+ #define retAD2 AD2 // ret<T>: spelling of a vector return type; on the GPU path it is the vector type itself.
+ #define retAD3 AD3
+ #define retAD4 AD4
+ #define retAF2 AF2
+ #define retAF3 AF3
+ #define retAF4 AF4
+ #define retAL2 AL2
+ #define retAL3 AL3
+ #define retAL4 AL4
+ #define retAU2 AU2
+ #define retAU3 AU3
+ #define retAU4 AU4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define inAD2 in AD2 // in<T>: spelling of an input vector parameter ('in' qualifier on the GPU path).
+ #define inAD3 in AD3
+ #define inAD4 in AD4
+ #define inAF2 in AF2
+ #define inAF3 in AF3
+ #define inAF4 in AF4
+ #define inAL2 in AL2
+ #define inAL3 in AL3
+ #define inAL4 in AL4
+ #define inAU2 in AU2
+ #define inAU3 in AU3
+ #define inAU4 in AU4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define inoutAD2 inout AD2 // inout<T>: spelling of a read-write vector parameter ('inout' qualifier on the GPU path).
+ #define inoutAD3 inout AD3
+ #define inoutAD4 inout AD4
+ #define inoutAF2 inout AF2
+ #define inoutAF3 inout AF3
+ #define inoutAF4 inout AF4
+ #define inoutAL2 inout AL2
+ #define inoutAL3 inout AL3
+ #define inoutAL4 inout AL4
+ #define inoutAU2 inout AU2
+ #define inoutAU3 inout AU3
+ #define inoutAU4 inout AU4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define outAD2 out AD2 // out<T>: spelling of an output vector parameter ('out' qualifier on the GPU path).
+ #define outAD3 out AD3
+ #define outAD4 out AD4
+ #define outAF2 out AF2
+ #define outAF3 out AF3
+ #define outAF4 out AF4
+ #define outAL2 out AL2
+ #define outAL3 out AL3
+ #define outAL4 out AL4
+ #define outAU2 out AU2
+ #define outAU3 out AU3
+ #define outAU4 out AU4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define varAD2(x) AD2 x // var<T>(x): declares a vector variable named x of type T.
+ #define varAD3(x) AD3 x
+ #define varAD4(x) AD4 x
+ #define varAF2(x) AF2 x
+ #define varAF3(x) AF3 x
+ #define varAF4(x) AF4 x
+ #define varAL2(x) AL2 x
+ #define varAL3(x) AL3 x
+ #define varAL4(x) AL4 x
+ #define varAU2(x) AU2 x
+ #define varAU3(x) AU3 x
+ #define varAU4(x) AU4 x
+//------------------------------------------------------------------------------------------------------------------------------
+ #define initAD2(x,y) AD2(x,y) // init<T>(...): builds a vector of type T from its components via the type constructor.
+ #define initAD3(x,y,z) AD3(x,y,z)
+ #define initAD4(x,y,z,w) AD4(x,y,z,w)
+ #define initAF2(x,y) AF2(x,y)
+ #define initAF3(x,y,z) AF3(x,y,z)
+ #define initAF4(x,y,z,w) AF4(x,y,z,w)
+ #define initAL2(x,y) AL2(x,y)
+ #define initAL3(x,y,z) AL3(x,y,z)
+ #define initAL4(x,y,z,w) AL4(x,y,z,w)
+ #define initAU2(x,y) AU2(x,y)
+ #define initAU3(x,y,z) AU3(x,y,z)
+ #define initAU4(x,y,z,w) AU4(x,y,z,w)
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     SCALAR RETURN OPS
+//==============================================================================================================================
+ #define AAbsD1(a) abs(AD1(a)) // Scalar ops: each macro converts its argument to the suffix type (D1/F1) and calls the intrinsic.
+ #define AAbsF1(a) abs(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ACosD1(a) cos(AD1(a))
+ #define ACosF1(a) cos(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ADotD2(a,b) dot(AD2(a),AD2(b)) // Dot products take vectors but return a scalar.
+ #define ADotD3(a,b) dot(AD3(a),AD3(b))
+ #define ADotD4(a,b) dot(AD4(a),AD4(b))
+ #define ADotF2(a,b) dot(AF2(a),AF2(b))
+ #define ADotF3(a,b) dot(AF3(a),AF3(b))
+ #define ADotF4(a,b) dot(AF4(a),AF4(b))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AExp2D1(a) exp2(AD1(a))
+ #define AExp2F1(a) exp2(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AFloorD1(a) floor(AD1(a))
+ #define AFloorF1(a) floor(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ALog2D1(a) log2(AD1(a))
+ #define ALog2F1(a) log2(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AMaxD1(a,b) max(a,b) // Unlike the macros above, AMax*/AMin* pass their arguments through without a type conversion.
+ #define AMaxF1(a,b) max(a,b)
+ #define AMaxL1(a,b) max(a,b)
+ #define AMaxU1(a,b) max(a,b)
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AMinD1(a,b) min(a,b)
+ #define AMinF1(a,b) min(a,b)
+ #define AMinL1(a,b) min(a,b)
+ #define AMinU1(a,b) min(a,b)
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ASinD1(a) sin(AD1(a))
+ #define ASinF1(a) sin(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ASqrtD1(a) sqrt(AD1(a))
+ #define ASqrtF1(a) sqrt(AF1(a))
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                               SCALAR RETURN OPS - DEPENDENT
+//==============================================================================================================================
+ #define APowD1(a,b) pow(AD1(a),AF1(b)) // NOTE(review): the exponent is converted to AF1 even though the base is AD1 - confirm this mix is intended.
+ #define APowF1(a,b) pow(AF1(a),AF1(b)) // a^b with both operands converted to AF1.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                         VECTOR OPS
+//------------------------------------------------------------------------------------------------------------------------------
+// These are added as needed for production or prototyping, so not necessarily a complete set.
+// They follow a convention of taking in a destination and also returning the destination value to increase utility.
+//==============================================================================================================================
+ #ifdef A_DUBL // Double-precision variants; only compiled when A_DUBL is defined.
+  AD2 opAAbsD2(outAD2 d, inAD2 a) { return d = abs(a); } // Abs: d = |a|. Each op in this block stores into 'd' and returns that value.
+  AD3 opAAbsD3(outAD3 d, inAD3 a) { return d = abs(a); }
+  AD4 opAAbsD4(outAD4 d, inAD4 a) { return d = abs(a); }
+  // Add: d = a + b.
+  AD2 opAAddD2(outAD2 d, inAD2 a, inAD2 b) { return d = a + b; }
+  AD3 opAAddD3(outAD3 d, inAD3 a, inAD3 b) { return d = a + b; }
+  AD4 opAAddD4(outAD4 d, inAD4 a, inAD4 b) { return d = a + b; }
+  // AddOne: d = a + b, where the scalar 'b' is first wrapped with AD2_/AD3_/AD4_.
+  AD2 opAAddOneD2(outAD2 d, inAD2 a, AD1 b) { return d = a + AD2_(b); }
+  AD3 opAAddOneD3(outAD3 d, inAD3 a, AD1 b) { return d = a + AD3_(b); }
+  AD4 opAAddOneD4(outAD4 d, inAD4 a, AD1 b) { return d = a + AD4_(b); }
+  // Cpy: d = a.
+  AD2 opACpyD2(outAD2 d, inAD2 a) { return d = a; }
+  AD3 opACpyD3(outAD3 d, inAD3 a) { return d = a; }
+  AD4 opACpyD4(outAD4 d, inAD4 a) { return d = a; }
+  // Lerp: d = ALerpD2/3/4(a, b, c) with a vector factor 'c'.
+  AD2 opALerpD2(outAD2 d, inAD2 a, inAD2 b, inAD2 c) { return d = ALerpD2(a, b, c); }
+  AD3 opALerpD3(outAD3 d, inAD3 a, inAD3 b, inAD3 c) { return d = ALerpD3(a, b, c); }
+  AD4 opALerpD4(outAD4 d, inAD4 a, inAD4 b, inAD4 c) { return d = ALerpD4(a, b, c); }
+  // LerpOne: as Lerp, but the scalar factor 'c' is first wrapped with AD2_/AD3_/AD4_.
+  AD2 opALerpOneD2(outAD2 d, inAD2 a, inAD2 b, AD1 c) { return d = ALerpD2(a, b, AD2_(c)); }
+  AD3 opALerpOneD3(outAD3 d, inAD3 a, inAD3 b, AD1 c) { return d = ALerpD3(a, b, AD3_(c)); }
+  AD4 opALerpOneD4(outAD4 d, inAD4 a, inAD4 b, AD1 c) { return d = ALerpD4(a, b, AD4_(c)); }
+  // Max: d = max(a, b).
+  AD2 opAMaxD2(outAD2 d, inAD2 a, inAD2 b) { return d = max(a, b); }
+  AD3 opAMaxD3(outAD3 d, inAD3 a, inAD3 b) { return d = max(a, b); }
+  AD4 opAMaxD4(outAD4 d, inAD4 a, inAD4 b) { return d = max(a, b); }
+  // Min: d = min(a, b).
+  AD2 opAMinD2(outAD2 d, inAD2 a, inAD2 b) { return d = min(a, b); }
+  AD3 opAMinD3(outAD3 d, inAD3 a, inAD3 b) { return d = min(a, b); }
+  AD4 opAMinD4(outAD4 d, inAD4 a, inAD4 b) { return d = min(a, b); }
+  // Mul: d = a * b.
+  AD2 opAMulD2(outAD2 d, inAD2 a, inAD2 b) { return d = a * b; }
+  AD3 opAMulD3(outAD3 d, inAD3 a, inAD3 b) { return d = a * b; }
+  AD4 opAMulD4(outAD4 d, inAD4 a, inAD4 b) { return d = a * b; }
+  // MulOne: d = a * b, where the scalar 'b' is first wrapped with AD2_/AD3_/AD4_.
+  AD2 opAMulOneD2(outAD2 d, inAD2 a, AD1 b) { return d = a * AD2_(b); }
+  AD3 opAMulOneD3(outAD3 d, inAD3 a, AD1 b) { return d = a * AD3_(b); }
+  AD4 opAMulOneD4(outAD4 d, inAD4 a, AD1 b) { return d = a * AD4_(b); }
+  // Neg: d = -a.
+  AD2 opANegD2(outAD2 d, inAD2 a) { return d = -a; }
+  AD3 opANegD3(outAD3 d, inAD3 a) { return d = -a; }
+  AD4 opANegD4(outAD4 d, inAD4 a) { return d = -a; }
+  // Rcp: d = ARcpD2/3/4(a).
+  AD2 opARcpD2(outAD2 d, inAD2 a) { return d = ARcpD2(a); }
+  AD3 opARcpD3(outAD3 d, inAD3 a) { return d = ARcpD3(a); }
+  AD4 opARcpD4(outAD4 d, inAD4 a) { return d = ARcpD4(a); }
+ #endif
+//==============================================================================================================================
+ AF2 opAAbsF2(outAF2 d, inAF2 a) { return d = abs(a); } // Abs: d = |a|. Each op in this group stores into 'd' and returns that value.
+ AF3 opAAbsF3(outAF3 d, inAF3 a) { return d = abs(a); }
+ AF4 opAAbsF4(outAF4 d, inAF4 a) { return d = abs(a); }
+ // Add: d = a + b.
+ AF2 opAAddF2(outAF2 d, inAF2 a, inAF2 b) { return d = a + b; }
+ AF3 opAAddF3(outAF3 d, inAF3 a, inAF3 b) { return d = a + b; }
+ AF4 opAAddF4(outAF4 d, inAF4 a, inAF4 b) { return d = a + b; }
+ // AddOne: d = a + b, where the scalar 'b' is first wrapped with AF2_/AF3_/AF4_.
+ AF2 opAAddOneF2(outAF2 d, inAF2 a, AF1 b) { return d = a + AF2_(b); }
+ AF3 opAAddOneF3(outAF3 d, inAF3 a, AF1 b) { return d = a + AF3_(b); }
+ AF4 opAAddOneF4(outAF4 d, inAF4 a, AF1 b) { return d = a + AF4_(b); }
+ // Cpy: d = a.
+ AF2 opACpyF2(outAF2 d, inAF2 a) { return d = a; }
+ AF3 opACpyF3(outAF3 d, inAF3 a) { return d = a; }
+ AF4 opACpyF4(outAF4 d, inAF4 a) { return d = a; }
+ // Lerp: d = ALerpF2/3/4(a, b, c) with a vector factor 'c'.
+ AF2 opALerpF2(outAF2 d, inAF2 a, inAF2 b, inAF2 c) { return d = ALerpF2(a, b, c); }
+ AF3 opALerpF3(outAF3 d, inAF3 a, inAF3 b, inAF3 c) { return d = ALerpF3(a, b, c); }
+ AF4 opALerpF4(outAF4 d, inAF4 a, inAF4 b, inAF4 c) { return d = ALerpF4(a, b, c); }
+ // LerpOne: as Lerp, but the scalar factor 'c' is first wrapped with AF2_/AF3_/AF4_.
+ AF2 opALerpOneF2(outAF2 d, inAF2 a, inAF2 b, AF1 c) { return d = ALerpF2(a, b, AF2_(c)); }
+ AF3 opALerpOneF3(outAF3 d, inAF3 a, inAF3 b, AF1 c) { return d = ALerpF3(a, b, AF3_(c)); }
+ AF4 opALerpOneF4(outAF4 d, inAF4 a, inAF4 b, AF1 c) { return d = ALerpF4(a, b, AF4_(c)); }
+ // Max: d = max(a, b).
+ AF2 opAMaxF2(outAF2 d, inAF2 a, inAF2 b) { return d = max(a, b); }
+ AF3 opAMaxF3(outAF3 d, inAF3 a, inAF3 b) { return d = max(a, b); }
+ AF4 opAMaxF4(outAF4 d, inAF4 a, inAF4 b) { return d = max(a, b); }
+ // Min: d = min(a, b).
+ AF2 opAMinF2(outAF2 d, inAF2 a, inAF2 b) { return d = min(a, b); }
+ AF3 opAMinF3(outAF3 d, inAF3 a, inAF3 b) { return d = min(a, b); }
+ AF4 opAMinF4(outAF4 d, inAF4 a, inAF4 b) { return d = min(a, b); }
+ // Mul: d = a * b.
+ AF2 opAMulF2(outAF2 d, inAF2 a, inAF2 b) { return d = a * b; }
+ AF3 opAMulF3(outAF3 d, inAF3 a, inAF3 b) { return d = a * b; }
+ AF4 opAMulF4(outAF4 d, inAF4 a, inAF4 b) { return d = a * b; }
+ // MulOne: d = a * b, where the scalar 'b' is first wrapped with AF2_/AF3_/AF4_.
+ AF2 opAMulOneF2(outAF2 d, inAF2 a, AF1 b) { return d = a * AF2_(b); }
+ AF3 opAMulOneF3(outAF3 d, inAF3 a, AF1 b) { return d = a * AF3_(b); }
+ AF4 opAMulOneF4(outAF4 d, inAF4 a, AF1 b) { return d = a * AF4_(b); }
+ // Neg: d = -a.
+ AF2 opANegF2(outAF2 d, inAF2 a) { return d = -a; }
+ AF3 opANegF3(outAF3 d, inAF3 a) { return d = -a; }
+ AF4 opANegF4(outAF4 d, inAF4 a) { return d = -a; }
+ // Rcp: d = ARcpF2/3/4(a).
+ AF2 opARcpF2(outAF2 d, inAF2 a) { return d = ARcpF2(a); }
+ AF3 opARcpF3(outAF3 d, inAF3 a) { return d = ARcpF3(a); }
+ AF4 opARcpF4(outAF4 d, inAF4 a) { return d = ARcpF4(a); }
+#endif
+
+#define FSR_EASU_F 1 // Enables the non-packed 32-bit EASU section guarded by defined(FSR_EASU_F) further down.
+AU4 con0, con1, con2, con3; // EASU constant vectors; presumably filled by FsrEasuCon() at the call site - confirm.
+float srcW, srcH, dstW, dstH; // NOTE(review): presumably source/destination sizes; not referenced in this part of the file - confirm usage.
+vec2 bLeft, tRight; // Bounds corners, written by setBounds().
+
+AF2 translate(AF2 pos) { // Scales a coordinate per axis by scaleX/scaleY (both declared outside this block).
+    return pos * AF2(scaleX, scaleY); // component-wise product: (pos.x * scaleX, pos.y * scaleY)
+}
+
+void setBounds(vec2 bottomLeft, vec2 topRight) { // Stores both corners in the file-scope bLeft/tRight variables.
+    tRight = topRight;
+    bLeft = bottomLeft;
+}
+
+AF4 FsrEasuRF(AF2 p) { return textureGather(Source, translate(p), 0); } // EASU red callback: gathers component 0 of Source at translate(p).
+AF4 FsrEasuGF(AF2 p) { return textureGather(Source, translate(p), 1); } // EASU green callback: same gather, component 1.
+AF4 FsrEasuBF(AF2 p) { return textureGather(Source, translate(p), 2); } // EASU blue callback: same gather, component 2.
+
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+//                    AMD FidelityFX SUPER RESOLUTION [FSR 1] ::: SPATIAL SCALING & EXTRAS - v1.20210629
+//
+//
+//------------------------------------------------------------------------------------------------------------------------------
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//------------------------------------------------------------------------------------------------------------------------------
+// FidelityFX Super Resolution Sample
+//
+// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files(the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions :
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//------------------------------------------------------------------------------------------------------------------------------
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//------------------------------------------------------------------------------------------------------------------------------
+// ABOUT
+// =====
+// FSR is a collection of algorithms relating to generating a higher resolution image.
+// This specific header focuses on single-image non-temporal image scaling, and related tools.
+// 
+// The core functions are EASU and RCAS:
+//  [EASU] Edge Adaptive Spatial Upsampling ....... 1x to 4x area range spatial scaling, clamped adaptive elliptical filter.
+//  [RCAS] Robust Contrast Adaptive Sharpening .... A non-scaling variation on CAS.
+// RCAS needs to be applied after EASU as a separate pass.
+// 
+// Optional utility functions are:
+//  [LFGA] Linear Film Grain Applicator ........... Tool to apply film grain after scaling.
+//  [SRTM] Simple Reversible Tone-Mapper .......... Linear HDR {0 to FP16_MAX} to {0 to 1} and back.
+//  [TEPD] Temporal Energy Preserving Dither ...... Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion.
+// See each individual sub-section for inline documentation.
+//------------------------------------------------------------------------------------------------------------------------------
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//------------------------------------------------------------------------------------------------------------------------------
+// FUNCTION PERMUTATIONS
+// =====================
+// *F() ..... Single item computation with 32-bit.
+// *H() ..... Single item computation with 16-bit, with packing (aka two 16-bit ops in parallel) when possible.
+// *Hx2() ... Processing two items in parallel with 16-bit, easier packing.
+//            Not all interfaces in this file have a *Hx2() form.
+//==============================================================================================================================
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                        FSR - [EASU] EDGE ADAPTIVE SPATIAL UPSAMPLING
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// EASU provides a high quality spatial-only scaling at relatively low cost.
+// Meaning EASU is appropriate for laptops and other low-end GPUs.
+// Quality from 1x to 4x area scaling is good.
+//------------------------------------------------------------------------------------------------------------------------------
+// The scaler uses a modified fast approximation to the standard lanczos(size=2) kernel.
+// EASU runs in a single pass, so it applies a directionally and anisotropically adaptive radial lanczos.
+// This is also kept as simple as possible to have minimum runtime.
+//------------------------------------------------------------------------------------------------------------------------------
+// The lanczos filter has negative lobes, so by itself it will introduce ringing.
+// To remove all ringing, the algorithm uses the nearest 2x2 input texels as a neighborhood,
+// and limits output to the minimum and maximum of that neighborhood.
+//------------------------------------------------------------------------------------------------------------------------------
+// Input image requirements:
+// 
+// Color needs to be encoded as 3 channels [red, green, blue] (e.g. XYZ is not supported)
+// Each channel needs to be in the range [0, 1]
+// Any color primaries are supported
+// Display / tonemapping curve needs to be as if presenting to an sRGB display or similar (e.g. Gamma 2.0)
+// There should be no banding in the input
+// There should be no high amplitude noise in the input
+// There should be no noise in the input that is not at input pixel granularity
+// For performance purposes, use 32bpp formats
+//------------------------------------------------------------------------------------------------------------------------------
+// Best to apply EASU at the end of the frame after tonemapping 
+// but before film grain or composite of the UI.
+//------------------------------------------------------------------------------------------------------------------------------
+// Example of including this header for D3D HLSL :
+// 
+//  #define A_GPU 1
+//  #define A_HLSL 1
+//  #define A_HALF 1
+//  #include "ffx_a.h"
+//  #define FSR_EASU_H 1
+//  #define FSR_RCAS_H 1
+//  //declare input callbacks
+//  #include "ffx_fsr1.h"
+// 
+// Example of including this header for Vulkan GLSL :
+// 
+//  #define A_GPU 1
+//  #define A_GLSL 1
+//  #define A_HALF 1
+//  #include "ffx_a.h"
+//  #define FSR_EASU_H 1
+//  #define FSR_RCAS_H 1
+//  //declare input callbacks
+//  #include "ffx_fsr1.h"
+// 
+// Example of including this header for Vulkan HLSL :
+// 
+//  #define A_GPU 1
+//  #define A_HLSL 1
+//  #define A_HLSL_6_2 1
+//  #define A_NO_16_BIT_CAST 1
+//  #define A_HALF 1
+//  #include "ffx_a.h"
+//  #define FSR_EASU_H 1
+//  #define FSR_RCAS_H 1
+//  //declare input callbacks
+//  #include "ffx_fsr1.h"
+// 
+//  Example of declaring the required input callbacks for GLSL :
+//  The callbacks need to gather4 for each color channel using the specified texture coordinate 'p'.
+//  EASU uses gather4 to reduce position computation logic and for free Arrays of Structures to Structures of Arrays conversion.
+// 
+//  AH4 FsrEasuRH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,0));}
+//  AH4 FsrEasuGH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,1));}
+//  AH4 FsrEasuBH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,2));}
+//  ...
+//  The FsrEasuCon function needs to be called from the CPU or GPU to set up constants.
+//  The difference in viewport and input image size is there to support Dynamic Resolution Scaling.
+//  To use FsrEasuCon() on the CPU, define A_CPU before including ffx_a and ffx_fsr1.
+//  Including a GPU example here, the 'con0' through 'con3' values would be stored out to a constant buffer.
+//  AU4 con0,con1,con2,con3;
+//  FsrEasuCon(con0,con1,con2,con3,
+//    1920.0,1080.0,  // Viewport size (top left aligned) in the input image which is to be scaled.
+//    3840.0,2160.0,  // The size of the input image.
+//    2560.0,1440.0); // The output resolution.
+//==============================================================================================================================
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                      CONSTANT SETUP
+//==============================================================================================================================
+// Builds the four EASU constant vectors from the input/output geometry (works on CPU or GPU).
+A_STATIC void FsrEasuCon(
+outAU4 con0, // [0..1]: output-pixel -> viewport-pixel scale, [2..3]: matching offset. Floats are stored through AU1_AF1().
+outAU4 con1, // [0..1]: 1/inputSize, [2..3]: offset from upper-left of 'F' to gather center (0).
+outAU4 con2, // [0..1]: offset from (0) to gather center (1), [2..3]: offset from (0) to (2).
+outAU4 con3, // [0..1]: offset from (0) to gather center (3), [2..3]: set to 0.
+// This is the rendered image resolution being upscaled
+AF1 inputViewportInPixelsX,
+AF1 inputViewportInPixelsY,
+// This is the resolution of the resource containing the input image (useful for dynamic resolution)
+AF1 inputSizeInPixelsX,
+AF1 inputSizeInPixelsY,
+// This is the display resolution which the input image gets upscaled to
+AF1 outputSizeInPixelsX,
+AF1 outputSizeInPixelsY){
+ AF1 rcpOutX=ARcpF1(outputSizeInPixelsX),rcpOutY=ARcpF1(outputSizeInPixelsY); // Reciprocals computed once up front;
+ AF1 rcpInX=ARcpF1(inputSizeInPixelsX),rcpInY=ARcpF1(inputSizeInPixelsY);     // products below keep their operand order.
+ // Output integer position to a pixel position in viewport.
+ con0[0]=AU1_AF1(inputViewportInPixelsX*rcpOutX);
+ con0[1]=AU1_AF1(inputViewportInPixelsY*rcpOutY);
+ con0[2]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsX*rcpOutX-AF1_(0.5));
+ con0[3]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsY*rcpOutY-AF1_(0.5));
+ // Viewport pixel position to normalized image space; this is used to get upper-left of 'F' tap.
+ con1[0]=AU1_AF1(rcpInX);
+ con1[1]=AU1_AF1(rcpInY);
+ // Centers of gather4, first offset from upper-left of 'F'.
+ //      +---+---+
+ //      |   |   |
+ //      +--(0)--+
+ //      | b | c |
+ //  +---F---+---+---+
+ //  | e | f | g | h |
+ //  +--(1)--+--(2)--+
+ //  | i | j | k | l |
+ //  +---+---+---+---+
+ //      | n | o |
+ //      +--(3)--+
+ //      |   |   |
+ //      +---+---+
+ con1[2]=AU1_AF1(AF1_( 1.0)*rcpInX);
+ con1[3]=AU1_AF1(AF1_(-1.0)*rcpInY);
+ con2[0]=AU1_AF1(AF1_(-1.0)*rcpInX); // The con2 and con3 offsets are from (0) instead of 'F'.
+ con2[1]=AU1_AF1(AF1_( 2.0)*rcpInY);
+ con2[2]=AU1_AF1(AF1_( 1.0)*rcpInX);
+ con2[3]=AU1_AF1(AF1_( 2.0)*rcpInY);
+ con3[0]=AU1_AF1(AF1_( 0.0)*rcpInX);
+ con3[1]=AU1_AF1(AF1_( 4.0)*rcpInY);
+ con3[2]=con3[3]=0;}
+
+// Variant of FsrEasuCon() for an input image that sits at an offset inside its resource.
+A_STATIC void FsrEasuConOffset(
+    outAU4 con0, outAU4 con1, // Filled by FsrEasuCon(); con0[2..3] is then rewritten here
+    outAU4 con2, outAU4 con3, // with the input offset added.
+    // This is the rendered image resolution being upscaled
+    AF1 inputViewportInPixelsX,
+    AF1 inputViewportInPixelsY,
+    // This is the resolution of the resource containing the input image (useful for dynamic resolution)
+    AF1 inputSizeInPixelsX,
+    AF1 inputSizeInPixelsY,
+    // This is the display resolution which the input image gets upscaled to
+    AF1 outputSizeInPixelsX,
+    AF1 outputSizeInPixelsY,
+    // This is the input image offset into the resource containing it (useful for dynamic resolution)
+    AF1 inputOffsetInPixelsX,
+    AF1 inputOffsetInPixelsY) {
+    AF1 halfScaleX = AF1_(0.5) * inputViewportInPixelsX * ARcpF1(outputSizeInPixelsX); // Same leading term as con0[2] in FsrEasuCon().
+    AF1 halfScaleY = AF1_(0.5) * inputViewportInPixelsY * ARcpF1(outputSizeInPixelsY); // Same leading term as con0[3] in FsrEasuCon().
+    FsrEasuCon(con0, con1, con2, con3, inputViewportInPixelsX, inputViewportInPixelsY, inputSizeInPixelsX, inputSizeInPixelsY, outputSizeInPixelsX, outputSizeInPixelsY);
+    con0[2] = AU1_AF1(halfScaleX - AF1_(0.5) + inputOffsetInPixelsX);
+    con0[3] = AU1_AF1(halfScaleY - AF1_(0.5) + inputOffsetInPixelsY);
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                   NON-PACKED 32-BIT VERSION
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(FSR_EASU_F)
+ // Input callback prototypes, need to be implemented by the calling shader (each one gathers a single color channel at 'p').
+ AF4 FsrEasuRF(AF2 p); // Red channel.
+ AF4 FsrEasuGF(AF2 p); // Green channel.
+ AF4 FsrEasuBF(AF2 p); // Blue channel.
+//------------------------------------------------------------------------------------------------------------------------------
+ // Filters one tap for the scaler: computes its kernel weight and adds it to the running color/weight sums.
+ void FsrEasuTapF(
+ inout AF3 aC, // Accumulated color, with negative lobe.
+ inout AF1 aW, // Accumulated weight.
+ AF2 off, // Pixel offset from resolve position to tap.
+ AF2 dir, // Gradient direction.
+ AF2 len, // Length.
+ AF1 lob, // Negative lobe strength.
+ AF1 clp, // Clipping point.
+ AF3 c){ // Tap color.
+  // Rotate the tap offset by the direction.
+  AF2 rot=AF2(off.x*dir.x+off.y*dir.y,
+              off.x*(-dir.y)+off.y*dir.x);
+  // Anisotropy: scale each rotated axis by 'len'.
+  rot=rot*len;
+  // Squared distance of the rotated, scaled offset.
+  AF1 distSq=rot.x*rot.x+rot.y*rot.y;
+  // Limit to the window, because at a corner 2 taps can easily be outside.
+  distSq=min(distSq,clp);
+  // Approximation of lanczos2 without sin() or rcp(), or sqrt() to get x.
+  //  (25/16 * (2/5 * x^2 - 1)^2 - (25/16 - 1)) * (1/4 * x^2 - 1)^2
+  //  |_______________________________________|   |_______________|
+  //                   base                             window
+  // The general form of the 'base' is,
+  //  (a*(b*x^2-1)^2-(a-1))
+  // Where 'a=1/(2*b-b^2)' and 'b' moves around the negative lobe.
+  AF1 base=AF1_(2.0/5.0)*distSq+AF1_(-1.0);
+  AF1 window=lob*distSq+AF1_(-1.0);
+  base=base*base;
+  window=window*window;
+  base=AF1_(25.0/16.0)*base+AF1_(-(25.0/16.0-1.0));
+  AF1 weight=base*window;
+  // Do weighted average: accumulate the weight and the weighted tap color.
+  aW+=weight;
+  aC+=c*weight;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Accumulates one '+' neighborhood's bilinear-weighted contribution to the direction and length estimates.
+ void FsrEasuSetF(
+ inout AF2 dir, // Running direction sum.
+ inout AF1 len, // Running length sum.
+ AF2 pp, // Weights are built from pp.x / pp.y and their complements.
+ AP1 biS,AP1 biT,AP1 biU,AP1 biV, // Select which of the four bilinear corners (s,t,u,v) this call weights.
+ AF1 lA,AF1 lB,AF1 lC,AF1 lD,AF1 lE){ // Lumas of the '+' pattern: a above, b left, c center, d right, e below.
+  // Compute bilinear weight, branches factor out as predicates are compile-time immediates.
+  //  s t
+  //  u v
+  AF1 weight = AF1_(0.0);
+  if(biS) weight=(AF1_(1.0)-pp.x)*(AF1_(1.0)-pp.y);
+  if(biT) weight=           pp.x *(AF1_(1.0)-pp.y);
+  if(biU) weight=(AF1_(1.0)-pp.x)*           pp.y ;
+  if(biV) weight=           pp.x *           pp.y ;
+  // Direction is the '+' diff.
+  //    a
+  //  b c d
+  //    e
+  // Then takes magnitude from abs average of both sides of 'c'.
+  // Length converts gradient reversal to 0, smoothly to non-reversal at 1, shaped, then adding horz and vert terms.
+  AF1 gradRight=lD-lC;
+  AF1 gradLeft=lC-lB;
+  AF1 edgeX=max(abs(gradRight),abs(gradLeft));
+  edgeX=APrxLoRcpF1(edgeX);
+  AF1 spanX=lD-lB;
+  dir.x+=spanX*weight;
+  edgeX=ASatF1(abs(spanX)*edgeX);
+  edgeX=edgeX*edgeX;
+  len+=edgeX*weight;
+  // Same steps again for the y axis (a above, e below).
+  AF1 gradDown=lE-lC;
+  AF1 gradUp=lC-lA;
+  AF1 edgeY=max(abs(gradDown),abs(gradUp));
+  edgeY=APrxLoRcpF1(edgeY);
+  AF1 spanY=lE-lA;
+  dir.y+=spanY*weight;
+  edgeY=ASatF1(abs(spanY)*edgeY);
+  edgeY=edgeY*edgeY;
+  len+=edgeY*weight;}
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrEasuF(
+ out AF3 pix, // Filtered color computed for output pixel 'ip'.
+ AU2 ip, // Integer pixel position in output.
+ AU4 con0, // Constants generated by FsrEasuCon().
+ AU4 con1,
+ AU4 con2,
+ AU4 con3){
+  // Overview: gather a 12-tap neighborhood, estimate edge direction/length from luma, filter the taps, then dering.
+  // Get position of 'f'.
+  AF2 pp=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw);
+  AF2 fp=floor(pp);
+  pp-=fp; // pp now holds the fractional part of the position, fp the integer part.
+//------------------------------------------------------------------------------------------------------------------------------
+  // 12-tap kernel.
+  //    b c
+  //  e f g h
+  //  i j k l
+  //    n o
+  // Gather 4 ordering.
+  //  a b
+  //  r g
+  // For packed FP16, need either {rg} or {ab} so using the following setup for gather in all versions,
+  //    a b    <- unused (z)
+  //    r g
+  //  a b a b
+  //  r g r g
+  //    a b
+  //    r g    <- unused (z)
+  // Allowing dead-code removal to remove the 'z's.
+  AF2 p0=fp*AF2_AU2(con1.xy)+AF2_AU2(con1.zw);
+  // These are from p0 to avoid pulling two constants on pre-Navi hardware.
+  AF2 p1=p0+AF2_AU2(con2.xy);
+  AF2 p2=p0+AF2_AU2(con2.zw);
+  AF2 p3=p0+AF2_AU2(con3.xy);
+  AF4 bczzR=FsrEasuRF(p0);
+  AF4 bczzG=FsrEasuGF(p0);
+  AF4 bczzB=FsrEasuBF(p0);
+  AF4 ijfeR=FsrEasuRF(p1);
+  AF4 ijfeG=FsrEasuGF(p1);
+  AF4 ijfeB=FsrEasuBF(p1);
+  AF4 klhgR=FsrEasuRF(p2);
+  AF4 klhgG=FsrEasuGF(p2);
+  AF4 klhgB=FsrEasuBF(p2);
+  AF4 zzonR=FsrEasuRF(p3);
+  AF4 zzonG=FsrEasuGF(p3);
+  AF4 zzonB=FsrEasuBF(p3);
+//------------------------------------------------------------------------------------------------------------------------------
+  // Simplest multi-channel approximate luma possible (luma times 2, in 2 FMA/MAD).
+  AF4 bczzL=bczzB*AF4_(0.5)+(bczzR*AF4_(0.5)+bczzG);
+  AF4 ijfeL=ijfeB*AF4_(0.5)+(ijfeR*AF4_(0.5)+ijfeG);
+  AF4 klhgL=klhgB*AF4_(0.5)+(klhgR*AF4_(0.5)+klhgG);
+  AF4 zzonL=zzonB*AF4_(0.5)+(zzonR*AF4_(0.5)+zzonG);
+  // Rename: one scalar luma per tap letter, following the gather ordering above.
+  AF1 bL=bczzL.x;
+  AF1 cL=bczzL.y;
+  AF1 iL=ijfeL.x;
+  AF1 jL=ijfeL.y;
+  AF1 fL=ijfeL.z;
+  AF1 eL=ijfeL.w;
+  AF1 kL=klhgL.x;
+  AF1 lL=klhgL.y;
+  AF1 hL=klhgL.z;
+  AF1 gL=klhgL.w;
+  AF1 oL=zzonL.z;
+  AF1 nL=zzonL.w;
+  // Accumulate for bilinear interpolation: one FsrEasuSetF() call per '+' centered on f, g, j and k.
+  AF2 dir=AF2_(0.0);
+  AF1 len=AF1_(0.0);
+  FsrEasuSetF(dir,len,pp,true, false,false,false,bL,eL,fL,gL,jL);
+  FsrEasuSetF(dir,len,pp,false,true ,false,false,cL,fL,gL,hL,kL);
+  FsrEasuSetF(dir,len,pp,false,false,true ,false,fL,iL,jL,kL,nL);
+  FsrEasuSetF(dir,len,pp,false,false,false,true ,gL,jL,kL,lL,oL);
+//------------------------------------------------------------------------------------------------------------------------------
+  // Normalize with approximation, and cleanup close to zero.
+  AF2 dir2=dir*dir;
+  AF1 dirR=dir2.x+dir2.y;
+  AP1 zro=dirR<AF1_(1.0/32768.0);
+  dirR=APrxLoRsqF1(dirR);
+  dirR=zro?AF1_(1.0):dirR;
+  dir.x=zro?AF1_(1.0):dir.x;
+  dir*=AF2_(dirR);
+  // Transform from {0 to 2} to {0 to 1} range, and shape with square.
+  len=len*AF1_(0.5);
+  len*=len;
+  // Stretch kernel {1.0 vert|horz, to sqrt(2.0) on diagonal}.
+  AF1 stretch=(dir.x*dir.x+dir.y*dir.y)*APrxLoRcpF1(max(abs(dir.x),abs(dir.y)));
+  // Anisotropic length after rotation,
+  //  x := 1.0 lerp to 'stretch' on edges
+  //  y := 1.0 lerp to 2x on edges
+  AF2 len2=AF2(AF1_(1.0)+(stretch-AF1_(1.0))*len,AF1_(1.0)+AF1_(-0.5)*len);
+  // Based on the amount of 'edge',
+  // the window shifts from +/-{sqrt(2.0) to slightly beyond 2.0}.
+  AF1 lob=AF1_(0.5)+AF1_((1.0/4.0-0.04)-0.5)*len;
+  // Set distance^2 clipping point to the end of the adjustable window.
+  AF1 clp=APrxLoRcpF1(lob);
+//------------------------------------------------------------------------------------------------------------------------------
+  // Accumulation mixed with min/max of 4 nearest (f, g, j, k).
+  //    b c
+  //  e f g h
+  //  i j k l
+  //    n o
+  AF3 min4=min(AMin3F3(AF3(ijfeR.z,ijfeG.z,ijfeB.z),AF3(klhgR.w,klhgG.w,klhgB.w),AF3(ijfeR.y,ijfeG.y,ijfeB.y)),
+               AF3(klhgR.x,klhgG.x,klhgB.x));
+  AF3 max4=max(AMax3F3(AF3(ijfeR.z,ijfeG.z,ijfeB.z),AF3(klhgR.w,klhgG.w,klhgB.w),AF3(ijfeR.y,ijfeG.y,ijfeB.y)),
+               AF3(klhgR.x,klhgG.x,klhgB.x));
+  // Accumulation: each tap passes its offset relative to 'f' minus the fractional position.
+  AF3 aC=AF3_(0.0);
+  AF1 aW=AF1_(0.0);
+  FsrEasuTapF(aC,aW,AF2( 0.0,-1.0)-pp,dir,len2,lob,clp,AF3(bczzR.x,bczzG.x,bczzB.x)); // b
+  FsrEasuTapF(aC,aW,AF2( 1.0,-1.0)-pp,dir,len2,lob,clp,AF3(bczzR.y,bczzG.y,bczzB.y)); // c
+  FsrEasuTapF(aC,aW,AF2(-1.0, 1.0)-pp,dir,len2,lob,clp,AF3(ijfeR.x,ijfeG.x,ijfeB.x)); // i
+  FsrEasuTapF(aC,aW,AF2( 0.0, 1.0)-pp,dir,len2,lob,clp,AF3(ijfeR.y,ijfeG.y,ijfeB.y)); // j
+  FsrEasuTapF(aC,aW,AF2( 0.0, 0.0)-pp,dir,len2,lob,clp,AF3(ijfeR.z,ijfeG.z,ijfeB.z)); // f
+  FsrEasuTapF(aC,aW,AF2(-1.0, 0.0)-pp,dir,len2,lob,clp,AF3(ijfeR.w,ijfeG.w,ijfeB.w)); // e
+  FsrEasuTapF(aC,aW,AF2( 1.0, 1.0)-pp,dir,len2,lob,clp,AF3(klhgR.x,klhgG.x,klhgB.x)); // k
+  FsrEasuTapF(aC,aW,AF2( 2.0, 1.0)-pp,dir,len2,lob,clp,AF3(klhgR.y,klhgG.y,klhgB.y)); // l
+  FsrEasuTapF(aC,aW,AF2( 2.0, 0.0)-pp,dir,len2,lob,clp,AF3(klhgR.z,klhgG.z,klhgB.z)); // h
+  FsrEasuTapF(aC,aW,AF2( 1.0, 0.0)-pp,dir,len2,lob,clp,AF3(klhgR.w,klhgG.w,klhgB.w)); // g
+  FsrEasuTapF(aC,aW,AF2( 1.0, 2.0)-pp,dir,len2,lob,clp,AF3(zzonR.z,zzonG.z,zzonB.z)); // o
+  FsrEasuTapF(aC,aW,AF2( 0.0, 2.0)-pp,dir,len2,lob,clp,AF3(zzonR.w,zzonG.w,zzonB.w)); // n
+//------------------------------------------------------------------------------------------------------------------------------
+  // Normalize and dering: divide by the accumulated weight, then clamp into [min4, max4].
+  pix=min(max4,max(min4,aC*AF3_(ARcpF1(aW))));}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                    PACKED 16-BIT VERSION
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_EASU_H)
+// Input callback prototypes, need to be implemented by calling shader
+ AH4 FsrEasuRH(AF2 p);
+ AH4 FsrEasuGH(AF2 p);
+ AH4 FsrEasuBH(AF2 p);
+//------------------------------------------------------------------------------------------------------------------------------
+ // Accumulates two EASU filter taps at once: each AH2 lane carries one tap (this runs 2 taps in parallel).
+ void FsrEasuTapH(
+ inout AH2 aCR,inout AH2 aCG,inout AH2 aCB, // Per-channel weighted color accumulators (one partial sum per lane).
+ inout AH2 aW, // Weight accumulator (one partial sum per lane).
+ AH2 offX,AH2 offY, // X and Y offsets of the two taps from the fractional sample position (see the calls in FsrEasuH).
+ AH2 dir, // Direction used to rotate the offsets.
+ AH2 len, // Per-axis scale applied to the rotated offsets.
+ AH1 lob, // Coefficient of the squared distance in the 'wA' term.
+ AH1 clp, // Upper clamp for the squared distance.
+ AH2 cR,AH2 cG,AH2 cB){ // Red, green and blue values of the two taps.
+  AH2 vX,vY;
+  vX=offX*  dir.xx +offY*dir.yy; // Rotate the offsets by 'dir' ...
+  vY=offX*(-dir.yy)+offY*dir.xx;
+  vX*=len.x;vY*=len.y; // ... then scale each axis by 'len'.
+  AH2 d2=vX*vX+vY*vY; // Squared distance of each tap.
+  d2=min(d2,AH2_(clp)); // Clamp the squared distance to 'clp'.
+  AH2 wB=AH2_(2.0/5.0)*d2+AH2_(-1.0); // wB=2/5*d2-1
+  AH2 wA=AH2_(lob)*d2+AH2_(-1.0); // wA=lob*d2-1
+  wB*=wB; // Square both terms.
+  wA*=wA;
+  wB=AH2_(25.0/16.0)*wB+AH2_(-(25.0/16.0-1.0)); // wB=25/16*wB-(25/16-1)
+  AH2 w=wB*wA; // Tap weight is the product of the two terms.
+  aCR+=cR*w;aCG+=cG*w;aCB+=cB*w;aW+=w;} // Accumulate weighted color and total weight.
+//------------------------------------------------------------------------------------------------------------------------------
+ // Accumulates weighted luma gradients and edge-length terms for two cross-shaped neighborhoods at once (this runs 2 taps in parallel).
+ void FsrEasuSetH(
+ inout AH2 dirPX,inout AH2 dirPY, // Accumulated weighted horizontal/vertical luma differences (one partial sum per lane).
+ inout AH2 lenP, // Accumulated weighted length terms (one partial sum per lane).
+ AH2 pp, // Fractional sample position, used to build the bilinear weights.
+ AP1 biST,AP1 biUV, // Weight selection: biST scales by (1-pp.y), biUV scales by pp.y; FsrEasuH sets exactly one per call.
+ AH2 lA,AH2 lB,AH2 lC,AH2 lD,AH2 lE){ // Lumas per lane: B and D are differenced against center C for X, A and E for Y.
+  AH2 w = AH2_(0.0);
+  if(biST)w=(AH2(1.0,0.0)+AH2(-pp.x,pp.x))*AH2_(AH1_(1.0)-pp.y); // w=(1-pp.x,pp.x)*(1-pp.y)
+  if(biUV)w=(AH2(1.0,0.0)+AH2(-pp.x,pp.x))*AH2_(          pp.y); // w=(1-pp.x,pp.x)*pp.y
+  // ABS is not free in the packed FP16 path.
+  AH2 dc=lD-lC; // One-sided differences on either side of the center.
+  AH2 cb=lC-lB;
+  AH2 lenX=max(abs(dc),abs(cb)); // Larger magnitude of the two one-sided differences.
+  lenX=ARcpH2(lenX); // Presumably the reciprocal helper from ffx_a.h (not visible here).
+  AH2 dirX=lD-lB; // Difference across the center.
+  dirPX+=dirX*w;
+  lenX=ASatH2(abs(dirX)*lenX); // |dirX| times the value above, passed through ASatH2 (presumably a {0 to 1} saturate).
+  lenX*=lenX; // Squared before accumulation.
+  lenP+=lenX*w;
+  AH2 ec=lE-lC; // Same sequence again using lA, lC and lE.
+  AH2 ca=lC-lA;
+  AH2 lenY=max(abs(ec),abs(ca));
+  lenY=ARcpH2(lenY);
+  AH2 dirY=lE-lA;
+  dirPY+=dirY*w;
+  lenY=ASatH2(abs(dirY)*lenY);
+  lenY*=lenY;
+  lenP+=lenY*w;} // Both axes add into the same 'lenP' accumulator.
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrEasuH( // Packed 16-bit EASU: filters one output pixel from 12 input taps (b,c,e,f,g,h,i,j,k,l,n,o).
+ out AH3 pix, // Resulting RGB value.
+ AU2 ip, // Integer pixel position that is mapped into the input through con0.
+ AU4 con0, // Read through AF2_AU2(): .xy scales and .zw offsets 'ip' to get the sample position.
+ AU4 con1, // .xy scales and .zw offsets the integer sample position to get the first fetch coordinate p0.
+ AU4 con2, // .xy and .zw are added to p0 to get the fetch coordinates p1 and p2.
+ AU4 con3){ // .xy is added to p0 to get the fetch coordinate p3 (.zw is not read here).
+//------------------------------------------------------------------------------------------------------------------------------
+  AF2 pp=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw);
+  AF2 fp=floor(pp); // Integer part of the sample position ...
+  pp-=fp; // ... and the fractional part.
+  AH2 ppp=AH2(pp); // Fraction converted to AH2 for the packed math below.
+//------------------------------------------------------------------------------------------------------------------------------
+  AF2 p0=fp*AF2_AU2(con1.xy)+AF2_AU2(con1.zw);
+  AF2 p1=p0+AF2_AU2(con2.xy);
+  AF2 p2=p0+AF2_AU2(con2.zw);
+  AF2 p3=p0+AF2_AU2(con3.xy);
+  AH4 bczzR=FsrEasuRH(p0); // Each callback returns four values of one channel; the variable name lists the tap held by each lane.
+  AH4 bczzG=FsrEasuGH(p0); // Lanes named 'z' are never read in this function.
+  AH4 bczzB=FsrEasuBH(p0);
+  AH4 ijfeR=FsrEasuRH(p1);
+  AH4 ijfeG=FsrEasuGH(p1);
+  AH4 ijfeB=FsrEasuBH(p1);
+  AH4 klhgR=FsrEasuRH(p2);
+  AH4 klhgG=FsrEasuGH(p2);
+  AH4 klhgB=FsrEasuBH(p2);
+  AH4 zzonR=FsrEasuRH(p3);
+  AH4 zzonG=FsrEasuGH(p3);
+  AH4 zzonB=FsrEasuBH(p3);
+//------------------------------------------------------------------------------------------------------------------------------
+  AH4 bczzL=bczzB*AH4_(0.5)+(bczzR*AH4_(0.5)+bczzG); // Luma proxy per tap: 0.5*R+G+0.5*B.
+  AH4 ijfeL=ijfeB*AH4_(0.5)+(ijfeR*AH4_(0.5)+ijfeG);
+  AH4 klhgL=klhgB*AH4_(0.5)+(klhgR*AH4_(0.5)+klhgG);
+  AH4 zzonL=zzonB*AH4_(0.5)+(zzonR*AH4_(0.5)+zzonG);
+  AH1 bL=bczzL.x; // Rename the luma lanes to their tap letters.
+  AH1 cL=bczzL.y;
+  AH1 iL=ijfeL.x;
+  AH1 jL=ijfeL.y;
+  AH1 fL=ijfeL.z;
+  AH1 eL=ijfeL.w;
+  AH1 kL=klhgL.x;
+  AH1 lL=klhgL.y;
+  AH1 hL=klhgL.z;
+  AH1 gL=klhgL.w;
+  AH1 oL=zzonL.z;
+  AH1 nL=zzonL.w;
+  // This part is different, accumulating 2 taps in parallel.
+  AH2 dirPX=AH2_(0.0);
+  AH2 dirPY=AH2_(0.0);
+  AH2 lenP=AH2_(0.0);
+  FsrEasuSetH(dirPX,dirPY,lenP,ppp,true, false,AH2(bL,cL),AH2(eL,fL),AH2(fL,gL),AH2(gL,hL),AH2(jL,kL)); // Centers f and g, weights scaled by (1-pp.y).
+  FsrEasuSetH(dirPX,dirPY,lenP,ppp,false,true ,AH2(fL,gL),AH2(iL,jL),AH2(jL,kL),AH2(kL,lL),AH2(nL,oL)); // Centers j and k, weights scaled by pp.y.
+  AH2 dir=AH2(dirPX.r+dirPX.g,dirPY.r+dirPY.g); // Sum the two lanes to finish each accumulation.
+  AH1 len=lenP.r+lenP.g;
+//------------------------------------------------------------------------------------------------------------------------------
+  AH2 dir2=dir*dir;
+  AH1 dirR=dir2.x+dir2.y; // Squared length of 'dir'.
+  AP1 zro=dirR<AH1_(1.0/32768.0); // True when the squared length is below 1/32768.
+  dirR=APrxLoRsqH1(dirR); // Presumably an approximate reciprocal square root (helper not visible here).
+  dirR=zro?AH1_(1.0):dirR; // When 'zro', use a scale of 1 and force dir.x to 1 instead.
+  dir.x=zro?AH1_(1.0):dir.x;
+  dir*=AH2_(dirR);
+  len=len*AH1_(0.5);
+  len*=len; // len=(len/2)^2
+  AH1 stretch=(dir.x*dir.x+dir.y*dir.y)*APrxLoRcpH1(max(abs(dir.x),abs(dir.y))); // (x^2+y^2) times APrxLoRcpH1(max(|x|,|y|)).
+  AH2 len2=AH2(AH1_(1.0)+(stretch-AH1_(1.0))*len,AH1_(1.0)+AH1_(-0.5)*len); // x: 1 at len=0, 'stretch' at len=1; y: 1 at len=0, 0.5 at len=1.
+  AH1 lob=AH1_(0.5)+AH1_((1.0/4.0-0.04)-0.5)*len; // 0.5 at len=0, 0.21 at len=1.
+  AH1 clp=APrxLoRcpH1(lob); // Passed to FsrEasuTapH as the clamp on squared distance.
+//------------------------------------------------------------------------------------------------------------------------------
+  // FP16 is different, using packed trick to do min and max in same operation.
+  AH2 bothR=max(max(AH2(-ijfeR.z,ijfeR.z),AH2(-klhgR.w,klhgR.w)),max(AH2(-ijfeR.y,ijfeR.y),AH2(-klhgR.x,klhgR.x))); // Over taps f,g,j,k: lane x = -min, lane y = max.
+  AH2 bothG=max(max(AH2(-ijfeG.z,ijfeG.z),AH2(-klhgG.w,klhgG.w)),max(AH2(-ijfeG.y,ijfeG.y),AH2(-klhgG.x,klhgG.x)));
+  AH2 bothB=max(max(AH2(-ijfeB.z,ijfeB.z),AH2(-klhgB.w,klhgB.w)),max(AH2(-ijfeB.y,ijfeB.y),AH2(-klhgB.x,klhgB.x)));
+  // This part is different for FP16, working pairs of taps at a time.
+  AH2 pR=AH2_(0.0);
+  AH2 pG=AH2_(0.0);
+  AH2 pB=AH2_(0.0);
+  AH2 pW=AH2_(0.0);
+  FsrEasuTapH(pR,pG,pB,pW,AH2( 0.0, 1.0)-ppp.xx,AH2(-1.0,-1.0)-ppp.yy,dir,len2,lob,clp,bczzR.xy,bczzG.xy,bczzB.xy); // b,c
+  FsrEasuTapH(pR,pG,pB,pW,AH2(-1.0, 0.0)-ppp.xx,AH2( 1.0, 1.0)-ppp.yy,dir,len2,lob,clp,ijfeR.xy,ijfeG.xy,ijfeB.xy); // i,j
+  FsrEasuTapH(pR,pG,pB,pW,AH2( 0.0,-1.0)-ppp.xx,AH2( 0.0, 0.0)-ppp.yy,dir,len2,lob,clp,ijfeR.zw,ijfeG.zw,ijfeB.zw); // f,e
+  FsrEasuTapH(pR,pG,pB,pW,AH2( 1.0, 2.0)-ppp.xx,AH2( 1.0, 1.0)-ppp.yy,dir,len2,lob,clp,klhgR.xy,klhgG.xy,klhgB.xy); // k,l
+  FsrEasuTapH(pR,pG,pB,pW,AH2( 2.0, 1.0)-ppp.xx,AH2( 0.0, 0.0)-ppp.yy,dir,len2,lob,clp,klhgR.zw,klhgG.zw,klhgB.zw); // h,g
+  FsrEasuTapH(pR,pG,pB,pW,AH2( 1.0, 0.0)-ppp.xx,AH2( 2.0, 2.0)-ppp.yy,dir,len2,lob,clp,zzonR.zw,zzonG.zw,zzonB.zw); // o,n
+  AH3 aC=AH3(pR.x+pR.y,pG.x+pG.y,pB.x+pB.y); // Sum the two lanes of each accumulator.
+  AH1 aW=pW.x+pW.y;
+//------------------------------------------------------------------------------------------------------------------------------
+  // Slightly different for FP16 version due to combined min and max: normalize by 'aW', then clamp to the min/max of taps f,g,j,k.
+  pix=min(AH3(bothR.y,bothG.y,bothB.y),max(-AH3(bothR.x,bothG.x,bothB.x),aC*AH3_(ARcpH1(aW))));}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                      FSR - [RCAS] ROBUST CONTRAST ADAPTIVE SHARPENING
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// CAS uses a simplified mechanism to convert local contrast into a variable amount of sharpness.
+// RCAS uses a more exact mechanism, solving for the maximum local sharpness possible before clipping.
+// RCAS also has a built in process to limit sharpening of what it detects as possible noise.
+// The RCAS sharpener does not support scaling, as it should be applied after EASU scaling.
+// Pass EASU output straight into RCAS, no color conversions necessary.
+//------------------------------------------------------------------------------------------------------------------------------
+// RCAS is based on the following logic.
+// RCAS uses a 5 tap filter in a cross pattern (same as CAS),
+//    w                n
+//  w 1 w  for taps  w m e 
+//    w                s
+// Where 'w' is the negative lobe weight.
+//  output = (w*(n+e+w+s)+m)/(4*w+1)
+// RCAS solves for 'w' by seeing where the signal might clip out of the {0 to 1} input range,
+//  0 == (w*(n+e+w+s)+m)/(4*w+1) -> w = -m/(n+e+w+s)
+//  1 == (w*(n+e+w+s)+m)/(4*w+1) -> w = (1-m)/(n+e+w+s-4*1)
+// Then chooses the 'w' which results in no clipping, limits 'w', and multiplies by the 'sharp' amount.
+// This solution above has issues with MSAA input as the steps along the gradient cause edge detection issues.
+// So RCAS uses 4x the maximum and 4x the minimum (depending on equation) in place of the individual taps.
+// As well as switching from 'm' to either the minimum or maximum (depending on side), to help in energy conservation.
+// This stabilizes RCAS.
+// RCAS does a simple highpass which is normalized against the local contrast then shaped,
+//       0.25
+//  0.25  -1  0.25
+//       0.25
+// This is used as a noise detection filter, to reduce the effect of RCAS on grain, and focus on real edges.
+//
+//  GLSL example for the required callbacks :
+// 
+//  AH4 FsrRcasLoadH(ASW2 p){return AH4(imageLoad(imgSrc,ASU2(p)));}
+//  void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b)
+//  {
+//    //do any simple input color conversions here or leave empty if none needed
+//  }
+//  
+//  FsrRcasCon needs to be called from the CPU or GPU to set up constants.
+//  Including a GPU example here, the 'con' value would be stored out to a constant buffer.
+// 
+//  AU4 con;
+//  FsrRcasCon(con,
+//   0.0); // The scale is {0.0 := maximum sharpness, to N>0, where N is the number of stops (halving) of the reduction of sharpness}.
+// ---------------
+// RCAS sharpening supports a CAS-like pass-through alpha via,
+//  #define FSR_RCAS_PASSTHROUGH_ALPHA 1
+// RCAS also supports a define to enable a more expensive path to avoid some sharpening of noise.
+// Would suggest it is better to apply film grain after RCAS sharpening (and after scaling) instead of using this define,
+//  #define FSR_RCAS_DENOISE 1
+//==============================================================================================================================
+// This is set at the limit of providing unnatural results for sharpening.
+#define FSR_RCAS_LIMIT (0.25-(1.0/16.0))
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                      CONSTANT SETUP
+//==============================================================================================================================
+// Call to setup required constant values (works on CPU or GPU); fills 'con' for the FsrRcas*() filters.
+A_STATIC void FsrRcasCon(
+outAU4 con, // Output: the four constant words written below.
+// The scale is {0.0 := maximum, to N>0, where N is the number of stops (halving) of the reduction of sharpness}.
+AF1 sharpness){
+ // Transform from stops to linear value.
+ sharpness=AExp2F1(-sharpness);
+ varAF2(hSharp)=initAF2(sharpness,sharpness); // Same value twice, as input to the packed conversion below.
+ con[0]=AU1_AF1(sharpness); // Read back with AF1_AU1(con.x) in FsrRcasF().
+ con[1]=AU1_AH2_AF2(hSharp); // Read back with AH2_AU1(con.y).x in FsrRcasH() and FsrRcasHx2().
+ con[2]=0; // The remaining two words are set to zero.
+ con[3]=0;}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                   NON-PACKED 32-BIT VERSION
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(FSR_RCAS_F)
+ // Input callback prototypes that need to be implemented by calling shader
+ AF4 FsrRcasLoadF(ASU2 p);
+ void FsrRcasInputF(inout AF1 r,inout AF1 g,inout AF1 b);
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrRcasF( // 32-bit RCAS: sharpens one pixel from a 5-tap cross (b,d,e,f,h) fetched through FsrRcasLoadF().
+ out AF1 pixR, // Output values, non-vector so port between FsrRcasF() and FsrRcasH() is easy.
+ out AF1 pixG,
+ out AF1 pixB,
+ #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
+  out AF1 pixA, // Alpha of the center tap, copied through unmodified.
+ #endif
+ AU2 ip, // Integer pixel position in output.
+ AU4 con){ // Constant generated by FsrRcasCon(); only con.x is read here.
+  // Algorithm uses minimal 3x3 pixel neighborhood.
+  //    b
+  //  d e f
+  //    h
+  ASU2 sp=ASU2(ip);
+  AF3 b=FsrRcasLoadF(sp+ASU2( 0,-1)).rgb;
+  AF3 d=FsrRcasLoadF(sp+ASU2(-1, 0)).rgb;
+  #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
+   AF4 ee=FsrRcasLoadF(sp);
+   AF3 e=ee.rgb;pixA=ee.a;
+  #else
+   AF3 e=FsrRcasLoadF(sp).rgb;
+  #endif
+  AF3 f=FsrRcasLoadF(sp+ASU2( 1, 0)).rgb;
+  AF3 h=FsrRcasLoadF(sp+ASU2( 0, 1)).rgb;
+  // Rename (32-bit) or regroup (16-bit).
+  AF1 bR=b.r;
+  AF1 bG=b.g;
+  AF1 bB=b.b;
+  AF1 dR=d.r;
+  AF1 dG=d.g;
+  AF1 dB=d.b;
+  AF1 eR=e.r;
+  AF1 eG=e.g;
+  AF1 eB=e.b;
+  AF1 fR=f.r;
+  AF1 fG=f.g;
+  AF1 fB=f.b;
+  AF1 hR=h.r;
+  AF1 hG=h.g;
+  AF1 hB=h.b;
+  // Run optional input transform.
+  FsrRcasInputF(bR,bG,bB);
+  FsrRcasInputF(dR,dG,dB);
+  FsrRcasInputF(eR,eG,eB);
+  FsrRcasInputF(fR,fG,fB);
+  FsrRcasInputF(hR,hG,hB);
+  // Luma times 2.
+  AF1 bL=bB*AF1_(0.5)+(bR*AF1_(0.5)+bG);
+  AF1 dL=dB*AF1_(0.5)+(dR*AF1_(0.5)+dG);
+  AF1 eL=eB*AF1_(0.5)+(eR*AF1_(0.5)+eG);
+  AF1 fL=fB*AF1_(0.5)+(fR*AF1_(0.5)+fG);
+  AF1 hL=hB*AF1_(0.5)+(hR*AF1_(0.5)+hG);
+  // Noise detection (the result 'nz' is only applied when FSR_RCAS_DENOISE is defined).
+  AF1 nz=AF1_(0.25)*bL+AF1_(0.25)*dL+AF1_(0.25)*fL+AF1_(0.25)*hL-eL; // Average of the four outer lumas minus the center luma.
+  nz=ASatF1(abs(nz)*APrxMedRcpF1(AMax3F1(AMax3F1(bL,dL,eL),fL,hL)-AMin3F1(AMin3F1(bL,dL,eL),fL,hL))); // Normalized by the luma range (max-min) of all five taps.
+  nz=AF1_(-0.5)*nz+AF1_(1.0); // nz=1-0.5*nz
+  // Min and max of ring (the four outer taps b,d,f,h; the center e is not included).
+  AF1 mn4R=min(AMin3F1(bR,dR,fR),hR);
+  AF1 mn4G=min(AMin3F1(bG,dG,fG),hG);
+  AF1 mn4B=min(AMin3F1(bB,dB,fB),hB);
+  AF1 mx4R=max(AMax3F1(bR,dR,fR),hR);
+  AF1 mx4G=max(AMax3F1(bG,dG,fG),hG);
+  AF1 mx4B=max(AMax3F1(bB,dB,fB),hB);
+  // Immediate constants for peak range.
+  AF2 peakC=AF2(1.0,-1.0*4.0);
+  // Limiters, these need to be high precision RCPs.
+  AF1 hitMinR=min(mn4R,eR)*ARcpF1(AF1_(4.0)*mx4R); // hitMin=min(mn4,e)/(4*mx4)
+  AF1 hitMinG=min(mn4G,eG)*ARcpF1(AF1_(4.0)*mx4G);
+  AF1 hitMinB=min(mn4B,eB)*ARcpF1(AF1_(4.0)*mx4B);
+  AF1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpF1(AF1_(4.0)*mn4R+peakC.y); // hitMax=(1-max(mx4,e))/(4*mn4-4)
+  AF1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpF1(AF1_(4.0)*mn4G+peakC.y);
+  AF1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpF1(AF1_(4.0)*mn4B+peakC.y);
+  AF1 lobeR=max(-hitMinR,hitMaxR); // Per channel, take the larger of the two limits.
+  AF1 lobeG=max(-hitMinG,hitMaxG);
+  AF1 lobeB=max(-hitMinB,hitMaxB);
+  AF1 lobe=max(AF1_(-FSR_RCAS_LIMIT),min(AMax3F1(lobeR,lobeG,lobeB),AF1_(0.0)))*AF1_AU1(con.x); // Largest channel lobe, kept within {-FSR_RCAS_LIMIT to 0}, times the sharpness from con.x.
+  // Apply noise removal.
+  #ifdef FSR_RCAS_DENOISE
+   lobe*=nz;
+  #endif
+  // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
+  AF1 rcpL=APrxMedRcpF1(AF1_(4.0)*lobe+AF1_(1.0)); // 1/(4*lobe+1)
+  pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; // (lobe*(b+d+h+f)+e)/(4*lobe+1)
+  pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL;
+  pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;
+  return;} 
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                  NON-PACKED 16-BIT VERSION
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_H)
+ // Input callback prototypes that need to be implemented by calling shader
+ AH4 FsrRcasLoadH(ASW2 p);
+ void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b);
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrRcasH( // 16-bit (non-packed) RCAS: sharpens one pixel from a 5-tap cross (b,d,e,f,h) fetched through FsrRcasLoadH().
+ out AH1 pixR, // Output values, non-vector so port between FsrRcasF() and FsrRcasH() is easy.
+ out AH1 pixG,
+ out AH1 pixB,
+ #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
+  out AH1 pixA, // Alpha of the center tap, copied through unmodified.
+ #endif
+ AU2 ip, // Integer pixel position in output.
+ AU4 con){ // Constant generated by FsrRcasCon(); only con.y is read here.
+  // Sharpening algorithm uses minimal 3x3 pixel neighborhood.
+  //    b
+  //  d e f
+  //    h
+  ASW2 sp=ASW2(ip);
+  AH3 b=FsrRcasLoadH(sp+ASW2( 0,-1)).rgb;
+  AH3 d=FsrRcasLoadH(sp+ASW2(-1, 0)).rgb;
+  #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
+   AH4 ee=FsrRcasLoadH(sp);
+   AH3 e=ee.rgb;pixA=ee.a;
+  #else
+   AH3 e=FsrRcasLoadH(sp).rgb;
+  #endif
+  AH3 f=FsrRcasLoadH(sp+ASW2( 1, 0)).rgb;
+  AH3 h=FsrRcasLoadH(sp+ASW2( 0, 1)).rgb;
+  // Rename (32-bit) or regroup (16-bit).
+  AH1 bR=b.r;
+  AH1 bG=b.g;
+  AH1 bB=b.b;
+  AH1 dR=d.r;
+  AH1 dG=d.g;
+  AH1 dB=d.b;
+  AH1 eR=e.r;
+  AH1 eG=e.g;
+  AH1 eB=e.b;
+  AH1 fR=f.r;
+  AH1 fG=f.g;
+  AH1 fB=f.b;
+  AH1 hR=h.r;
+  AH1 hG=h.g;
+  AH1 hB=h.b;
+  // Run optional input transform.
+  FsrRcasInputH(bR,bG,bB);
+  FsrRcasInputH(dR,dG,dB);
+  FsrRcasInputH(eR,eG,eB);
+  FsrRcasInputH(fR,fG,fB);
+  FsrRcasInputH(hR,hG,hB);
+  // Luma times 2.
+  AH1 bL=bB*AH1_(0.5)+(bR*AH1_(0.5)+bG);
+  AH1 dL=dB*AH1_(0.5)+(dR*AH1_(0.5)+dG);
+  AH1 eL=eB*AH1_(0.5)+(eR*AH1_(0.5)+eG);
+  AH1 fL=fB*AH1_(0.5)+(fR*AH1_(0.5)+fG);
+  AH1 hL=hB*AH1_(0.5)+(hR*AH1_(0.5)+hG);
+  // Noise detection (the result 'nz' is only applied when FSR_RCAS_DENOISE is defined).
+  AH1 nz=AH1_(0.25)*bL+AH1_(0.25)*dL+AH1_(0.25)*fL+AH1_(0.25)*hL-eL; // Average of the four outer lumas minus the center luma.
+  nz=ASatH1(abs(nz)*APrxMedRcpH1(AMax3H1(AMax3H1(bL,dL,eL),fL,hL)-AMin3H1(AMin3H1(bL,dL,eL),fL,hL))); // Normalized by the luma range (max-min) of all five taps.
+  nz=AH1_(-0.5)*nz+AH1_(1.0); // nz=1-0.5*nz
+  // Min and max of ring (the four outer taps b,d,f,h; the center e is not included).
+  AH1 mn4R=min(AMin3H1(bR,dR,fR),hR);
+  AH1 mn4G=min(AMin3H1(bG,dG,fG),hG);
+  AH1 mn4B=min(AMin3H1(bB,dB,fB),hB);
+  AH1 mx4R=max(AMax3H1(bR,dR,fR),hR);
+  AH1 mx4G=max(AMax3H1(bG,dG,fG),hG);
+  AH1 mx4B=max(AMax3H1(bB,dB,fB),hB);
+  // Immediate constants for peak range.
+  AH2 peakC=AH2(1.0,-1.0*4.0);
+  // Limiters, these need to be high precision RCPs.
+  AH1 hitMinR=min(mn4R,eR)*ARcpH1(AH1_(4.0)*mx4R); // hitMin=min(mn4,e)/(4*mx4)
+  AH1 hitMinG=min(mn4G,eG)*ARcpH1(AH1_(4.0)*mx4G);
+  AH1 hitMinB=min(mn4B,eB)*ARcpH1(AH1_(4.0)*mx4B);
+  AH1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH1(AH1_(4.0)*mn4R+peakC.y); // hitMax=(1-max(mx4,e))/(4*mn4-4)
+  AH1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH1(AH1_(4.0)*mn4G+peakC.y);
+  AH1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH1(AH1_(4.0)*mn4B+peakC.y);
+  AH1 lobeR=max(-hitMinR,hitMaxR); // Per channel, take the larger of the two limits.
+  AH1 lobeG=max(-hitMinG,hitMaxG);
+  AH1 lobeB=max(-hitMinB,hitMaxB);
+  AH1 lobe=max(AH1_(-FSR_RCAS_LIMIT),min(AMax3H1(lobeR,lobeG,lobeB),AH1_(0.0)))*AH2_AU1(con.y).x; // Largest channel lobe, kept within {-FSR_RCAS_LIMIT to 0}, times the sharpness from con.y.
+  // Apply noise removal.
+  #ifdef FSR_RCAS_DENOISE
+   lobe*=nz;
+  #endif
+  // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
+  AH1 rcpL=APrxMedRcpH1(AH1_(4.0)*lobe+AH1_(1.0)); // 1/(4*lobe+1)
+  pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; // (lobe*(b+d+h+f)+e)/(4*lobe+1)
+  pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL;
+  pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     PACKED 16-BIT VERSION
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_HX2)
+ // Input callback prototypes that need to be implemented by the calling shader
+ AH4 FsrRcasLoadHx2(ASW2 p);
+ void FsrRcasInputHx2(inout AH2 r,inout AH2 g,inout AH2 b);
+//------------------------------------------------------------------------------------------------------------------------------
+ // Can be used to convert from packed Structures of Arrays to Arrays of Structures for store: lane x goes to pix0, lane y to pix1.
+ void FsrRcasDepackHx2(out AH4 pix0,out AH4 pix1,AH2 pixR,AH2 pixG,AH2 pixB){
+  #ifdef A_HLSL
+   // Invoke a slower path for DX only, since it won't allow uninitialized values.
+   pix0.a=pix1.a=0.0;
+  #endif
+  pix0.rgb=AH3(pixR.x,pixG.x,pixB.x); // Only .rgb is written here; alpha is set solely in the A_HLSL path above.
+  pix1.rgb=AH3(pixR.y,pixG.y,pixB.y);}
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrRcasHx2( // Packed 16-bit RCAS: sharpens two pixels per call, at 'ip' and at 'ip'+(8,0), one per AH2 lane.
+ // Output values are for 2 8x8 tiles in a 16x8 region.
+ //  pix<R,G,B>.x =  left 8x8 tile
+ //  pix<R,G,B>.y = right 8x8 tile
+ // This enables later processing to easily be packed as well.
+ out AH2 pixR,
+ out AH2 pixG,
+ out AH2 pixB,
+ #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
+  out AH2 pixA, // Alpha of the two center taps, copied through unmodified.
+ #endif
+ AU2 ip, // Integer pixel position in output.
+ AU4 con){ // Constant generated by FsrRcasCon(); only con.y is read here.
+  // Sharpening only (no scaling); uses a minimal 3x3 pixel neighborhood (taps b,d,e,f,h) per pixel.
+  ASW2 sp0=ASW2(ip);
+  AH3 b0=FsrRcasLoadHx2(sp0+ASW2( 0,-1)).rgb;
+  AH3 d0=FsrRcasLoadHx2(sp0+ASW2(-1, 0)).rgb;
+  #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
+   AH4 ee0=FsrRcasLoadHx2(sp0);
+   AH3 e0=ee0.rgb;pixA.r=ee0.a;
+  #else
+   AH3 e0=FsrRcasLoadHx2(sp0).rgb;
+  #endif
+  AH3 f0=FsrRcasLoadHx2(sp0+ASW2( 1, 0)).rgb;
+  AH3 h0=FsrRcasLoadHx2(sp0+ASW2( 0, 1)).rgb;
+  ASW2 sp1=sp0+ASW2(8,0); // Second pixel sits 8 pixels to the right of the first.
+  AH3 b1=FsrRcasLoadHx2(sp1+ASW2( 0,-1)).rgb;
+  AH3 d1=FsrRcasLoadHx2(sp1+ASW2(-1, 0)).rgb;
+  #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
+   AH4 ee1=FsrRcasLoadHx2(sp1);
+   AH3 e1=ee1.rgb;pixA.g=ee1.a;
+  #else
+   AH3 e1=FsrRcasLoadHx2(sp1).rgb;
+  #endif
+  AH3 f1=FsrRcasLoadHx2(sp1+ASW2( 1, 0)).rgb;
+  AH3 h1=FsrRcasLoadHx2(sp1+ASW2( 0, 1)).rgb;
+  // Arrays of Structures to Structures of Arrays conversion (lane x = first pixel, lane y = second pixel).
+  AH2 bR=AH2(b0.r,b1.r);
+  AH2 bG=AH2(b0.g,b1.g);
+  AH2 bB=AH2(b0.b,b1.b);
+  AH2 dR=AH2(d0.r,d1.r);
+  AH2 dG=AH2(d0.g,d1.g);
+  AH2 dB=AH2(d0.b,d1.b);
+  AH2 eR=AH2(e0.r,e1.r);
+  AH2 eG=AH2(e0.g,e1.g);
+  AH2 eB=AH2(e0.b,e1.b);
+  AH2 fR=AH2(f0.r,f1.r);
+  AH2 fG=AH2(f0.g,f1.g);
+  AH2 fB=AH2(f0.b,f1.b);
+  AH2 hR=AH2(h0.r,h1.r);
+  AH2 hG=AH2(h0.g,h1.g);
+  AH2 hB=AH2(h0.b,h1.b);
+  // Run optional input transform.
+  FsrRcasInputHx2(bR,bG,bB);
+  FsrRcasInputHx2(dR,dG,dB);
+  FsrRcasInputHx2(eR,eG,eB);
+  FsrRcasInputHx2(fR,fG,fB);
+  FsrRcasInputHx2(hR,hG,hB);
+  // Luma times 2.
+  AH2 bL=bB*AH2_(0.5)+(bR*AH2_(0.5)+bG);
+  AH2 dL=dB*AH2_(0.5)+(dR*AH2_(0.5)+dG);
+  AH2 eL=eB*AH2_(0.5)+(eR*AH2_(0.5)+eG);
+  AH2 fL=fB*AH2_(0.5)+(fR*AH2_(0.5)+fG);
+  AH2 hL=hB*AH2_(0.5)+(hR*AH2_(0.5)+hG);
+  // Noise detection (the result 'nz' is only applied when FSR_RCAS_DENOISE is defined).
+  AH2 nz=AH2_(0.25)*bL+AH2_(0.25)*dL+AH2_(0.25)*fL+AH2_(0.25)*hL-eL; // Average of the four outer lumas minus the center luma.
+  nz=ASatH2(abs(nz)*APrxMedRcpH2(AMax3H2(AMax3H2(bL,dL,eL),fL,hL)-AMin3H2(AMin3H2(bL,dL,eL),fL,hL))); // Normalized by the luma range (max-min) of all five taps.
+  nz=AH2_(-0.5)*nz+AH2_(1.0); // nz=1-0.5*nz
+  // Min and max of ring (the four outer taps b,d,f,h; the center e is not included).
+  AH2 mn4R=min(AMin3H2(bR,dR,fR),hR);
+  AH2 mn4G=min(AMin3H2(bG,dG,fG),hG);
+  AH2 mn4B=min(AMin3H2(bB,dB,fB),hB);
+  AH2 mx4R=max(AMax3H2(bR,dR,fR),hR);
+  AH2 mx4G=max(AMax3H2(bG,dG,fG),hG);
+  AH2 mx4B=max(AMax3H2(bB,dB,fB),hB);
+  // Immediate constants for peak range.
+  AH2 peakC=AH2(1.0,-1.0*4.0);
+  // Limiters, these need to be high precision RCPs.
+  AH2 hitMinR=min(mn4R,eR)*ARcpH2(AH2_(4.0)*mx4R); // hitMin=min(mn4,e)/(4*mx4)
+  AH2 hitMinG=min(mn4G,eG)*ARcpH2(AH2_(4.0)*mx4G);
+  AH2 hitMinB=min(mn4B,eB)*ARcpH2(AH2_(4.0)*mx4B);
+  AH2 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH2(AH2_(4.0)*mn4R+peakC.y); // hitMax=(1-max(mx4,e))/(4*mn4-4)
+  AH2 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH2(AH2_(4.0)*mn4G+peakC.y);
+  AH2 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH2(AH2_(4.0)*mn4B+peakC.y);
+  AH2 lobeR=max(-hitMinR,hitMaxR); // Per channel, take the larger of the two limits.
+  AH2 lobeG=max(-hitMinG,hitMaxG);
+  AH2 lobeB=max(-hitMinB,hitMaxB);
+  AH2 lobe=max(AH2_(-FSR_RCAS_LIMIT),min(AMax3H2(lobeR,lobeG,lobeB),AH2_(0.0)))*AH2_(AH2_AU1(con.y).x); // Largest channel lobe, kept within {-FSR_RCAS_LIMIT to 0}, times the sharpness from con.y.
+  // Apply noise removal.
+  #ifdef FSR_RCAS_DENOISE
+   lobe*=nz;
+  #endif
+  // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
+  AH2 rcpL=APrxMedRcpH2(AH2_(4.0)*lobe+AH2_(1.0)); // 1/(4*lobe+1)
+  pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; // (lobe*(b+d+h+f)+e)/(4*lobe+1)
+  pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL;
+  pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                          FSR - [LFGA] LINEAR FILM GRAIN APPLICATOR
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// Adding output-resolution film grain after scaling is a good way to mask both rendering and scaling artifacts.
+// Suggest using tiled blue noise as film grain input, with peak noise frequency set for a specific look and feel.
+// The 'Lfga*()' functions provide a convenient way to introduce grain.
+// These functions limit grain based on distance to signal limits.
+// This is done so that the grain is temporally energy preserving, and thus won't modify image tonality.
+// Grain application should be done in a linear colorspace.
+// The grain should be temporally changing, but have a temporal sum per pixel that adds to zero (non-biased).
+//------------------------------------------------------------------------------------------------------------------------------
+// Usage,
+//   FsrLfga*(
+//    color, // In/out linear colorspace color {0 to 1} ranged.
+//    grain, // Per pixel grain texture value {-0.5 to 0.5} ranged, input is 3-channel to support colored grain.
+//    amount); // Amount of grain {0 to 1} ranged.
+//------------------------------------------------------------------------------------------------------------------------------
+// Example if grain texture is monochrome: 'FsrLfgaF(color,AF3_(grain),amount)'
+//==============================================================================================================================
+#if defined(A_GPU)
+ // Maximum grain is the minimum distance to the signal limit: c+=t*a*min(1-c,c).
+ void FsrLfgaF(inout AF3 c,AF3 t,AF1 a){c+=(t*AF3_(a))*min(AF3_(1.0)-c,c);} // c: color (in/out), t: grain value, a: grain amount.
+#endif
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(A_HALF)
+ // Half precision version (slower) of FsrLfgaF: c+=t*a*min(1-c,c).
+ void FsrLfgaH(inout AH3 c,AH3 t,AH1 a){c+=(t*AH3_(a))*min(AH3_(1.0)-c,c);} // c: color (in/out), t: grain value, a: grain amount.
+//------------------------------------------------------------------------------------------------------------------------------
+ // Packed half precision version (faster): applies the same update to two pixels, with colors and grain given per channel.
+ void FsrLfgaHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 tR,AH2 tG,AH2 tB,AH1 a){ // c*: color channels (in/out), t*: grain channels, a: shared amount.
+  cR+=(tR*AH2_(a))*min(AH2_(1.0)-cR,cR);cG+=(tG*AH2_(a))*min(AH2_(1.0)-cG,cG);cB+=(tB*AH2_(a))*min(AH2_(1.0)-cB,cB);}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                          FSR - [SRTM] SIMPLE REVERSIBLE TONE-MAPPER
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// This provides a way to take linear HDR color {0 to FP16_MAX} and convert it into a temporary {0 to 1} ranged post-tonemapped linear.
+// The tonemapper preserves RGB ratio, which helps maintain HDR color bleed during filtering.
+//------------------------------------------------------------------------------------------------------------------------------
+// Reversible tonemapper usage,
+//  FsrSrtm*(color); // {0 to FP16_MAX} converted to {0 to 1}.
+//  FsrSrtmInv*(color); // {0 to 1} converted into {0 to 32768, output peak safe for FP16}.
+//==============================================================================================================================
+#if defined(A_GPU)
+ void FsrSrtmF(inout AF3 c){c*=AF3_(ARcpF1(AMax3F1(c.r,c.g,c.b)+AF1_(1.0)));} // Forward tonemap in place: c scaled by ARcpF1(max(r,g,b)+1).
+ // Inverse tonemap in place: c scaled by ARcpF1(1-max(r,g,b)). The extra max with 1/32768 solves the c=1.0 case (which is a /0).
+ void FsrSrtmInvF(inout AF3 c){c*=AF3_(ARcpF1(max(AF1_(1.0/32768.0),AF1_(1.0)-AMax3F1(c.r,c.g,c.b))));}
+#endif
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(A_HALF)
+ void FsrSrtmH(inout AH3 c){c*=AH3_(ARcpH1(AMax3H1(c.r,c.g,c.b)+AH1_(1.0)));} // Half precision forward tonemap: c scaled by ARcpH1(max(r,g,b)+1).
+ void FsrSrtmInvH(inout AH3 c){c*=AH3_(ARcpH1(max(AH1_(1.0/32768.0),AH1_(1.0)-AMax3H1(c.r,c.g,c.b))));} // Half precision inverse; the denominator 1-max(r,g,b) is floored at 1/32768.
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrSrtmHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){ // Packed forward tonemap: two pixels at once, one per AH2 lane.
+  AH2 rcp=ARcpH2(AMax3H2(cR,cG,cB)+AH2_(1.0));cR*=rcp;cG*=rcp;cB*=rcp;} // All three channels share the per-lane factor ARcpH2(max(r,g,b)+1).
+ void FsrSrtmInvHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){ // Packed inverse tonemap: two pixels at once, one per AH2 lane.
+  AH2 rcp=ARcpH2(max(AH2_(1.0/32768.0),AH2_(1.0)-AMax3H2(cR,cG,cB)));cR*=rcp;cG*=rcp;cB*=rcp;} // The denominator 1-max(r,g,b) is floored at 1/32768.
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                       FSR - [TEPD] TEMPORAL ENERGY PRESERVING DITHER
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion.
+// Gamma 2.0 is used so that the conversion back to linear is just to square the color.
+// The conversion comes in 8-bit and 10-bit modes, designed for output to 8-bit UNORM or 10:10:10:2 respectively.
+// Given good non-biased temporal blue noise as dither input,
+// the output dither will temporally conserve energy.
+// This is done by choosing the linear nearest step point instead of perceptual nearest.
+// See code below for details.
+//------------------------------------------------------------------------------------------------------------------------------
+// DX SPEC RULES FOR FLOAT->UNORM 8-BIT CONVERSION
+// ===============================================
+// - Output is 'uint(floor(saturate(n)*255.0+0.5))'.
+// - Thus rounding is to nearest.
+// - NaN gets converted to zero.
+// - INF is clamped to {0.0 to 1.0}.
+//==============================================================================================================================
+#if defined(A_GPU)
+ // Hand tuned integer position to dither value, with more values than simple checkerboard.
+ // Only 32-bit has enough precision for this computation.
+ // Output is {0 to <1}.
+ AF1 FsrTepdDitF(AU2 p,AU1 f){ // p: integer position. f: integer offset added to p.x (presumably a frame counter - TODO confirm).
+  AF1 x=AF1_(p.x+f);
+  AF1 y=AF1_(p.y);
+  // The 1.61803 golden ratio.
+  AF1 a=AF1_((1.0+sqrt(5.0))/2.0);
+  // Number designed to provide a good visual pattern.
+  AF1 b=AF1_(1.0/3.69);
+  x=x*a+(y*b); // Blend both coordinates into one scalar using the two weights.
+  return AFractF1(x);} // Presumably keeps only the fractional part, matching the {0 to <1} output range.
+//------------------------------------------------------------------------------------------------------------------------------
+ // This version is 8-bit gamma 2.0.
+ // The 'c' input is {0 to 1}.
+ // Output is {0 to 1} ready for image store.
+ void FsrTepdC8F(inout AF3 c,AF1 dit){ // c is rewritten in place; dit is one dither value shared by all three channels.
+  AF3 n=sqrt(c); // Linear to gamma 2.0.
+  n=floor(n*AF3_(255.0))*AF3_(1.0/255.0); // Round down to a multiple of 1/255.
+  AF3 a=n*n; // That lower step, squared back to linear.
+  AF3 b=n+AF3_(1.0/255.0);b=b*b; // The next step up, squared back to linear.
+  // Ratio of 'a' to 'b' required to produce 'c'.
+  // APrxLoRcpF1() won't work here (at least for very high dynamic ranges).
+  // APrxMedRcpF1() is an IADD,FMA,MUL.
+  AF3 r=(c-b)*APrxMedRcpF3(a-b);
+  // Use the ratio as a cutoff to choose 'a' or 'b'.
+  // AGtZeroF1() is a MUL.
+  c=ASatF3(n+AGtZeroF3(AF3_(dit)-r)*AF3_(1.0/255.0));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // This version is 10-bit gamma 2.0.
+ // The 'c' input is {0 to 1}.
+ // Output is {0 to 1} ready for image store.
+ void FsrTepdC10F(inout AF3 c,AF1 dit){ // Same steps as FsrTepdC8F with a step size of 1/1023; c is rewritten in place.
+  AF3 n=sqrt(c); // Linear to gamma 2.0.
+  n=floor(n*AF3_(1023.0))*AF3_(1.0/1023.0); // Round down to a multiple of 1/1023.
+  AF3 a=n*n; // Lower step in linear.
+  AF3 b=n+AF3_(1.0/1023.0);b=b*b; // Next step up in linear.
+  AF3 r=(c-b)*APrxMedRcpF3(a-b); // Cutoff ratio derived from where c sits relative to a and b.
+  c=ASatF3(n+AGtZeroF3(AF3_(dit)-r)*AF3_(1.0/1023.0));} // n plus a conditional 1/1023 step, passed through ASatF3.
+#endif
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(A_HALF)
+ AH1 FsrTepdDitH(AU2 p,AU1 f){ // Same arithmetic as FsrTepdDitF; only the return value is converted to AH1.
+  AF1 x=AF1_(p.x+f); // Intermediate math stays in AF1.
+  AF1 y=AF1_(p.y);
+  AF1 a=AF1_((1.0+sqrt(5.0))/2.0); // Golden ratio.
+  AF1 b=AF1_(1.0/3.69);
+  x=x*a+(y*b);
+  return AH1(AFractF1(x));}
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrTepdC8H(inout AH3 c,AH1 dit){ // A_HALF form of FsrTepdC8F; c is rewritten in place.
+  AH3 n=sqrt(c); // Linear to gamma 2.0.
+  n=floor(n*AH3_(255.0))*AH3_(1.0/255.0); // Round down to a multiple of 1/255.
+  AH3 a=n*n; // Lower step in linear.
+  AH3 b=n+AH3_(1.0/255.0);b=b*b; // Next step up in linear.
+  AH3 r=(c-b)*APrxMedRcpH3(a-b); // Cutoff ratio derived from where c sits relative to a and b.
+  c=ASatH3(n+AGtZeroH3(AH3_(dit)-r)*AH3_(1.0/255.0));}
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrTepdC10H(inout AH3 c,AH1 dit){ // A_HALF form of FsrTepdC10F; c is rewritten in place.
+  AH3 n=sqrt(c); // Linear to gamma 2.0.
+  n=floor(n*AH3_(1023.0))*AH3_(1.0/1023.0); // Round down to a multiple of 1/1023.
+  AH3 a=n*n; // Lower step in linear.
+  AH3 b=n+AH3_(1.0/1023.0);b=b*b; // Next step up in linear.
+  AH3 r=(c-b)*APrxMedRcpH3(a-b); // Cutoff ratio derived from where c sits relative to a and b.
+  c=ASatH3(n+AGtZeroH3(AH3_(dit)-r)*AH3_(1.0/1023.0));}
+//==============================================================================================================================
+ // This computes dither for positions 'p' and 'p+{8,0}'.
+ AH2 FsrTepdDitHx2(AU2 p,AU1 f){ // Returns both values in one AH2: .x for 'p', .y for 'p+{8,0}'.
+  AF2 x;
+  x.x=AF1_(p.x+f);
+  x.y=x.x+AF1_(8.0); // Second position shares y and sits 8 further along x.
+  AF1 y=AF1_(p.y);
+  AF1 a=AF1_((1.0+sqrt(5.0))/2.0); // Golden ratio.
+  AF1 b=AF1_(1.0/3.69);
+  x=x*AF2_(a)+AF2_(y*b); // Same weighting as FsrTepdDitF, applied to both x values at once.
+  return AH2(AFractF2(x));}
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrTepdC8Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){ // Planar two-colour form of FsrTepdC8H; cR/cG/cB are rewritten in place.
+  AH2 nR=sqrt(cR); // Linear to gamma 2.0, one channel at a time.
+  AH2 nG=sqrt(cG);
+  AH2 nB=sqrt(cB);
+  nR=floor(nR*AH2_(255.0))*AH2_(1.0/255.0); // Round each channel down to a multiple of 1/255.
+  nG=floor(nG*AH2_(255.0))*AH2_(1.0/255.0);
+  nB=floor(nB*AH2_(255.0))*AH2_(1.0/255.0);
+  AH2 aR=nR*nR; // Lower step in linear.
+  AH2 aG=nG*nG;
+  AH2 aB=nB*nB;
+  AH2 bR=nR+AH2_(1.0/255.0);bR=bR*bR; // Next step up in linear.
+  AH2 bG=nG+AH2_(1.0/255.0);bG=bG*bG;
+  AH2 bB=nB+AH2_(1.0/255.0);bB=bB*bB;
+  AH2 rR=(cR-bR)*APrxMedRcpH2(aR-bR); // Cutoff ratio per channel.
+  AH2 rG=(cG-bG)*APrxMedRcpH2(aG-bG);
+  AH2 rB=(cB-bB)*APrxMedRcpH2(aB-bB);
+  cR=ASatH2(nR+AGtZeroH2(dit-rR)*AH2_(1.0/255.0)); // dit is used component-wise: one dither value per packed colour.
+  cG=ASatH2(nG+AGtZeroH2(dit-rG)*AH2_(1.0/255.0));
+  cB=ASatH2(nB+AGtZeroH2(dit-rB)*AH2_(1.0/255.0));}
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrTepdC10Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){ // Planar two-colour form of FsrTepdC10H; cR/cG/cB are rewritten in place.
+  AH2 nR=sqrt(cR); // Linear to gamma 2.0, one channel at a time.
+  AH2 nG=sqrt(cG);
+  AH2 nB=sqrt(cB);
+  nR=floor(nR*AH2_(1023.0))*AH2_(1.0/1023.0); // Round each channel down to a multiple of 1/1023.
+  nG=floor(nG*AH2_(1023.0))*AH2_(1.0/1023.0);
+  nB=floor(nB*AH2_(1023.0))*AH2_(1.0/1023.0);
+  AH2 aR=nR*nR; // Lower step in linear.
+  AH2 aG=nG*nG;
+  AH2 aB=nB*nB;
+  AH2 bR=nR+AH2_(1.0/1023.0);bR=bR*bR; // Next step up in linear.
+  AH2 bG=nG+AH2_(1.0/1023.0);bG=bG*bG;
+  AH2 bB=nB+AH2_(1.0/1023.0);bB=bB*bB;
+  AH2 rR=(cR-bR)*APrxMedRcpH2(aR-bR); // Cutoff ratio per channel.
+  AH2 rG=(cG-bG)*APrxMedRcpH2(aG-bG);
+  AH2 rB=(cB-bB)*APrxMedRcpH2(aB-bB);
+  cR=ASatH2(nR+AGtZeroH2(dit-rR)*AH2_(1.0/1023.0)); // dit is used component-wise: one dither value per packed colour.
+  cG=ASatH2(nG+AGtZeroH2(dit-rG)*AH2_(1.0/1023.0));
+  cB=ASatH2(nB+AGtZeroH2(dit-rB)*AH2_(1.0/1023.0));}
+#endif
+
+// Returns 1.0 when v is at or past bLeft and still short of tRight on both axes, 0.0 otherwise (given bLeft <= tRight).
+float insideBox(vec2 v) {
+    vec2 inside = step(bLeft, v) - step(tRight, v);
+    return inside.x * inside.y;
+}
+// Maps a destination pixel to the texel it is stored at, mirroring it inside the dst rectangle on each axis that needs a flip.
+AF2 translateDest(AF2 pos) {
+    AF2 translatedPos = AF2(pos.x, pos.y);
+    translatedPos.x = dstX1 < dstX0 ? dstX1 + dstX0 - translatedPos.x - 1 : translatedPos.x; // mirror within [dstX1, dstX0); a bare dstX1 - x is <= 0 there
+    translatedPos.y = dstY0 < dstY1 ? dstY1 + dstY0 - translatedPos.y - 1 : translatedPos.y; // mirror within [dstY0, dstY1)
+    return translatedPos;
+}
+// Writes one output texel: the FsrEasuF result (alpha 1) when pos is inside the bounds box, otherwise (0,0,0,1) at the untranslated pos.
+void CurrFilter(AU2 pos)
+{
+    if (insideBox(vec2(pos.x, pos.y)) != 0) {
+        AF3 color;
+        FsrEasuF(color, AU2(pos.x - bLeft.x, pos.y - bLeft.y), con0, con1, con2, con3);
+        imageStore(imgOutput, ASU2(translateDest(pos)), AF4(color, 1));
+    } else {
+        imageStore(imgOutput, ASU2(pos.x, pos.y), AF4(0,0,0,1));
+    }
+}
+// Entry point: derives the rectangle sizes and FSR constants, then filters four pixels spaced 8 apart from this invocation's base position.
+void main() {
+    // Rectangle extents, independent of which corner comes first.
+    srcW = abs(srcX1 - srcX0);
+    srcH = abs(srcY1 - srcY0);
+    dstW = abs(dstX1 - dstX0);
+    dstH = abs(dstY1 - dstY0);
+    // Destination box passed to setBounds as (smaller corner, larger corner).
+    vec2 lowCorner = vec2(dstX0 < dstX1 ? dstX0 : dstX1, dstY0 < dstY1 ? dstY0 : dstY1);
+    vec2 highCorner = vec2(dstX1 > dstX0 ? dstX1 : dstX0, dstY1 > dstY0 ? dstY1 : dstY0);
+    setBounds(lowCorner, highCorner);
+
+    // Upscaling
+    FsrEasuCon(con0, con1, con2, con3,
+        srcW, srcH,  // Viewport size (top left aligned) in the input image which is to be scaled.
+        srcW, srcH,  // The size of the input image.
+        dstW, dstH); // The output resolution.
+
+    // Each workgroup owns a 16x16 tile; visit offsets (0,0), (8,0), (8,8), (0,8) from the remapped base position.
+    AU2 tileOrigin = AU2(gl_WorkGroupID.x << 4u, gl_WorkGroupID.y << 4u);
+    AU2 gxy = ARmp8x8(gl_LocalInvocationID.x) + tileOrigin;
+    CurrFilter(gxy);
+    CurrFilter(AU2(gxy.x + 8u, gxy.y));
+    CurrFilter(AU2(gxy.x + 8u, gxy.y + 8u));
+    CurrFilter(AU2(gxy.x, gxy.y + 8u));
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrScaling.spv b/Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrScaling.spv
new file mode 100644
index 0000000000000000000000000000000000000000..c15b72ec6c278e354720e2518f556e6328973ab3
GIT binary patch
literal 44672
zcma)_1-M<s8Ll@(aQC9Y-7R=R2o52*6DJTO0wGuk?(W6CXesVe3KS@XVr}sPEwp%n
zLJPg`yU+T<oVmTvb9b{k-}n78|IDnl_CD+EGjy4DiRoIcX<9S3rf+@Lz14or(V7;e
z)taf*z1?oU&5qlwJZ}8pmDgNrRUKw+b!q$QGh1uARyW$N93uw|AEjX|c_wnqPQ3aY
z9Gh_H@6m<-O-p+#q#GV(Y&Vn{vGwY^+P)K)9lF~3;BNTOKs%s~yV0li)jQc|E$nl*
z?Q3+h&sEqLXxrE9WS_sVFRFcK`=2ni|3#+SySJ9o-h1c}%^m+bYFr$B+}J_wdY1r?
z9y+u=z9qo}hmRje1Ml8io|gFG+ZG8N<1*-pUlu%k)VOy1a^Rohr*Exh{I&xouF-dm
zWe*;@#`-ys+kLN0JF&s5&>q@g7jIG<Pv2Ubw%4}9NBzHYtbsi_)&wWVTHuZx-C8}(
z(SOjGzN>XClw(+TYFxM02HJNVFut_Ao*j9+w>H+^Z|uZvM*oz=92=o`tabNR@1K0e
z9n>V!r`J@UZmrGr=|6JJx)axJubB3}Q|;Yb+knT99WZL#hymk=Fk`Dzgg#rNj~VS8
zPuJQ3JZ{MN-lHds8a%G;6Tfb)?Xe9UJ%0S?k=qOzI^NiBt$yYlJ*NNgVf&+Wtn1FS
zeaDU6Y{0k){reXE?1Vn1@!9PspUq33T??N*fAZ;F`s^{)XV%sMKlyArdepK9PgryP
z^5DX${^9Snwjb@F(WBbevD|rvdrGzK13UQY;e$H3rvvusTZ3tbj~X*!{4T?X95QC~
z*zsGA>NkAikP+h;nD!z6FZ=G=r)v$3eaGRGhIERZt~Km`*_ychv5lTEo{l>QnsfO7
zwCzrRv$Q5)Yu03^0b`uaHJlk2H%G@rX~5U>VkGT=O*^nRv;i!R`e<;+HQlAXo`Z*v
zZR-c2j~FtljmL^d4{T#E_(3Daw6Q0NbLrjdc-p3>9fyqXSmCCofvc@Q_22)sfY#u(
zmwpcer{BZDV>rq5IRZRz_>OISWE=0)#z(dB&TTxojdy9|qroG%pp2a&=AsiHBW~v5
zSaF{Y{W$Sv;+a}!f}62)Ozy!GnAz%V#K(Lm(~cQFv5ijw4{z7owRI}EJSR`<;8V}Z
z(>wUobMlN1-kg)&Tj#>LR}LRMr1$8F#dUWM`oSEYBNxz4y$AFeGH!g~cRsd``#|^B
zCE!gbj2+u|_=xdC#&TV8v#|Dyr}}hly&@hkc7W%J*U+04f4kxzRQ%J5e^v2+SNw;H
z|5$O~RLgp%t$3G;cdK~!icjB(cWccKA3S{I5T<xEZ%Mp^b!e2~`_*pVOrqi1)?B=3
z({^aPmwsIut@P`PHhA3lPWieuTIt6}3;jCf<3mhaEB&TN8#idch#}mYQj4D>+FJX*
z(xo-01@m$p7Y7gQxW%{mlJN1(9o}<b8Ti;C9q-92fOi}{VeFtG9kDCHe_Destu+hZ
zwJY03g>92gwyv!$E8ef-yH<S9itk(TAr&89@sSlDTk%6Get5-?toU&iKcV8MRs4*K
zpI`9{D}GtUuc-KS6~Ce4e&+mXow~MeulPL`zpoST(s~TuaZl;edJ8<5yOQU|C*VWc
z_sy=Y&pO3)ZGF{=cWHg2Z?BH?=KD@IJ_~cPl=aQfiFawu3_qlOzIJKNKNWXvEzrrY
zYip5;FH!N8E51%A-lerUd@yfVj%{o3pn)CrZV%=?$B!}XYh-75S?_Le-k+x0hjp@d
zX&nR~(&U{09yFjM&!J%Qa#eQlqdLWPZB2obcdGrYPWCRXOXThMlP;~B!9yE-hdMVN
z$MLwB8-(~-akF+Wi+ML`-^;-R2ThvPzp3LbZN<I*-Nx=`PW|7j>>o7t-i`gk%Kj02
z_yI$QB>%^i{gcL?{GV3#FB*IACjXa}{cHGvBlaJb{NGge?;3mZe_z>uXzab4{6ALq
zX}EDunlyS;@=psd=cij^PyX(eeTK%K{4-YeS>VpUpV^(?*(&>76<+}Ee8zvl%D!mD
zmx4P#@n5F0uTb&T;Ep%`YgG1iI`J;84dDZE_8i*;JgC8&f`>P_FZh54ZwVgS;BCMT
z#Mte@2R67rctnGD2JheC-M|h+|2@H@8@vzLg4G9rJ3cRVZ4Ctfv{qeOgW(;YAG@}O
zbh34A4X^lt6(3pg(G?$4@q;Qpw&LR|KB3}=RQ%A2A6D_n6`xY^V=I1q#ZRdCi4{Mo
z;wM-9)QX>8@iQxacEvBK_(c`Jq~e!W{IZH)Uhyj`er3h4uK2YTzrNx(R{YkA-(K-M
zDt>3h@2dFS6~CwA_g4J=ia%KKhb#VQ#h-+8-K@@Si0i0p>zNMjZLfV@y#Svyc68hJ
zQWg73#ows-2NnOM;$K$$U+{6`#|{}kX#aM9Gx0(Br|Y<DYgRbd@zh#ptL$@DeEy0r
z*ok*(^?-N$49jQYPPVSCr8@C0t>xhzKjU_7t<cHVwY4&Q<nU2zW)AyQ_RT8Zx8nVb
zA24xh{Lsoitm6B3;$2!};T=E6cWsTUY!kG#f3EM^I=GXK&l(j!w&KTE{Dg|1RPj?P
zep<!PsQ9H7zr5mCR{W}pUsLhxDt<%7e^K#!D}H~)AFTL86@R4Sk5&AMia%NLKUDmc
zioaIz*DL;~ioa3uw<`Wl#Xqh1=N135;$Kz#>xzF{@$V}B?~2dB5548}JyXSJsralF
zpS|L9R($S?&s*^&E53Bam#z466<?v^D^+}zimz7jjVs=(;+s~yPsRIIe2a>2Rq<^q
zzDLFPs`x$?-?!oeDn6*<Ln=P3;u9)<NW~AW_@s&-UhyL<KDpvkDt>0g&#w5n6+f@y
z7gYSBieFOkODld;#c!$jZ56-0;&)X1&Whh%@p~)&NW~xT#JjYfg7X-R_u}rYSHV-C
zD-Q0*ZxcE`FU-k9s-LdmZmk92!$%I=zWoTh`3$iX+MvdN9dYqoe%%i4<E-|3e$Ng*
z^*w&Q4$e=l9rpF%KlRtOwL!%<toX(i?^W?lE5235`@?zm$2(|8|GRhaEy4%Dc}6$Y
zzJCXA>^w_{51V)>W1GJH`@&%(_T6SQ&+4`ubui6%9yD&*C%MMq`_tIEw|}>Y|E{CQ
z9=Q3~(G$k#>1VVDJNb2OJyh{WD*kB2AFKG|6@Q}QPgeY?ia%5FXDj}E#b2)Y8x{Xc
z#Xm0i7R-RpX?-ox(X^r|tMN87RpZC{w&pX^`0+8bt<79$#uz($p>fK$wYdw8Qrg=5
zg*IKIEmUasXTHS>&1bS|OLA<^&#aEa`ZlLxpU*B1_7&a(9J|k9wU=Bwa@*hQPP=0>
zUasAj+}~Bt1YWS={@z;tvxa}#@CD#=(Tn}`Ae;Hb^MY_`E!IA|@|%WM`%vpae#a|T
zpAYWb$~*FO<u_EVwif#wjotjl&q=Em`@D_ab<sW_t(x|E;ErQ44quDGoqKKS`Y+sI
z^&V8>TxgT$Fm6S-arUj>N)1-e;duJMjn8;C1FM<KJX_M5$8p>LRt;9~!8yAft+DbB
zt!ryXuw#<j$KJH|;h4?253M=XdoY9h(walPJoNcrM-NtbL=)p!^hy7=iP1JUnGd10
z--(S)-h<!c9ZH+plS7~KNUpLq*NIIo=c)(aZaAsQWuMy0afqF}vuQndtWkaut^Moo
z+TTOF7{@dmD>VF3urIl>FSps`jl<t)^A5Xn{65D6M}B^!cKIh9<BwQ#-|#Qt$*=u;
z4$sxtXVJ)O5mwym*gpS%4(DVJ{2h;R>o?r%U(NiEZ!Wmkl>T!!HuJ~+Gqf2Rdk?U-
znK{fS_jf#M;r@<CZoZ|#u9N590%TdXvC(vTg)hj+mxJhM{PK-K&HQrjPrg!LPq<R}
zrjU%!I(F&sZ?BK@JE*cB+qO@B#CZMB>+s(s{O%6>RQ@Ua`RAV7DE2OP)_QvC`cL2H
zvp=#NWAVHm#$qnt;Qhggd(lO1wbl*)nHOWLHMNc<;qOj5_u&<}rk(GJaL+C4jdl*2
z=U=JaiY8Z&-!n~YslDH5=8WHeo7htO8E;O;m9cA~IZvh5uhGib1Dn`VJHFA3Prp~9
z>7(X+cwWAN#=oijZ3yK}<)6b{ulRoj_gwQln-7CF(sg>dH0JQ!nqLQA`g(53eJ^Wk
z@E&06FlI$Edk!ywW}P`4zqyt}H`GJEcEi1&np^JYQSIS=9xZv#iu-vqcJurBu;jZp
zJmcR3?kQz{{r$Yizp32Mi*oP(V*p+Y`djZo@M*y6#>;&lSJOVe!N$by`?_}fS3eA%
z^^v~@_r4PQC-6<+D^i91_&Gq$`=7_!tajE1+OF+iD4qlM={?!cxUR8$AzDAjhTEP{
z@$)KvO~vmhxbFde-c3HapLfH3&+zkZ$set_pLa{UpLfG;{k&UpKkt^@&$}h}^KQxg
zyjyZV?}j@+e%>v)pLa{{=iQR~c~{<xS@zz)Bi#1~@2MLy$I#YJ{GWP!c5Zwg#TK7k
z(A1A14^3-V{?9n|nNies<NrKgJV&%|#0q;(t0(uKVEgx6PVT+H>hamT@$o(%pMB8O
z)BnC;<J8mte&F=4-Tu9X)RTK4*#5ouB=;aR_4o{Ke7qmUX9${l`X355PCfk(1E+uO
z_U|>Sp4`L1_V2wcxeq{7kI#XPkN3a$j6hRQ|0BW1si*%@;PkKE{=Fxto7-!A0*BYV
zpE=C!b$oE6>+kg@?}>jOY=?k1r&UjmL&4q~${ds6>c1w=oQH$eyk{it2(WS93)1(I
zaP`C;1y(bzUsJ<mu=%u2ZnUGp)~3z6y>F;x4yS;f=W-2?K~s;<v5il8E*yuZelPv_
z-hMpTIQ4Jo=OkKR-!pgGPUJY1!~2*xxlaby{hxxSZoS^e<lhlzPUFp!_|w4kd3rjU
zdgA3}ym@lIoB_6v<U12=zVe(p3r#&fXE#3Oxp@wndgkL?uyN|n#|5;R4{hgjT+HEk
z#L0ajxbFWVG<C=0eO&H%%xS!N5`PKU__DU2qp2reUdEdz^KmKIK0K%VtaTaKIZ{s_
zmxJp$zXDA?K36tA<#}}#n)?0CIde7GICbafI@-*Uwre?V<Zz7Q<h~wU_kRPLx?{YC
zR_++hX}oz7{|m73Wo<X1sV81u#+xT|bTimKlJ6F9J?FQgsmJHG#-}{5ZbwtkeEbq@
zoVxRICvE0K+pjq8;cz_S<h~1B_kTB<y5qTnR_=JrX}oz7e=pehvbOus)Dtf+<IR)#
zxF2jEIcFXKJ4foqKS=9&Bz}lid(T#Ps%W(y2Jb+tkGACeHMl>AF;CFSWBUzw_r~@l
ztvt4;!HIhcERXHC;KV%xmdExScmRj}JxeQ(?RVh)8{6}=@}B&qg7v=u_WRP0&_2rX
z7{}ur)@*IoU|;4m)|`G8{yo?j`&)-r9@~pxb7@<bRvz0+VEfS4lU5$vAHeplZ9Q7K
zHs7CK0grBMFVo6>?{W;Uf-j*}_nh$a<m+Jl%lEE7!qt6m@xAL$V71pc68C3t;;cii
zU+Q?Xv1@w+EKeP8fvrP5`Q8SbuY50j2d<tv-UX|bb-V{poPEjlOC9ewc5QzF%Tvb(
zVCzs%9Up=-E^|2kzoMy6Y3Ac^V70Q2kKl>3FS&lH<L`}K+s9ye>i7rPI@EvQ6U(|B
z{%ink{~ZfW`{#;vi1o9p&nCRKea2{h{Vr|vAJN6h;k|t}4qrYeOpaN>=J1(pa$Esc
z55KbEx!$jWU%{bnKl1eBGrqYwe9g&`e&zz(&pfordo5T!{JM%?-|&8n%)B>n7^9yy
z`&LWe9nV@2c+JO=zI~2t-#&*;UpIl(!*8zmEfv4D;r*C-`?-z79LDLZ-FnnhkLS#y
z9KIIjupY0qML4WyF<SfA?+y-ibBN0vOTm5h;7E?e3%o>umjs)8Y1-tz3#@<m-C$!5
zr*D7Gzn4QT_WQu*iT!@C+Ow>KwLZXMPUEy$vszj6@^D{e&C9@3|FQ*M4s6XU(58lm
z!TN_k0yZZ6F|cu|@o}))Gpvs_KEYuQ;~wSEZk_6>({pMy4qq#ASlf!UD;Ic`0`t|+
zcAcx!T8Ht!;m|+)DX=l&PlJt{nY#3UhGQBI$D*(H*nSI+&Aj?rvwB(c+HhZG&HnN(
z^{-jrwZP8pI<%?jd9eQBzXKZ+{sP#z)b)F?b$K4@t39?C!LgZFUu#xR&E3I0Iebme
zVV%>_uFGL<>(TmLL+up~TknrzpEG%XG{>tPGjga~tJ-TE<~gzPc^#~ux^>9QIyQj&
zD(hGuZXFxarj9qk))D@8!_)7d+qRB6^?idQb-o3*PIbp&Y&nj1;M&z^rV_{TE{F5z
zHDGRYyu;xbw3TDn81Aba!$xq&unBFR5q$v8T#Dc4utu+s4>=a)P|ukD3O0||ls<ps
zP|N$>M_}XB^|9`cIn=FFT-Mne?yIb`7u-5GrA?imf~_<BtA;yw+CJe(-JgN2TRnAu
z4mNM<{sOFKyt%&QNc=y+#;ZTx%*EGWW3*+g-+=8)TgLh=SlzLTGuA%f%{hE+#*uqm
zU$A4{f;MCOH@F<@4;BBh;+_<x|FjkN?{P}Ie~(jg{~o90{ymP|xpR&G!(pAym9@#m
z*5$m3t?7G?o<th+Uyh|X^w&>&@=Zhj<g<3UIjq|pVrwq*b%mQxfBm%U=UB8Q_jKUo
zZjY<+b4(p;+)>{Q@a0KoynfpA%w$He_3Pt0%CnAsM%sqM*H#>^<Ce5rbGVM%()#((
z`J0tE^?Y9L4xSB7T_2w*&cor)S^l0W=jyzTejGl=%m+74J-O!xo7*uZZUHp)<X#YL
zZhuD=pM}8c$-OYxIQ4vXTLf&M+8n<;<DZuPcHr=}9Y@B$eS!Odos0go_UoK2K@NRf
zv&F&k*p>ntqispBy!>oCFSeaIeC^1QK6e7!&n~oHJ4+L{3`fpo*JY)K+n2v5Tam-x
zk)`jI!RE1VedXQ<ta%l%YpuQ?D`6e0g3Y1rQMhsTscuevRs*X!cjC;Qw%s^<mE+hI
z?l^X*%{csfu5uh}H$3B53to<69k_X{M_+lyu`bwgsAnAOfz6@K=h?>Dr+UV*K3L6h
zh|6*8Nj_iYIQD=$j=g9zj*W;b$FWJnGmeen<v4o5&0{_K$}^7MV8@}Jar6P3L))et
z#@VNO#<7_?hvN`u9G=SuaQNDn!}Dq%+WiVVpuhtQJgC8rZE%5y6nJQXhZT7L0uKi}
zX9v<cXXe|I8Z&3M{VKj|#rLeZf0I|n52^U@ijS=L*oq%gasMW-%ztFXkE{3z6+f-w
zXH@+BieFgq%PM|F#jmUQ4Hdt&;<s1)o{HbsaL*_2$6LT%^YE?UuBqpqYbW=d)pzSQ
z?pV{@bGJb=hqink*%qwsy?Y_zwgc}+I}5FO<@%)0?ZKXF;XA-Rm-O{~k*9|KZQOAl
zrG_2R%%Lqc>;zU%4LgIaA@A{WeeA<?a2JmBu^V_seAQFa?qKIBeeMC*Pdz?+firiW
z$MWRdyN&t%7Hx9u12&H~>+oDJ>(~!&9qRdg&j7G-+A`LGVD*f3uy*1G(VADTkJ#&{
z>|-e0KGZXgVPMxIb?y(>Pdz>dfU`zkgYt~^z&6HCn;av+=Fw&yBWcSzM!~H^J!2gW
zHcnf{ItHwsu^t58kGOo+lIxQ?$AP_f*{AmcdFmYB#{3?PHu)!j)sz2V@Brd-?#lH^
z{)ymWO@8k!^5j3Xjrsi)ZSqe7t0(_q-~q(vyp-!>z22K%#+H0Xz?~!Y^nVmM<MZAn
zkN@O0=J#l{S%;&+=Fyg#rhwH`(=p%y^fQRoI^_C@y&oRSkv@(GTa$WfIuV>X@qQ_f
z|4D7k@1$r`)5&1-XiH6}fYnpeso(+hlXaHsV@;>gmNlIYw<h(}bS60SbOu<Sn$But
zeh);On$8BBM_Xz-2dti&&IS8EnsY<0k2Rf7dmcynxBzTT>Z$1>aOUYkusr@3w=usL
zqfJehfX$;VHT@i{o|-NNTT{*_xjxo(8Ett!T@JS<_0)7FIP-J`Se}}$YU5T*JvCho
zHjlQ{bPZTNHC+p~ra`pUA=gKI9c?*J*TbzzJvH43&OF@!mZzp)v@y?0X;afpVDo58
zO*ezpQ`0SAYs&c~*T<S}rF{ci`nU~lP3o!Xm*C9P?O=KQ?`UIw&qbS>eg!s<w$yYd
zSUok}1s+5{IiKYESkpbUcXOnVd%@PEo|^6lXP)i@%j5q*8}nR~HZ?s6HjlQ{^blA*
zH9ZWrrkqc5ePVy4v1cCT)*t($jXi5G*IxciG%fijaQGU*;b+$sX-77AI@(bU_UE9{
z1s+r2gBq;=*aDAhusO$b_!;$JT0f(j?=k8zCZAc~s`w`r|E%I)Rs5TZe_wI;DJc7M
zpMsLRPl4Pyw0)d-$Lzdit)Bp^$LH5z=c4p^5=}kp{Tr}x>WO;_Y;M;#xt~T;kIyq;
zbC*89MN?1iXTip)AKiRjd=6}%+LG&eu=8!5p4W2yV*eeu9_tHe>hbwKxUT0#H1(sJ
z&$Tas)l%Odz~)g;o|nP(*k3_YkI$>%dR|{cQ%~;K!N#c{-HiQ@VEfdTTz>*P_FPMH
z{harincqKyJ%_{Jfa`BR+TR4L>F3;OQ%nB0!I|5v<vVEV@p<=u_`HXvp7HzzY@B-H
z-UpkzJO@62tH<ZV|KamjH1*{E8`wDYqno+<2yCC)9GBOhTE_4(SS|eTV6SPP$#<vy
z2b_QHXKMOtPabVzuh~yIazFeW?EO<cdA|VLPip-Vte?6$yv~p1NREGk&7q#Ld<`~E
zn=zlzs-=%_!DERD{};UM^E<dc>c)HTDEs_3+&<OQ=YPP)X*0(Ak6QZp0c;)NKQ=t$
z{4ZQhztpT2e{PrUoZ-_nJn_@Q)$~jMYGUtoT{tpl-N4qbo_yWG)|Y(K!Sz!&hxbRn
zYn>c3g3Y0xIhYA-oVJWXEq%-aF6UrYxbu`gXM^jbZoKzkHRI=?ui3%AcO>5&aD7ha
z9M(Q3SReJo%mwy6%o_EX8?L6WeP~ll&UwJ*44)V5`(DO4A6!j8<F%>P@1@71`#O{(
z&le{(*!Ppe3jOc~PozDfz(*GNr~*%Juz8Ly@RR}{Q{ZC@d>q*K-V<o^Uh6(Q<_dQ&
zo|3y4Psx|4xO?%GcK6~bxqI=H-2He;?w&j)-?`%M$y3_hlc(edReVCl-J7TMcW<7O
zPpSA>4R?K=i}G{m&)}Z->e`(<HP4qr&=&-|hTmda80^dMC#o;Rq2~OHjdyO#p9dC&
zn_s;b=d5)u2KF48rMMsTKvU1p>5GGnRX49|ROVe0ZeI1|T?%a8^8I9KH1*_N25hW)
zuG3|~_NOg#pyqeCGCteP$#P)NW#?oCurKFCeR&Qw=R};^Rs`2$UI|UzK0W8k@vQ=P
zeCo-uD!3lsYG~>i-|Apv)ib^|!1kpr<5P2dsns^)TNCW{>G;+L`*M8hYjLPKKC$_{
zKFZqHg<HFNYVQfI$G;w$dTL)EY^-|XHUQUS-VjYa<J$;qta`?$=J-;pZN|4Tcrx`l
zzFuHoj!%6P4mHQu#?3j~8?CN?Q#AF|+Nbd;&%e#k)KhC;uyN|CRn1z{udTKEIb;j4
z_q*^d;ob`~XIsJbQ8%CWCbjr)1J*x$TlkbBeml56>c($Ps}}!$U~}es;5&f1Y^jdt
zrG;OAu(9#mvGCKD&s94$`m)5EOF#Sdo~bSU?gCcR=KWKi_v&528xyadShd9N4ld_x
z5BL=PGiQ6k^-(w8`>|T$_X4Yh?+u=xBWt)1SReJQ{k~w|56gSnerW1Pa}K$d1Hfv=
zCvG4(an>T&FMSUJTTAA3FgWwCEq+75&R1$23f5QMyu)bKGKc$vH=ri<%%NKH4F{Vq
z{D6ih|ABBd{SvPh{}JFaek8n{!%=X3)Dtrr?B|Hg*BH2(zV@L_E&gM{zW;}hgO}IF
zc(^|5#veqh7XO36<+X7L+|M65PbR|kQ8#`9ty<zIfz6q<ISib&(H6hM!N$h#h{8`>
zu8kub-D|^K`q}THwAzwyGFVO9QMB@`>(OAZ4fVvTCH5F_IcLYh%P}7Z*GJv>DYR;d
zKOU?Wege2$*AwCTsOKC%3G6vu-WN|sQ_otT0#-9Vai@Y4XDxF5()Vd#YstKx4$k~*
zi{BYw=PT##nP7d@&3hKDTITR<u-Arq=1?v9&H<Y*{M?2o|9NmV{SvPh|MS6R`~~oG
z4lji3qn?<Hz~!}ZF<ebw`_QIVe{Me$-PcJR>)_|}s*@XhBJC*+_V>D{7WlLRpI+cI
zz`i%0MVn_`m(XuMfBNj}=8E4@@y9FvY{g$L_|h!sn*}%jTLsttZo##`S8(kg6kPj<
z1=s#j!L@%}aP6NIT>Ga5*ZxJpwSQS~?OzvM`!@yG{$0Vfe_wFzKNMX1j|JC0O_$E|
z?;em@5AEFwZhZHGYoDRu+T8=PjCT*na9j6)EV+9?hMUhlAj56l1G41q0U56UQUzCc
z56IYU-2*aQfA@e4w{;K5lDh|F$=w68<eOA{(~9@4_?8uSU&u0_dqbAIf5qJ+vb4KT
zWXattvgGa;8SZ)vD7d=&M#gSCxZ*<!Zv60q>+jx?iP!Glk>UQ#?%t6lckjrOyLV*C
z#}-`Oy(43{b??ZMyLV*C-8-`6?j0F!eUl5W?%t85{n&zA|M3-f@5uOTckjrOpHy&l
z_l}I+*1aQ3?%t6lckjrOpIvb4ckjs3?%t8%+TA-c-1^)*vgGa^S#tM|Ecq1$SHH62
zS6BSnio17Y^4Z>4aO-pL$kOiKk>T3!D7f{zcVz5x_l_*Ndq<Ysy(7bI-8-`6?j2cj
z_l_*Ndq<Z1(SlpwlLgoBnSxuNdq<Y>FIC+ABTM@qEAD=hrTx8%e^7Dvmn{9ithoD2
zmiC$WVJqD6xW8n$t@}%s-2Ej>?*5V`cYn!}_o(>d6?c!x(%(HMOTI$E{e0m*ld;SD
zRNQ?gOMBmnyU%24A6jwunJn$@Gg)%?nGCmeugQ|T*JR1vZ!+9^rxaZM*n&I%$5-6_
zCKIpS{U*b0PpP>3O~!72?l)O-_nQp2b-&4Q^Iuu<t15m?!Hv7F;_f$@eD?2tlO=b*
z$#7fun+&&q_nR!a`%RYI{U*b0AFH_gO~!8j?l)O-_nQp2b-&4Q`***|lDpqz$=z=<
z+}8ajOYVM?;r9Pd#occ*c3bzG47Y#xn=HBeO_tpKCc|yrZ?fd>HyLjK?l)O-_nQp2
zb-&4Q`***|lDpqz$!9OPy8BJWZtH%N;r8!-lO<oW;Og!-8N2<v-(<<%Z?fd>HyLj0
zev>74zsYd>cfZM!Z(MNoUIn**_nVBryidj5Z!&gU_nR!a`%Q-1zxz#=-2EoQZTBj;
z{kh*{?DBmpKA_<Gx!+{`ZQXA&-2UBfvgGbJ8E)%-li~K~ev>74zsZsxUT}5yn~dFd
za>4E2{U&3VpILBq_nVB}{@rh~<nA|Fa`&4Iw{^eClDpqzxc#}`WXaucGThevCd2LD
z{U%F(d&S*vGIm?{n=HBeO@`aQ`%Q-1K2mY_n^gB_@jkRa=jShfZXN@lnf6k+n)bcn
zmw`{E-JU4@FNdpn_~J(x{zjnVZ-}ly)9x$Um5ruf{^sH;G<~)Cy&C(y8m#Vf2l+K%
zYf(=v*MiGhu7j(2@UO*3jj8=!kD%RGYPq4&%35wj(^p$+`2|?r=Q-AL6WDsxeLmqc
z0yXQ>ehXO5XE*X&!S*qIV>8C*2<nNs4XhS^JJ=jPuQSIl!D{-YPqp~}3Y>V?=}x%M
zIkHZ7!PPu`mFsjjns#5An|m6qoSS>m^wpNRxeu(Kb-EvHE$Y^0Ef0XpS{{U}dGN2(
zIz5D--B)UPxY5d59zoMrTWWa}te$mx3~W8>&Y90@)G}vJfYnmxufaYK%A7q3SM%^y
z&e?C!wEIe(Pc>Ru=hJBVYD=BZfYnpyZ^720Zk;|uQcIoBfy+9dhnIE!4zA|GzfSY{
z0)lp5sq^=ZR@V6<n!egn=SyJq)cFUnwWwRC&$iT3=PTf{&R5|+Q_Fn523Pa&RnF(@
zXxe?H&ObI<S?8b7^wpL+{|r`7oo|4xMcq1m=BAc9-v(Q!wzp{IJ|FdOblw5`_eSd4
z-=tMb>|el%^;w)evG0R-BUatmcWKoU`&V#cKLpDY`#12O#Ht(n0j*kM{|@%N(DpH{
zJhA@(TeG_Mk7(5r`x!W~pMvFy{Tyt~>c)OTtCrZWz={15EKlq|!Pcy9>=(3ZiTxJr
zIil?wT6tps1-52&?O)TXCHCLo#C{K!C-y&JYgRY*J6g5G{ui9sAHnj(;@+-V-Pj*!
z)e_qU?D?Y2XN~g2b_H9rx^|yUswH+haAMofA>oOg9&F9(#&)AsOYBVG#QKa<p4gee
z&a=9)GtjCT>p43s*z-l3&o6i3f=TS`@CEQu_a3eP9AIO#CC8j#&vkS7%(KifH{2ZR
z$uSSu7;VWhFW7V5>(GAYgX^arZGN!*YqKw(zp5q2&%mBX+I+TJ*0B)WI@D9g!eC>x
zCC4IQuK{!TJhsfS7~CA{$<YIBjJD)h9PG7_x|e|Krygxdu>EVZFP{yoCCAcW&o6C0
z_buyK7H%Etsbe{?G1`)2d9c@pIeb=J=2#JK4)x?%32cnE<X9Q(HIlklf$OIpZB?-S
zYqPJVXw{Nqb+G51HlInCb*u@u4)xTr7T6eV$+0%rYsMVw(3UyYg_}b?IeLPP(Uu(R
zfxUK8_xfP{)T3<xwtsE*<ui4)UYwVH*4+r~c~|~Edt<n|{ytNe$G;a?|MGWyy|r`b
zzX@%dH~IU(Jr~OSo59uf-;`FK{F{UIFY|8!SJ%HUtvvo)f%PB6b?&_Q`}eKk>iTa<
zE06!SVEw<xFaF!X)%D+oRv!O;VAsE_e+Rg_{@c^a<G&+V|FZs_;OhGKr<KQl7qI?i
z{ky`|_1~FR9{=6J`j_?Z0aw?5H(Gi8_X4}-W&XY4>iX|VE06!aVExPd`@z-q--lK%
z9!UEJ^XUE6&*g*Qv(c(+A3&=n9zq*m*Jvo*?+&SJA55#}nwaYVu<_c4)5^16Bf$D=
zJCIhMxKZFsntQ}Zuw1{<w8I)ahIao3A5`G64NgDf;hz7-kE3n#<{Ud1?zy6E0<Ao8
z6TzN$+76+W$2JM<8fiO}Rvz2oV9#%DhtbM=vfB{DX&nJRp4Pc=%#O`5ITpuY?bc{5
z_GxZ&nA`b05^SuwomYAKJqm12ZO*GaaYuu*4wJ$1#2o|9I!pn}6L%aq>u@YsZvNwG
zGxsNevnD4N`biB=Kc~Pq;V}MWT6t`zfiu^qg5|ND0nS{X4wlDu7T9acoM+O?V><`D
zePcVDR-Sb@7wkGrrghAY%`rI^$6)Q&Xf5_>ZgZI1bvO@fto=H#^7MN?*qqv&S9#(t
z1ZN#C0Lv41F*xgR5m=tMpM$dwmw@HwzmzuXa9M#b2WL&LX!P`R6+Cl&C0HKYHQ-G-
z%zZVjJhtn=Ij^q;%VWC%oVmUpERXFMVDC5PypdL(b+`%aIye`O*|9k$$Kn{Q-5RaM
zKFw_obGr^VgN-$}^D0ljw}8#5&3Tn4?ly4N;a0FbalZs-9c~B96Zb1{*5MAY-28Xa
zW*zP-@ZANz2ke@ppZmZ$ukQuRV|xIcxxODPkL@9F=K4XfJhn%`{W+}ZVOn`?kAe4W
zY>(2)vks4gT?gmFF*`QL<X9YowOgaL*r&P8VQ$yq39zwQ2YLGaHQ1b42YKRt19lz4
zpK7?Dm(2S#TpxAgpQKgG``mBAYUTU-vvBqJJlFV?@8!><sUJ;0-Rb9dVB^&7=NVeH
zjN$iSHEl1^zR2PGrLlhi`yQyCy-{BV8>7vb7iiUz?^ST}y#kge-)rFB<Wo;AYR38<
z@7KZR^ZT%WruF6bVb%Y{@fL^Q`4Joc25tF$-#6jbtFHZzv}%5*)qKX9BelK_PObXN
zOJ8H+`wm#2)Gbfl?}AhJ`(R(zt^OB|zjD~O*!T}<%ep^=TerIQ_h{9u#eBw^BkTS*
zu;*5JZhQn+PrSU0H&4d$G1xwm@9*H`GhVJw{Qm*YJbeO|=QGu(U_Vo->+iY!8Q6R|
z*FOiV>1%#%YRUfvcyMF;k~TkUcn*FAcW%_pr}j^<G1|;$tXlGY4OR>Prs0YI7OtjW
z;?*)A-+?n9$?-j!dVK!<KYad!rk;L&02`;CxF5mhF0a}D!qwx$H2z;c)1av*_q1T+
z)Q@iFr3=_TwIx?qa6Oi8XzKaCVt265?Pyxs5<4BZu624e^~BBqPOP?khjhk9_dBHK
z*U!23I|SNtPna3(_e9k*wpqZAMVm4IVoYjczhh8dJF~+*Kh(2UbAXM}-|rxl{&O|{
z>iK@{++bt0CC5Bq&!N;bFI+$M%=dg?``2b)e!rpYYXP{mscZN94*vHsGWLZUyL$Rv
z7+m(d2wXq)^t&k7+}g4i)M8-k(Pm$MH$zRlIBj`NEdjSKb?r;hF3FL)mTv6oscRW<
zS=X{~{nS&}a$s|7Gj9)Cwd7b4tftNHYRJ>)N?^|eb?tt)LoMgxDqx=zl|Lu0il!c)
z)xbUnDt%T*Q;*LYV4qo)K5L?>XK&KAz~(kqTVmG+`+TST*>fE<_4up{_W4ce(-TcS
zKI?&f)>QhekEWg)HUOL3SZ#^j5bQjZ_m7Rx)Z?=;*m)>@HbGO5PcN|ZQ2O-N&XF26
z1)JMgZLUinTIWZ+8Lf4S`_k&~zH<IO;O5}9Y4z7nd-~o2>^zq5Ut6N7$7d_B^H}<9
zjiw%-ZNScB>9Z}Gdd9FF*xbfy^ZsOg+k?GtE!EVjpW|4WR$Kbm0bHMV{n6Cpvm>}Z
z?{-2{kI&BF`n=l(O+B^l3O2W~+7i1PxIXW8M^lf_9^m@C+Y?PaK6`=d^KNf6_0+Hr
z*xbfyOYFYj`n=l@O+7vX!1Z}I5KTQkgTVE9HyBMlH4Fip+gNR`%TQY9M?8$yI>r0b
z>Ywv&ICwSIRDb=nr|$#6^?7$7ntFUjfa~*aB$|4BMuF?|ZZw*D#xMqKZez9OygLZ&
zdFT70wd&_M`q63=kE6}AD9_dL@VRN#wU4D$6CX?)Uq2fi0{7mou6+Wnn)hHon;i=F
z=e+!`dJ=phhkC|$82HWVx;h+9Jw8W({ry3C?H!4x9-pJY{;s3+nT)2Mc{&<wZez7s
zr(-?_>^L366k2)SM~(wKE^Wus%EiaiI-ZR6M6jP<w4DH!$96J!Rt{|^(aMwmRB(^R
zb_%ULw$s7OH@4Gg<>okp*0J<tmKk#EOz;F+Yqv&gvj+Pzr?JME*YTbO*4JE)Tb@48
z2Ae~h<Cbf4J<kP~>v<mh91iuY!}(y>v%J?|fTkXw3&E~u>2nd9dVDShyPl=bC1~nd
z&!2<MZLGG8?=o=K^HQ)p<GTW!^}HM`cdS>^I-ZR6YH-%`DzH4ZYr$F1Yryj4zaE_R
zybdgn?M86c^9HcozJEdMShAisfn85)w?=ET2KzFnvBsF!^}HFZZ`M<uK5qe=BkL*G
z=6c=+F4yyR_^ll3S%+VOy%(3))g5T+@%a_ldvWP=Cz^VE?gD!+E`9DsQ_p(d12(s@
z+A_ZTz**0G!Samn0dUsyez4rJK1l0$GS-K|S<i>S^4J~)XFVSQ%ai|caMtrNuspV3
zgR`DbfaUi6B&}n~dj1CNdRn_RTAMZ4mpP3!#=NfQQ(%3wp7QkhG}s(jPq{YN^S9t~
zJ)ea?!=avacn(~@w?2=i9-rTV>-W|d(A4AedvN{U`XZWo*7GH>xsBD9@x2Vrdj0_{
z&-h*iXFXp5%N^@$w2miZ{UbQ*`8rr0+n>Q%&p(0X$^Rxe>-h#)9^2dCtmj)`xqZJw
z>sYd$?}A-VYqv&gvj+Pzr?JME*Y$i4tZ&v+o<9EqHb>S|uFduQ09>x;hw%3~)UytM
z1=pV$|AwX>pO3)xXU30>D}4SAu0J#W15G{a`3czE#%jy>J_BbxKLyJ(zAwO8&(Fbf
z$ND9$<H=b63C?<c1(wJ54LIxhHCUef{{m+{zXi)<`yQP2{0=O)?|;)emaON0z^<pY
zTcfpEgMFFPSYyoVdj0^`H|r@+pFe`lk@b{ob3J)<(rG=Xf&Uk~de&-Mu<Kd=yw(Lx
zJw9E*u4n1f4NW~h-NCMB=`$Ugde(D#u(^%ZmhsI9&U*U2Y<b2vGdSz%ceLe>)$eCJ
zo{V)iaMsiBYRhAr1Dy5rd)xBlp9`Gz^gG=0*yaIeJ^enn+`j!Tw`0kA&Ifiqt=$@}
z%^K{>oW>esUe|Meu)bMOdHP%cY>up_T$}5;Ah^6f7lQlW{#4I8EDWxH4p;<DJwA(q
z>z@M_LsO4W4{-f+z~X4?S<fZF<~CMa#<vtW>$xOYp7AXM&U*Ttce!Kr`|ge>V_hDc
z_4K>%^4L}cXFXQ{%aea)aMsiB#LHt_6`b|-`|)!7_Pg<pCF{95*!8q_YqU0NurG5O
zYm9ka&o#jMW<BNUb4{>0vYv8nuIJj|ay{39uf?IBbyyeddX}F{d!nhwXFag%S^BJx
zrXHUSz^-TMvmu&#)^j7UxsBD9@ofUmdTtDsXMDZES<hZzxntdw*70Pln}M^QeZcbA
zHV0=t`-0`kza=>9xdm7r+t%Q$=T>03eQ#sSk@egb?0Q<eHCmfB*q1qtHO9QI=XPLy
zv!3$wxjon%Sx>n(*K-GOxt{&u{W#RK4m*Nf&+-}FPH5`!*%|D5mOi_nsmEtmu<Kd+
z?1rYE_1qn7Zez7&e0zeko_m1h8Q<REtmj@}xntdj*70Pl`+>8b`-0`M4FqRB2Y}_t
zKNy_#90ZofHWZxo90Hcx_b^(=lJ(pl?0Q<eHCmfB*q1qtHO9QI=WwvTSx<TTJOFHt
ztfyR?>p22kuIEVjfgI{thf!eHvwYthjiw%-F<{rT^f?GkJw9W>u4m~p4oyAlIUa0o
zW3^>`2ZOVo6TtF}Zz4GBc?ekUSP!LjJQ?d@;H>8)uspUSz**13!Sduk3Y_&k5-gAH
zXmHkZGFWckQ)nGa*7F#!>uK%QXl>SDU*<H{81uTG$Ab0Eddk!1abR;~J>}Y5&lAAq
zdY%YBo<lwBa1wYLo~xAC)yZh;@i_(Dt@1e)O+7xRfoG_EPDfMEdY%C`x3StXzO%qt
z&ojaDjPD$9*7Izz+_9cZ>v%HO^TAoq^T6`hE(B*iF96Gv|6*{~^CGZ3wx5Huo|k~-
z_I)X>W664626jEI-5RaU8tluQ#u{T@*Yk3)zFALs`n&>cj;yC#o9lTMxLnVx;a75~
zXC1BqyPoBBbuF5De69n#o~6(AXzKB~0qlB~J~yJNXFY!bHn*|bGQOL^S<jom@{I3R
zaMtq{u-viUM(cPo)?b3Np0|VLvHc31^}GWtPyV~WS<gGc^4RVHXFcx*%kBGKTE~+0
zybtVpTDvt`n>E;%IgK^Oysqc{V12Wm^7Q!t*c@3;dEy=f_iKKBG0#Krl?(nb+<8o{
zN8y>fN5Jy@&h4=_?)W=1`+6MSp>^!(_ypYN&g%Xi)c$@AwhwK2ru`&1&$O*apV)rW
z*p?#J+V!=sli}LRePVomeJ+QuvpL)+#=o;Whr{ncoJX6zUY;U0e+%b%{xp1Xig4WW
zXW+(Jhy1s2Yh0B>U+wXI7QAE;`y9NCeIA}zeYNNL`R~By*2i&Kms+lg7r<)eGq&Hu
z)#LMG<5Pa_d<jiGzkmJ%*f@3n`zZ4K_fZ_*1suM%;>f(64|d)zq|Lm&N}ThSJt_ZE
z@oyWR+Fpa3r>y^VH1+uWvGFPE{}Y;een0(ZuyN|nlXa+>V{Rt*P4JwwZ_wu4eG9A}
zpLZIcw+o+l!Rq;b(tBXz)cqdO2eiKYK908cIsVS!yoi(gLvY>yU(wVv&cA`JyPS)U
z;OhA<!N*|Z)E%eutLArcti^uqKRN#au5*5Zrk;3t8E>A<(WhYhNWRa&$!EM=pN#Vh
zu;cukHaWfotH<Y`jn7ww&(~n}%=0&3<J6t!?`Sj6+Wy7Sg$vs8ij(_$u>F_0|Ba>|
zpZ_#IWsN_esUOYx;(g*ruyN|fc~1Wq?AXimP-pe5&oppj)uT-dR`<Ib&Y9PMnse^B
z9D`#?KV89fKi$yO6E830&6D}=j?F%jZ#sDL886pIf3NN78@t!NIM+QNh1>s)S(?^G
z9A5YCrFAifHUFG8*ZrKt<+@)HeW{ACUvP75P;mF;+OXmqSG-rnH!ZmL@U0r|m=}W2
z1h>vyOACT$MpO6inQ2<H@PBHFnH6kI*4LQX(9{z%J6J9EojJhfQFm^g2erHh&IR^8
zNBvG}^LvPMgVp`Ls^|8+U|-%tw9Uhz<{XP%r}=1q#<2iL>Rk|Ak8vS1b^jjn9&#=W
zR!htx;ChUUqNyilF|e9r>_NLYhk4Wok=HuZmf-OH)jF*=eJl;G>skg)J@dFMSk0K_
zXqV?$oa;901#PX+=(&zogd3+mk~&<UmB8xhdu4DvzE#lF)Ay=iwe-Cj*uJyw(AMhw
zpL+UU18kgn`d$;P?mYD&W-YMwsCzDWj;pzT{_XQd;B^|C=es<%O~C&5yp8kRm&eu{
z?0?T&o7aInwm#tX8k^UIyeGe*_uAbI?7nzjd)8u~<~E1E=I}FDU$8#r_qvs9^PJn9
z!*fpV7`FgBzs{xijxFJ8#(E9PJ@>V34fb<``Y8N8m$u<hPv6^u?bE;WrD<))|Ec+k
kwtb_qL{qhXjpjJ>-&x)P&3OH_d%yL3bqw~aJ)Z~v4>v$_kpKVy

literal 0
HcmV?d00001

diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrSharpening.glsl b/Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrSharpening.glsl
new file mode 100644
index 0000000000..785bc0c83d
--- /dev/null
+++ b/Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrSharpening.glsl
@@ -0,0 +1,3904 @@
+// Sharpening compute shader. The FidelityFX "[A] SHADER PORTABILITY" header follows inline.
+#version 430 core
+layout (local_size_x = 64) in; // work-group size: 64 invocations along x
+layout( rgba8, binding = 0, set = 3) uniform image2D imgOutput; // rgba8 image2D at set 3, binding 0
+layout( binding = 2 ) uniform invResolution // uniform block at binding 2 (no explicit set)
+{
+    vec2 invResolution_data;
+};
+layout( binding = 3 ) uniform outvResolution // uniform block at binding 3 (no explicit set)
+{
+    vec2 outvResolution_data;
+}; 
+layout( binding = 1, set = 2) uniform sampler2D source; // 2D sampler at set 2, binding 1
+layout( binding = 4 ) uniform sharpening // uniform block at binding 4 (no explicit set)
+{
+    float sharpening_data;
+};
+// Select the GPU and GLSL code paths of the portability header below (see its DEFINES list).
+#define A_GPU 1
+#define A_GLSL 1
+//==============================================================================================================================
+//
+//                                               [A] SHADER PORTABILITY 1.20210629
+//
+//==============================================================================================================================
+// FidelityFX Super Resolution Sample
+//
+// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files(the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions :
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//------------------------------------------------------------------------------------------------------------------------------
+// MIT LICENSE
+// ===========
+// Copyright (c) 2014 Michal Drobot (for concepts used in "FLOAT APPROXIMATIONS").
+// -----------
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+// -----------
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
+// Software.
+// -----------
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//------------------------------------------------------------------------------------------------------------------------------
+// ABOUT
+// =====
+// Common central point for high-level shading language and C portability for various shader headers.
+//------------------------------------------------------------------------------------------------------------------------------
+// DEFINES
+// =======
+// A_CPU ..... Include the CPU related code.
+// A_GPU ..... Include the GPU related code.
+// A_GLSL .... Using GLSL.
+// A_HLSL .... Using HLSL.
+// A_HLSL_6_2  Using HLSL 6.2 with new 'uint16_t' and related types (requires '-enable-16bit-types').
+// A_NO_16_BIT_CAST Don't use instructions that are not available in SPIR-V (needed for running A_HLSL_6_2 on Vulkan)
+// A_GCC ..... Using a GCC compatible compiler (else assume MSVC compatible compiler by default).
+// =======
+// A_BYTE .... Support 8-bit integer.
+// A_HALF .... Support 16-bit integer and floating point.
+// A_LONG .... Support 64-bit integer.
+// A_DUBL .... Support 64-bit floating point.
+// =======
+// A_WAVE .... Support wave-wide operations.
+//------------------------------------------------------------------------------------------------------------------------------
+// To get #include "ffx_a.h" working in GLSL use '#extension GL_GOOGLE_include_directive:require'.
+//------------------------------------------------------------------------------------------------------------------------------
+// SIMPLIFIED TYPE SYSTEM
+// ======================
+//  - All ints will be unsigned with exception of when signed is required.
+//  - Type naming simplified and shortened "A<type><#components>",
+//     - H = 16-bit float (half)
+//     - F = 32-bit float (float)
+//     - D = 64-bit float (double)
+//     - P = 1-bit integer (predicate, not using bool because 'B' is used for byte)
+//     - B = 8-bit integer (byte)
+//     - W = 16-bit integer (word)
+//     - U = 32-bit integer (unsigned)
+//     - L = 64-bit integer (long)
+//  - Using "AS<type><#components>" for signed when required.
+//------------------------------------------------------------------------------------------------------------------------------
+// TODO
+// ====
+//  - Make sure 'ALerp*(a,b,m)' does 'b*m+(-a*m+a)' (2 ops).
+//------------------------------------------------------------------------------------------------------------------------------
+// CHANGE LOG
+// ==========
+// 20200914 - Expanded wave ops and prx code.
+// 20200713 - Added [ZOL] section, fixed serious bugs in sRGB and Rec.709 color conversion code, etc.
+//==============================================================================================================================
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                           COMMON
+//==============================================================================================================================
+#define A_2PI 6.28318530718 // 2*pi
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+//                                                             CPU
+//
+//
+//==============================================================================================================================
+#ifdef A_CPU
+ // Supporting user defined overrides: each default below applies only if the macro is not already defined.
+ #ifndef A_RESTRICT
+  #define A_RESTRICT __restrict // default qualifier used by the pointer-based vector macros below
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifndef A_STATIC
+  #define A_STATIC static // default specifier prefixed to every CPU function below
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ // Same types across CPU and GPU.
+ // Predicate uses 32-bit integer (C friendly bool).
+ typedef uint32_t AP1; // P: predicate
+ typedef float AF1; // F: 32-bit float
+ typedef double AD1; // D: 64-bit float
+ typedef uint8_t AB1; // B: 8-bit integer (byte)
+ typedef uint16_t AW1; // W: 16-bit integer (word)
+ typedef uint32_t AU1; // U: 32-bit integer (unsigned)
+ typedef uint64_t AL1; // L: 64-bit integer (long)
+ typedef int8_t ASB1; // AS*: signed counterparts of AB1/AW1/AU1/AL1, for when signed is required
+ typedef int16_t ASW1;
+ typedef int32_t ASU1;
+ typedef int64_t ASL1;
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AD1_(a) ((AD1)(a)) // A*1_(a): C-style cast of a to the named scalar type
+ #define AF1_(a) ((AF1)(a))
+ #define AL1_(a) ((AL1)(a))
+ #define AU1_(a) ((AU1)(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ASL1_(a) ((ASL1)(a)) // AS*1_(a): cast to the signed 64-bit / 32-bit integer types
+ #define ASU1_(a) ((ASU1)(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AU1 AU1_AF1(AF1 a){union{AF1 f;AU1 u;}pun={a};return pun.u;} // read the storage of float a back as a 32-bit unsigned integer
+//------------------------------------------------------------------------------------------------------------------------------
+ #define A_TRUE 1 // integer truth values
+ #define A_FALSE 0
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                                       CPU/GPU PORTING
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// Get CPU and GPU to share all setup code, without duplicate code paths.
+// This uses a lower-case prefix for special vector constructs.
+//  - In C restrict pointers are used.
+//  - In the shading language, in/inout/out arguments are used.
+// This depends on the ability to access a vector value in both languages via array syntax (e.g. color[2]).
+//==============================================================================================================================
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                     VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
+//==============================================================================================================================
+ #define retAD2 AD1 *A_RESTRICT // ret*: on the CPU a vector return value is a restrict pointer to its scalar type
+ #define retAD3 AD1 *A_RESTRICT
+ #define retAD4 AD1 *A_RESTRICT
+ #define retAF2 AF1 *A_RESTRICT
+ #define retAF3 AF1 *A_RESTRICT
+ #define retAF4 AF1 *A_RESTRICT
+ #define retAL2 AL1 *A_RESTRICT
+ #define retAL3 AL1 *A_RESTRICT
+ #define retAL4 AL1 *A_RESTRICT
+ #define retAU2 AU1 *A_RESTRICT
+ #define retAU3 AU1 *A_RESTRICT
+ #define retAU4 AU1 *A_RESTRICT
+//------------------------------------------------------------------------------------------------------------------------------
+ #define inAD2 AD1 *A_RESTRICT // in*: input vector argument, also a restrict pointer on the CPU
+ #define inAD3 AD1 *A_RESTRICT
+ #define inAD4 AD1 *A_RESTRICT
+ #define inAF2 AF1 *A_RESTRICT
+ #define inAF3 AF1 *A_RESTRICT
+ #define inAF4 AF1 *A_RESTRICT
+ #define inAL2 AL1 *A_RESTRICT
+ #define inAL3 AL1 *A_RESTRICT
+ #define inAL4 AL1 *A_RESTRICT
+ #define inAU2 AU1 *A_RESTRICT
+ #define inAU3 AU1 *A_RESTRICT
+ #define inAU4 AU1 *A_RESTRICT
+//------------------------------------------------------------------------------------------------------------------------------
+ #define inoutAD2 AD1 *A_RESTRICT // inout*: same expansion as in*
+ #define inoutAD3 AD1 *A_RESTRICT
+ #define inoutAD4 AD1 *A_RESTRICT
+ #define inoutAF2 AF1 *A_RESTRICT
+ #define inoutAF3 AF1 *A_RESTRICT
+ #define inoutAF4 AF1 *A_RESTRICT
+ #define inoutAL2 AL1 *A_RESTRICT
+ #define inoutAL3 AL1 *A_RESTRICT
+ #define inoutAL4 AL1 *A_RESTRICT
+ #define inoutAU2 AU1 *A_RESTRICT
+ #define inoutAU3 AU1 *A_RESTRICT
+ #define inoutAU4 AU1 *A_RESTRICT
+//------------------------------------------------------------------------------------------------------------------------------
+ #define outAD2 AD1 *A_RESTRICT // out*: same expansion as in*
+ #define outAD3 AD1 *A_RESTRICT
+ #define outAD4 AD1 *A_RESTRICT
+ #define outAF2 AF1 *A_RESTRICT
+ #define outAF3 AF1 *A_RESTRICT
+ #define outAF4 AF1 *A_RESTRICT
+ #define outAL2 AL1 *A_RESTRICT
+ #define outAL3 AL1 *A_RESTRICT
+ #define outAL4 AL1 *A_RESTRICT
+ #define outAU2 AU1 *A_RESTRICT
+ #define outAU3 AU1 *A_RESTRICT
+ #define outAU4 AU1 *A_RESTRICT
+//------------------------------------------------------------------------------------------------------------------------------
+ #define varAD2(x) AD1 x[2] // var*N(x): declares x as an N-element array of the scalar type
+ #define varAD3(x) AD1 x[3]
+ #define varAD4(x) AD1 x[4]
+ #define varAF2(x) AF1 x[2]
+ #define varAF3(x) AF1 x[3]
+ #define varAF4(x) AF1 x[4]
+ #define varAL2(x) AL1 x[2]
+ #define varAL3(x) AL1 x[3]
+ #define varAL4(x) AL1 x[4]
+ #define varAU2(x) AU1 x[2]
+ #define varAU3(x) AU1 x[3]
+ #define varAU4(x) AU1 x[4]
+//------------------------------------------------------------------------------------------------------------------------------
+ #define initAD2(x,y) {x,y} // init*N(...): brace-enclosed initializer list for a var*N array
+ #define initAD3(x,y,z) {x,y,z}
+ #define initAD4(x,y,z,w) {x,y,z,w}
+ #define initAF2(x,y) {x,y}
+ #define initAF3(x,y,z) {x,y,z}
+ #define initAF4(x,y,z,w) {x,y,z,w}
+ #define initAL2(x,y) {x,y}
+ #define initAL3(x,y,z) {x,y,z}
+ #define initAL4(x,y,z,w) {x,y,z,w}
+ #define initAU2(x,y) {x,y}
+ #define initAU3(x,y,z) {x,y,z}
+ #define initAU4(x,y,z,w) {x,y,z,w}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     SCALAR RETURN OPS
+//------------------------------------------------------------------------------------------------------------------------------
+// TODO
+// ====
+//  - Replace transcendentals with manual versions. 
+//==============================================================================================================================
+ #ifdef A_GCC // AAbs*: absolute value; the S* variants take and return the unsigned type but operate on its signed cast
+  A_STATIC AD1 AAbsD1(AD1 a){return __builtin_fabs(a);}
+  A_STATIC AF1 AAbsF1(AF1 a){return __builtin_fabsf(a);}
+  A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(__builtin_abs(ASU1_(a)));}
+  A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(__builtin_llabs(ASL1_(a)));}
+ #else // non-GCC path: C library calls
+  A_STATIC AD1 AAbsD1(AD1 a){return fabs(a);}
+  A_STATIC AF1 AAbsF1(AF1 a){return fabsf(a);}
+  A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(abs(ASU1_(a)));}
+  A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(llabs((long long)ASL1_(a)));} // llabs, not labs: 'long' is only 32 bits on LLP64 targets (MSVC) and would truncate
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_GCC // ACos*: cosine of a
+  A_STATIC AD1 ACosD1(AD1 a){return __builtin_cos(a);} // A_GCC path: compiler builtins
+  A_STATIC AF1 ACosF1(AF1 a){return __builtin_cosf(a);}
+ #else
+  A_STATIC AD1 ACosD1(AD1 a){return cos(a);} // default path: C math library
+  A_STATIC AF1 ACosF1(AF1 a){return cosf(a);}
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 ADotD2(inAD2 a,inAD2 b){return a[0]*b[0]+a[1]*b[1];} // ADot*N: sum of the N element-wise products of a and b
+ A_STATIC AD1 ADotD3(inAD3 a,inAD3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];}
+ A_STATIC AD1 ADotD4(inAD4 a,inAD4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];}
+ A_STATIC AF1 ADotF2(inAF2 a,inAF2 b){return a[0]*b[0]+a[1]*b[1];}
+ A_STATIC AF1 ADotF3(inAF3 a,inAF3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];}
+ A_STATIC AF1 ADotF4(inAF4 a,inAF4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];}
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_GCC // AExp2*: 2 raised to the power a
+  A_STATIC AD1 AExp2D1(AD1 a){return __builtin_exp2(a);} // A_GCC path: compiler builtins
+  A_STATIC AF1 AExp2F1(AF1 a){return __builtin_exp2f(a);}
+ #else
+  A_STATIC AD1 AExp2D1(AD1 a){return exp2(a);} // default path: C math library
+  A_STATIC AF1 AExp2F1(AF1 a){return exp2f(a);}
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_GCC // AFloor*: floor of a
+  A_STATIC AD1 AFloorD1(AD1 a){return __builtin_floor(a);} // A_GCC path: compiler builtins
+  A_STATIC AF1 AFloorF1(AF1 a){return __builtin_floorf(a);}
+ #else
+  A_STATIC AD1 AFloorD1(AD1 a){return floor(a);} // default path: C math library
+  A_STATIC AF1 AFloorF1(AF1 a){return floorf(a);}
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 ALerpD1(AD1 a,AD1 b,AD1 c){return b*c+(-a*c+a);} // ALerp*: blend from a (c=0) towards b (c=1), written as b*c+(-a*c+a)
+ A_STATIC AF1 ALerpF1(AF1 a,AF1 b,AF1 c){return b*c+(-a*c+a);}
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_GCC // ALog2*: base-2 logarithm of a
+  A_STATIC AD1 ALog2D1(AD1 a){return __builtin_log2(a);} // A_GCC path: compiler builtins
+  A_STATIC AF1 ALog2F1(AF1 a){return __builtin_log2f(a);}
+ #else
+  A_STATIC AD1 ALog2D1(AD1 a){return log2(a);} // default path: C math library
+  A_STATIC AF1 ALog2F1(AF1 a){return log2f(a);}
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 AMaxD1(AD1 a,AD1 b){if(a>b)return a;return b;} // AMax*: a when a>b, otherwise b
+ A_STATIC AF1 AMaxF1(AF1 a,AF1 b){if(a>b)return a;return b;}
+ A_STATIC AL1 AMaxL1(AL1 a,AL1 b){if(a>b)return a;return b;}
+ A_STATIC AU1 AMaxU1(AU1 a,AU1 b){if(a>b)return a;return b;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // These follow the convention that A integer types carry no signedness until they are operated on.
+ A_STATIC AL1 AMaxSL1(AL1 a,AL1 b){if(ASL1_(a)>ASL1_(b))return a;return b;} // compared through the signed cast; the original unsigned value is returned
+ A_STATIC AU1 AMaxSU1(AU1 a,AU1 b){if(ASU1_(a)>ASU1_(b))return a;return b;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 AMinD1(AD1 a,AD1 b){if(a<b)return a;return b;} // AMin*: a when a<b, otherwise b
+ A_STATIC AF1 AMinF1(AF1 a,AF1 b){if(a<b)return a;return b;}
+ A_STATIC AL1 AMinL1(AL1 a,AL1 b){if(a<b)return a;return b;}
+ A_STATIC AU1 AMinU1(AU1 a,AU1 b){if(a<b)return a;return b;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AL1 AMinSL1(AL1 a,AL1 b){if(ASL1_(a)<ASL1_(b))return a;return b;} // compared through the signed cast; the original unsigned value is returned
+ A_STATIC AU1 AMinSU1(AU1 a,AU1 b){if(ASU1_(a)<ASU1_(b))return a;return b;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 ARcpD1(AD1 a){return 1.0/a;} // ARcp*: reciprocal of a
+ A_STATIC AF1 ARcpF1(AF1 a){return 1.0f/a;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AL1 AShrSL1(AL1 a,AL1 b){const ASL1 value=ASL1_(a);const ASL1 count=ASL1_(b);return AL1_(value>>count);} // right shift carried out on the signed casts of a and b
+ A_STATIC AU1 AShrSU1(AU1 a,AU1 b){const ASU1 value=ASU1_(a);const ASU1 count=ASU1_(b);return AU1_(value>>count);}
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_GCC // ASin*: sine of a
+  A_STATIC AD1 ASinD1(AD1 a){return __builtin_sin(a);} // A_GCC path: compiler builtins
+  A_STATIC AF1 ASinF1(AF1 a){return __builtin_sinf(a);}
+ #else
+  A_STATIC AD1 ASinD1(AD1 a){return sin(a);} // default path: C math library
+  A_STATIC AF1 ASinF1(AF1 a){return sinf(a);}
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_GCC // ASqrt*: square root of a
+  A_STATIC AD1 ASqrtD1(AD1 a){return __builtin_sqrt(a);} // A_GCC path: compiler builtins
+  A_STATIC AF1 ASqrtF1(AF1 a){return __builtin_sqrtf(a);}
+ #else
+  A_STATIC AD1 ASqrtD1(AD1 a){return sqrt(a);} // default path: C math library
+  A_STATIC AF1 ASqrtF1(AF1 a){return sqrtf(a);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                               SCALAR RETURN OPS - DEPENDENT
+//==============================================================================================================================
+ A_STATIC AD1 AClampD1(AD1 x,AD1 n,AD1 m){const AD1 capped=AMinD1(x,m);return AMaxD1(n,capped);} // AClamp*: max(n,min(x,m))
+ A_STATIC AF1 AClampF1(AF1 x,AF1 n,AF1 m){const AF1 capped=AMinF1(x,m);return AMaxF1(n,capped);}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 AFractD1(AD1 a){const AD1 whole=AFloorD1(a);return a-whole;} // AFract*: a minus floor(a)
+ A_STATIC AF1 AFractF1(AF1 a){const AF1 whole=AFloorF1(a);return a-whole;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 APowD1(AD1 a,AD1 b){const AD1 lg=ALog2D1(a);return AExp2D1(b*lg);} // APow*: a to the power b, evaluated as exp2(b*log2(a))
+ A_STATIC AF1 APowF1(AF1 a,AF1 b){const AF1 lg=ALog2F1(a);return AExp2F1(b*lg);}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 ARsqD1(AD1 a){const AD1 root=ASqrtD1(a);return ARcpD1(root);} // ARsq*: reciprocal of the square root of a
+ A_STATIC AF1 ARsqF1(AF1 a){const AF1 root=ASqrtF1(a);return ARcpF1(root);}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC AD1 ASatD1(AD1 a){const AD1 floored=AMaxD1(0.0,a);return AMinD1(1.0,floored);} // ASat*: min(1,max(0,a))
+ A_STATIC AF1 ASatF1(AF1 a){const AF1 floored=AMaxF1(0.0f,a);return AMinF1(1.0f,floored);}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                         VECTOR OPS
+//------------------------------------------------------------------------------------------------------------------------------
+// These are added as needed for production or prototyping, so not necessarily a complete set.
+// They follow a convention of taking in a destination and also returning the destination value to increase utility.
+//==============================================================================================================================
+ A_STATIC retAD2 opAAbsD2(outAD2 d,inAD2 a){for(int i=0;i<2;++i)d[i]=AAbsD1(a[i]);return d;} // opAAbs*: d[i]=AAbs(a[i]); returns d
+ A_STATIC retAD3 opAAbsD3(outAD3 d,inAD3 a){for(int i=0;i<3;++i)d[i]=AAbsD1(a[i]);return d;}
+ A_STATIC retAD4 opAAbsD4(outAD4 d,inAD4 a){for(int i=0;i<4;++i)d[i]=AAbsD1(a[i]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAAbsF2(outAF2 d,inAF2 a){for(int i=0;i<2;++i)d[i]=AAbsF1(a[i]);return d;}
+ A_STATIC retAF3 opAAbsF3(outAF3 d,inAF3 a){for(int i=0;i<3;++i)d[i]=AAbsF1(a[i]);return d;}
+ A_STATIC retAF4 opAAbsF4(outAF4 d,inAF4 a){for(int i=0;i<4;++i)d[i]=AAbsF1(a[i]);return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){for(int i=0;i<2;++i)d[i]=a[i]+b[i];return d;} // opAAdd*: d[i]=a[i]+b[i]; returns d
+ A_STATIC retAD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){for(int i=0;i<3;++i)d[i]=a[i]+b[i];return d;}
+ A_STATIC retAD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){for(int i=0;i<4;++i)d[i]=a[i]+b[i];return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){for(int i=0;i<2;++i)d[i]=a[i]+b[i];return d;}
+ A_STATIC retAF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){for(int i=0;i<3;++i)d[i]=a[i]+b[i];return d;}
+ A_STATIC retAF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){for(int i=0;i<4;++i)d[i]=a[i]+b[i];return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){for(int i=0;i<2;++i)d[i]=a[i]+b;return d;} // opAAddOne*: d[i]=a[i]+b for scalar b; returns d
+ A_STATIC retAD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){for(int i=0;i<3;++i)d[i]=a[i]+b;return d;}
+ A_STATIC retAD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){for(int i=0;i<4;++i)d[i]=a[i]+b;return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){for(int i=0;i<2;++i)d[i]=a[i]+b;return d;}
+ A_STATIC retAF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){for(int i=0;i<3;++i)d[i]=a[i]+b;return d;}
+ A_STATIC retAF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){for(int i=0;i<4;++i)d[i]=a[i]+b;return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opACpyD2(outAD2 d,inAD2 a){for(int i=0;i<2;++i)d[i]=a[i];return d;} // opACpy*: d[i]=a[i]; returns d
+ A_STATIC retAD3 opACpyD3(outAD3 d,inAD3 a){for(int i=0;i<3;++i)d[i]=a[i];return d;}
+ A_STATIC retAD4 opACpyD4(outAD4 d,inAD4 a){for(int i=0;i<4;++i)d[i]=a[i];return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opACpyF2(outAF2 d,inAF2 a){for(int i=0;i<2;++i)d[i]=a[i];return d;}
+ A_STATIC retAF3 opACpyF3(outAF3 d,inAF3 a){for(int i=0;i<3;++i)d[i]=a[i];return d;}
+ A_STATIC retAF4 opACpyF4(outAF4 d,inAF4 a){for(int i=0;i<4;++i)d[i]=a[i];return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){for(int i=0;i<2;++i)d[i]=ALerpD1(a[i],b[i],c[i]);return d;} // opALerp*: d[i]=ALerp(a[i],b[i],c[i]); returns d
+ A_STATIC retAD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){for(int i=0;i<3;++i)d[i]=ALerpD1(a[i],b[i],c[i]);return d;}
+ A_STATIC retAD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){for(int i=0;i<4;++i)d[i]=ALerpD1(a[i],b[i],c[i]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){for(int i=0;i<2;++i)d[i]=ALerpF1(a[i],b[i],c[i]);return d;}
+ A_STATIC retAF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){for(int i=0;i<3;++i)d[i]=ALerpF1(a[i],b[i],c[i]);return d;}
+ A_STATIC retAF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){for(int i=0;i<4;++i)d[i]=ALerpF1(a[i],b[i],c[i]);return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){for(int i=0;i<2;++i)d[i]=ALerpD1(a[i],b[i],c);return d;} // opALerpOne*: d[i]=ALerp(a[i],b[i],c) for scalar c; returns d
+ A_STATIC retAD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){for(int i=0;i<3;++i)d[i]=ALerpD1(a[i],b[i],c);return d;}
+ A_STATIC retAD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){for(int i=0;i<4;++i)d[i]=ALerpD1(a[i],b[i],c);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){for(int i=0;i<2;++i)d[i]=ALerpF1(a[i],b[i],c);return d;}
+ A_STATIC retAF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){for(int i=0;i<3;++i)d[i]=ALerpF1(a[i],b[i],c);return d;}
+ A_STATIC retAF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){for(int i=0;i<4;++i)d[i]=ALerpF1(a[i],b[i],c);return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){for(int i=0;i<2;++i)d[i]=AMaxD1(a[i],b[i]);return d;} // opAMax*: d[i]=AMax(a[i],b[i]); returns d
+ A_STATIC retAD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){for(int i=0;i<3;++i)d[i]=AMaxD1(a[i],b[i]);return d;}
+ A_STATIC retAD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){for(int i=0;i<4;++i)d[i]=AMaxD1(a[i],b[i]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){for(int i=0;i<2;++i)d[i]=AMaxF1(a[i],b[i]);return d;}
+ A_STATIC retAF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){for(int i=0;i<3;++i)d[i]=AMaxF1(a[i],b[i]);return d;}
+ A_STATIC retAF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){for(int i=0;i<4;++i)d[i]=AMaxF1(a[i],b[i]);return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){for(int i=0;i<2;++i)d[i]=AMinD1(a[i],b[i]);return d;} // opAMin*: d[i]=AMin(a[i],b[i]); returns d
+ A_STATIC retAD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){for(int i=0;i<3;++i)d[i]=AMinD1(a[i],b[i]);return d;}
+ A_STATIC retAD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){for(int i=0;i<4;++i)d[i]=AMinD1(a[i],b[i]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){for(int i=0;i<2;++i)d[i]=AMinF1(a[i],b[i]);return d;}
+ A_STATIC retAF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){for(int i=0;i<3;++i)d[i]=AMinF1(a[i],b[i]);return d;}
+ A_STATIC retAF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){for(int i=0;i<4;++i)d[i]=AMinF1(a[i],b[i]);return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){for(int i=0;i<2;i++)d[i]=a[i]*b[i];return d;} // opAMul*: component-wise product d[i]=a[i]*b[i]; returns d.
+ A_STATIC retAD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){for(int i=0;i<3;i++)d[i]=a[i]*b[i];return d;}
+ A_STATIC retAD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){for(int i=0;i<4;i++)d[i]=a[i]*b[i];return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){for(int i=0;i<2;i++)d[i]=a[i]*b[i];return d;}
+ A_STATIC retAF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){for(int i=0;i<3;i++)d[i]=a[i]*b[i];return d;}
+ A_STATIC retAF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){for(int i=0;i<4;i++)d[i]=a[i]*b[i];return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){for(int i=0;i<2;i++)d[i]=a[i]*b;return d;} // opAMulOne*: scale every component of a by the scalar b; returns d.
+ A_STATIC retAD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){for(int i=0;i<3;i++)d[i]=a[i]*b;return d;}
+ A_STATIC retAD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){for(int i=0;i<4;i++)d[i]=a[i]*b;return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){for(int i=0;i<2;i++)d[i]=a[i]*b;return d;}
+ A_STATIC retAF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){for(int i=0;i<3;i++)d[i]=a[i]*b;return d;}
+ A_STATIC retAF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){for(int i=0;i<4;i++)d[i]=a[i]*b;return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opANegD2(outAD2 d,inAD2 a){for(int i=0;i<2;i++)d[i]=-a[i];return d;} // opANeg*: component-wise negation d[i]=-a[i]; returns d.
+ A_STATIC retAD3 opANegD3(outAD3 d,inAD3 a){for(int i=0;i<3;i++)d[i]=-a[i];return d;}
+ A_STATIC retAD4 opANegD4(outAD4 d,inAD4 a){for(int i=0;i<4;i++)d[i]=-a[i];return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opANegF2(outAF2 d,inAF2 a){for(int i=0;i<2;i++)d[i]=-a[i];return d;}
+ A_STATIC retAF3 opANegF3(outAF3 d,inAF3 a){for(int i=0;i<3;i++)d[i]=-a[i];return d;}
+ A_STATIC retAF4 opANegF4(outAF4 d,inAF4 a){for(int i=0;i<4;i++)d[i]=-a[i];return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opARcpD2(outAD2 d,inAD2 a){for(int i=0;i<2;i++)d[i]=ARcpD1(a[i]);return d;} // opARcp*: d[i]=ARcp?1(a[i]) per component; returns d.
+ A_STATIC retAD3 opARcpD3(outAD3 d,inAD3 a){for(int i=0;i<3;i++)d[i]=ARcpD1(a[i]);return d;}
+ A_STATIC retAD4 opARcpD4(outAD4 d,inAD4 a){for(int i=0;i<4;i++)d[i]=ARcpD1(a[i]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opARcpF2(outAF2 d,inAF2 a){for(int i=0;i<2;i++)d[i]=ARcpF1(a[i]);return d;}
+ A_STATIC retAF3 opARcpF3(outAF3 d,inAF3 a){for(int i=0;i<3;i++)d[i]=ARcpF1(a[i]);return d;}
+ A_STATIC retAF4 opARcpF4(outAF4 d,inAF4 a){for(int i=0;i<4;i++)d[i]=ARcpF1(a[i]);return d;}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     HALF FLOAT PACKING
+//==============================================================================================================================
+ // Convert a 32-bit float to 16-bit half bits, returned in the lower 16 bits of the output (the upper 16 bits are zero).
+ // Same fast table-driven technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
+ // The top 9 bits of the float (sign+exponent) index 'base' (half sign/exponent bits) and 'shift' (mantissa right-shift).
+ // The mantissa is truncated, not rounded. Supports denormals: tiny inputs become half denormals or signed zero.
+ // Conversion rules are to make computations possibly "safer" on the GPU, so out-of-range inputs saturate instead of becoming INF/NaN:
+ //  -INF & -NaN -> -65504, +INF & +NaN -> +65504
+ A_STATIC AU1 AU1_AH1_AF1(AF1 f){
+  static AW1 base[512]={
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100,
+   0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00,
+   0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100,
+   0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00,
+   0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff};
+  static AB1 shift[512]={
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
+   0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
+   0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
+   0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
+   0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18};
+  union{AF1 f;AU1 u;}bits;bits.f=f;AU1 u=bits.u;AU1 i=u>>23;return (AU1)(base[i])+((u&0x7fffff)>>shift[i]);} // Read the float's bits via a union; i is the 9-bit sign+exponent index.
+//------------------------------------------------------------------------------------------------------------------------------
+ // Used to output a packed constant: a[0] as half bits in the low 16 bits, a[1] as half bits in the high 16 bits.
+ A_STATIC AU1 AU1_AH2_AF2(inAF2 a){AU1 lo=AU1_AH1_AF1(a[0]);AU1 hi=AU1_AH1_AF1(a[1]);return lo+(hi<<16);}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+//                                                            GLSL
+//
+//
+//==============================================================================================================================
+#if defined(A_GLSL) && defined(A_GPU)
+ #ifndef A_SKIP_EXT // Define A_SKIP_EXT to suppress every #extension directive below.
+  #ifdef A_HALF // 16-bit storage plus explicit arithmetic types, used by the GLSL HALF section.
+   #extension GL_EXT_shader_16bit_storage:require
+   #extension GL_EXT_shader_explicit_arithmetic_types:require 
+  #endif
+//------------------------------------------------------------------------------------------------------------------------------
+  #ifdef A_LONG // 64-bit integers and 64-bit atomics. NOTE(review): the atomics extension is NVIDIA-specific and marked 'require' - confirm A_LONG is never set on other drivers.
+   #extension GL_ARB_gpu_shader_int64:require
+   #extension GL_NV_shader_atomic_int64:require
+  #endif
+//------------------------------------------------------------------------------------------------------------------------------
+  #ifdef A_WAVE // Subgroup (wave) operations: arithmetic, ballot, quad and shuffle.
+   #extension GL_KHR_shader_subgroup_arithmetic:require
+   #extension GL_KHR_shader_subgroup_ballot:require
+   #extension GL_KHR_shader_subgroup_quad:require
+   #extension GL_KHR_shader_subgroup_shuffle:require
+  #endif
+ #endif
+//==============================================================================================================================
+ #define AP1 bool // AP*: boolean scalar and 2/3/4-component vectors.
+ #define AP2 bvec2
+ #define AP3 bvec3
+ #define AP4 bvec4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AF1 float // AF*: 32-bit float scalar and vectors.
+ #define AF2 vec2
+ #define AF3 vec3
+ #define AF4 vec4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AU1 uint // AU*: 32-bit unsigned integer scalar and vectors.
+ #define AU2 uvec2
+ #define AU3 uvec3
+ #define AU4 uvec4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ASU1 int // ASU*: signed 32-bit counterparts of AU*, used to reinterpret AU* values as signed.
+ #define ASU2 ivec2
+ #define ASU3 ivec3
+ #define ASU4 ivec4
+//==============================================================================================================================
+ #define AF1_AU1(x) uintBitsToFloat(AU1(x)) // AFn_AUn: reinterpret uint bits as float (no numeric conversion).
+ #define AF2_AU2(x) uintBitsToFloat(AU2(x))
+ #define AF3_AU3(x) uintBitsToFloat(AU3(x))
+ #define AF4_AU4(x) uintBitsToFloat(AU4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AU1_AF1(x) floatBitsToUint(AF1(x)) // AUn_AFn: reinterpret float bits as uint (no numeric conversion).
+ #define AU2_AF2(x) floatBitsToUint(AF2(x))
+ #define AU3_AF3(x) floatBitsToUint(AF3(x))
+ #define AU4_AF4(x) floatBitsToUint(AF4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AU1_AH1_AF1_x(AF1 a){return packHalf2x16(AF2(a,0.0));} // Half bits of 'a' in the low 16 bits; the high 16 bits hold half(0.0).
+ #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AU1_AH2_AF2 packHalf2x16 // Pack aliases: two halves, two 16-bit unorms, or four 8-bit unorms into one uint.
+ #define AU1_AW2Unorm_AF2 packUnorm2x16
+ #define AU1_AB4Unorm_AF4 packUnorm4x8
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AF2_AH2_AU1 unpackHalf2x16 // Unpack aliases: the matching GLSL unpack built-ins for the pack aliases.
+ #define AF2_AW2Unorm_AU1 unpackUnorm2x16
+ #define AF4_AB4Unorm_AU1 unpackUnorm4x8
+//==============================================================================================================================
+ AF1 AF1_x(AF1 a){return AF1(a);} // AFn_x: replicate one float into every component of an AFn.
+ AF2 AF2_x(AF1 a){return AF2(a,a);}
+ AF3 AF3_x(AF1 a){return AF3(a,a,a);}
+ AF4 AF4_x(AF1 a){return AF4(a,a,a,a);}
+ #define AF1_(a) AF1_x(AF1(a)) // AFn_(a): casts the argument to AF1, then broadcasts it with AFn_x.
+ #define AF2_(a) AF2_x(AF1(a))
+ #define AF3_(a) AF3_x(AF1(a))
+ #define AF4_(a) AF4_x(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AU1_x(AU1 a){return AU1(a);} // AUn_x: replicate one uint into every component of an AUn.
+ AU2 AU2_x(AU1 a){return AU2(a,a);}
+ AU3 AU3_x(AU1 a){return AU3(a,a,a);}
+ AU4 AU4_x(AU1 a){return AU4(a,a,a,a);}
+ #define AU1_(a) AU1_x(AU1(a)) // AUn_(a): casts the argument to AU1, then broadcasts it with AUn_x.
+ #define AU2_(a) AU2_x(AU1(a))
+ #define AU3_(a) AU3_x(AU1(a))
+ #define AU4_(a) AU4_x(AU1(a))
+//==============================================================================================================================
+ AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} // AAbsSU*: absolute value of the bits read as signed int, returned as uint.
+ AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));}
+ AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));}
+ AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 ABfe(AU1 src,AU1 off,AU1 bits){return bitfieldExtract(src,ASU1(off),ASU1(bits));} // Bit-field extract: 'bits' bits of src starting at bit 'off' (unsigned).
+ AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} // Bit-field insert: take 'ins' where mask bits are 1 and 'src' elsewhere.
+ // Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<<bits)-1', and 'bits' needs to be an immediate.
+ AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){return bitfieldInsert(src,ins,0,ASU1(bits));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Clamp x to the range [n,m], component-wise. V_MED3_F32.
+ AF1 AClampF1(AF1 x,AF1 n,AF1 m){return clamp(x,n,m);}
+ AF2 AClampF2(AF2 x,AF2 n,AF2 m){return clamp(x,n,m);}
+ AF3 AClampF3(AF3 x,AF3 n,AF3 m){return clamp(x,n,m);}
+ AF4 AClampF4(AF4 x,AF4 n,AF4 m){return clamp(x,n,m);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Fractional part via GLSL fract(). V_FRACT_F32 (note DX frac() is different).
+ AF1 AFractF1(AF1 x){return fract(x);}
+ AF2 AFractF2(AF2 x){return fract(x);}
+ AF3 AFractF3(AF3 x){return fract(x);}
+ AF4 AFractF4(AF4 x){return fract(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ALerpF1(AF1 x,AF1 y,AF1 a){return mix(x,y,a);} // ALerpF*: linear blend via mix(): x when a=0, y when a=1.
+ AF2 ALerpF2(AF2 x,AF2 y,AF2 a){return mix(x,y,a);}
+ AF3 ALerpF3(AF3 x,AF3 y,AF3 a){return mix(x,y,a);}
+ AF4 ALerpF4(AF4 x,AF4 y,AF4 a){return mix(x,y,a);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Largest of three floats, component-wise. V_MAX3_F32.
+ AF1 AMax3F1(AF1 x,AF1 y,AF1 z){return max(x,max(y,z));}
+ AF2 AMax3F2(AF2 x,AF2 y,AF2 z){return max(x,max(y,z));}
+ AF3 AMax3F3(AF3 x,AF3 y,AF3 z){return max(x,max(y,z));}
+ AF4 AMax3F4(AF4 x,AF4 y,AF4 z){return max(x,max(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){return AU1(max(ASU1(x),max(ASU1(y),ASU1(z))));} // AMax3SU*: largest of three, comparing the bits as signed int.
+ AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){return AU2(max(ASU2(x),max(ASU2(y),ASU2(z))));}
+ AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){return AU3(max(ASU3(x),max(ASU3(y),ASU3(z))));}
+ AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){return AU4(max(ASU4(x),max(ASU4(y),ASU4(z))));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMax3U1(AU1 x,AU1 y,AU1 z){return max(x,max(y,z));} // AMax3U*: largest of three, unsigned comparison.
+ AU2 AMax3U2(AU2 x,AU2 y,AU2 z){return max(x,max(y,z));}
+ AU3 AMax3U3(AU3 x,AU3 y,AU3 z){return max(x,max(y,z));}
+ AU4 AMax3U4(AU4 x,AU4 y,AU4 z){return max(x,max(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMaxSU1(AU1 a,AU1 b){return AU1(max(ASU1(a),ASU1(b)));} // AMaxSU*: larger of two, comparing the bits as signed int.
+ AU2 AMaxSU2(AU2 a,AU2 b){return AU2(max(ASU2(a),ASU2(b)));}
+ AU3 AMaxSU3(AU3 a,AU3 b){return AU3(max(ASU3(a),ASU3(b)));}
+ AU4 AMaxSU4(AU4 a,AU4 b){return AU4(max(ASU4(a),ASU4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Median (middle value) of x, y and z, component-wise. V_MED3_F32.
+ // Clamp has an easier pattern match for med3 when some ordering is known, so prefer AClampF* in that case.
+ AF1 AMed3F1(AF1 x,AF1 y,AF1 z){return max(min(x,y),min(max(x,y),z));}
+ AF2 AMed3F2(AF2 x,AF2 y,AF2 z){return max(min(x,y),min(max(x,y),z));}
+ AF3 AMed3F3(AF3 x,AF3 y,AF3 z){return max(min(x,y),min(max(x,y),z));}
+ AF4 AMed3F4(AF4 x,AF4 y,AF4 z){return max(min(x,y),min(max(x,y),z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Smallest of three floats, component-wise. V_MIN3_F32.
+ AF1 AMin3F1(AF1 x,AF1 y,AF1 z){return min(x,min(y,z));}
+ AF2 AMin3F2(AF2 x,AF2 y,AF2 z){return min(x,min(y,z));}
+ AF3 AMin3F3(AF3 x,AF3 y,AF3 z){return min(x,min(y,z));}
+ AF4 AMin3F4(AF4 x,AF4 y,AF4 z){return min(x,min(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMin3SU1(AU1 x,AU1 y,AU1 z){return AU1(min(ASU1(x),min(ASU1(y),ASU1(z))));} // AMin3SU*: smallest of three, comparing the bits as signed int.
+ AU2 AMin3SU2(AU2 x,AU2 y,AU2 z){return AU2(min(ASU2(x),min(ASU2(y),ASU2(z))));}
+ AU3 AMin3SU3(AU3 x,AU3 y,AU3 z){return AU3(min(ASU3(x),min(ASU3(y),ASU3(z))));}
+ AU4 AMin3SU4(AU4 x,AU4 y,AU4 z){return AU4(min(ASU4(x),min(ASU4(y),ASU4(z))));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMin3U1(AU1 x,AU1 y,AU1 z){return min(x,min(y,z));} // AMin3U*: smallest of three, unsigned comparison.
+ AU2 AMin3U2(AU2 x,AU2 y,AU2 z){return min(x,min(y,z));}
+ AU3 AMin3U3(AU3 x,AU3 y,AU3 z){return min(x,min(y,z));}
+ AU4 AMin3U4(AU4 x,AU4 y,AU4 z){return min(x,min(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMinSU1(AU1 a,AU1 b){return AU1(min(ASU1(a),ASU1(b)));} // AMinSU*: smaller of two, comparing the bits as signed int.
+ AU2 AMinSU2(AU2 a,AU2 b){return AU2(min(ASU2(a),ASU2(b)));}
+ AU3 AMinSU3(AU3 a,AU3 b){return AU3(min(ASU3(a),ASU3(b)));}
+ AU4 AMinSU4(AU4 a,AU4 b){return AU4(min(ASU4(a),ASU4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Normalized cosine: x is multiplied by A_2PI before cos(). Valid input domain is {-256 to +256}.
+ // No GLSL compiler intrinsic exists to map to V_COS_F32 currently. NOTE(review): A_2PI is defined outside this section, presumably 2*pi.
+ AF1 ANCosF1(AF1 x){return cos(x*AF1_(A_2PI));}
+ AF2 ANCosF2(AF2 x){return cos(x*AF2_(A_2PI));}
+ AF3 ANCosF3(AF3 x){return cos(x*AF3_(A_2PI));}
+ AF4 ANCosF4(AF4 x){return cos(x*AF4_(A_2PI));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Normalized sine: x is multiplied by A_2PI before sin(). Valid input domain is {-256 to +256}.
+ // No GLSL compiler intrinsic exists to map to V_SIN_F32 currently. NOTE(review): A_2PI is defined outside this section, presumably 2*pi.
+ AF1 ANSinF1(AF1 x){return sin(x*AF1_(A_2PI));}
+ AF2 ANSinF2(AF2 x){return sin(x*AF2_(A_2PI));}
+ AF3 ANSinF3(AF3 x){return sin(x*AF3_(A_2PI));}
+ AF4 ANSinF4(AF4 x){return sin(x*AF4_(A_2PI));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ARcpF1(AF1 x){return AF1_(1.0)/x;} // ARcpF*: reciprocal 1/x, component-wise (no guard for x==0).
+ AF2 ARcpF2(AF2 x){return AF2_(1.0)/x;}
+ AF3 ARcpF3(AF3 x){return AF3_(1.0)/x;}
+ AF4 ARcpF4(AF4 x){return AF4_(1.0)/x;}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ARsqF1(AF1 x){return AF1_(1.0)/sqrt(x);} // ARsqF*: reciprocal square root 1/sqrt(x), component-wise.
+ AF2 ARsqF2(AF2 x){return AF2_(1.0)/sqrt(x);}
+ AF3 ARsqF3(AF3 x){return AF3_(1.0)/sqrt(x);}
+ AF4 ARsqF4(AF4 x){return AF4_(1.0)/sqrt(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ASatF1(AF1 x){return clamp(x,AF1_(0.0),AF1_(1.0));} // ASatF*: saturate, i.e. clamp to [0,1].
+ AF2 ASatF2(AF2 x){return clamp(x,AF2_(0.0),AF2_(1.0));}
+ AF3 ASatF3(AF3 x){return clamp(x,AF3_(0.0),AF3_(1.0));}
+ AF4 ASatF4(AF4 x){return clamp(x,AF4_(0.0),AF4_(1.0));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AShrSU1(AU1 a,AU1 b){return AU1(ASU1(a)>>ASU1(b));} // AShrSU*: shift right with both operands read as signed int, so the sign bit is propagated.
+ AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));}
+ AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));}
+ AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                          GLSL BYTE
+//==============================================================================================================================
+ #ifdef A_BYTE // 8-bit types. NOTE(review): the #extension list in this file has no A_BYTE branch - confirm callers enable an 8-bit types extension themselves.
+  #define AB1 uint8_t
+  #define AB2 u8vec2
+  #define AB3 u8vec3
+  #define AB4 u8vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define ASB1 int8_t
+  #define ASB2 i8vec2
+  #define ASB3 i8vec3
+  #define ASB4 i8vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  AB1 AB1_x(AB1 a){return AB1(a);} // ABn_x: replicate one 8-bit uint into every component of an ABn.
+  AB2 AB2_x(AB1 a){return AB2(a,a);}
+  AB3 AB3_x(AB1 a){return AB3(a,a,a);}
+  AB4 AB4_x(AB1 a){return AB4(a,a,a,a);}
+  #define AB1_(a) AB1_x(AB1(a)) // ABn_(a): casts the argument to AB1, then broadcasts it with ABn_x.
+  #define AB2_(a) AB2_x(AB1(a))
+  #define AB3_(a) AB3_x(AB1(a))
+  #define AB4_(a) AB4_x(AB1(a))
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                          GLSL HALF
+//==============================================================================================================================
+ #ifdef A_HALF
+  #define AH1 float16_t // AH*: 16-bit float scalar and vectors.
+  #define AH2 f16vec2
+  #define AH3 f16vec3
+  #define AH4 f16vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AW1 uint16_t // AW*: 16-bit unsigned integer scalar and vectors.
+  #define AW2 u16vec2
+  #define AW3 u16vec3
+  #define AW4 u16vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define ASW1 int16_t // ASW*: signed 16-bit counterparts of AW*, used to reinterpret AW* values as signed.
+  #define ASW2 i16vec2
+  #define ASW3 i16vec3
+  #define ASW4 i16vec4
+//==============================================================================================================================
+  #define AH2_AU1(x) unpackFloat2x16(AU1(x)) // One 32-bit word -> two halves.
+  AH4 AH4_AU2_x(AU2 x){return AH4(unpackFloat2x16(x.x),unpackFloat2x16(x.y));} // x.x supplies the first two components, x.y the last two.
+  #define AH4_AU2(x) AH4_AU2_x(AU2(x))
+  #define AW2_AU1(x) unpackUint2x16(AU1(x)) // One 32-bit word -> two 16-bit uints.
+  #define AW4_AU2(x) unpackUint4x16(pack64(AU2(x))) // Two 32-bit words -> four 16-bit uints, going through a 64-bit integer.
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AU1_AH2(x) packFloat2x16(AH2(x)) // Two halves -> one 32-bit word.
+  AU2 AU2_AH4_x(AH4 x){return AU2(packFloat2x16(x.xy),packFloat2x16(x.zw));} // x.xy packs into the first word, x.zw into the second.
+  #define AU2_AH4(x) AU2_AH4_x(AH4(x))
+  #define AU1_AW2(x) packUint2x16(AW2(x)) // Two 16-bit uints -> one 32-bit word.
+  #define AU2_AW4(x) unpack32(packUint4x16(AW4(x))) // Four 16-bit uints -> two 32-bit words, going through a 64-bit integer.
+//==============================================================================================================================
+  #define AW1_AH1(x) halfBitsToUint16(AH1(x)) // AWn_AHn: reinterpret half bits as 16-bit uint (no numeric conversion).
+  #define AW2_AH2(x) halfBitsToUint16(AH2(x))
+  #define AW3_AH3(x) halfBitsToUint16(AH3(x))
+  #define AW4_AH4(x) halfBitsToUint16(AH4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AH1_AW1(x) uint16BitsToHalf(AW1(x)) // AHn_AWn: reinterpret 16-bit uint bits as half (no numeric conversion).
+  #define AH2_AW2(x) uint16BitsToHalf(AW2(x))
+  #define AH3_AW3(x) uint16BitsToHalf(AW3(x))
+  #define AH4_AW4(x) uint16BitsToHalf(AW4(x))
+//==============================================================================================================================
+  AH1 AH1_x(AH1 a){return AH1(a);} // AHn_x: replicate one half into every component of an AHn.
+  AH2 AH2_x(AH1 a){return AH2(a,a);}
+  AH3 AH3_x(AH1 a){return AH3(a,a,a);}
+  AH4 AH4_x(AH1 a){return AH4(a,a,a,a);}
+  #define AH1_(a) AH1_x(AH1(a)) // AHn_(a): casts the argument to AH1, then broadcasts it with AHn_x.
+  #define AH2_(a) AH2_x(AH1(a))
+  #define AH3_(a) AH3_x(AH1(a))
+  #define AH4_(a) AH4_x(AH1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AW1_x(AW1 a){return AW1(a);} // AWn_x: replicate one 16-bit uint into every component of an AWn.
+  AW2 AW2_x(AW1 a){return AW2(a,a);}
+  AW3 AW3_x(AW1 a){return AW3(a,a,a);}
+  AW4 AW4_x(AW1 a){return AW4(a,a,a,a);}
+  #define AW1_(a) AW1_x(AW1(a)) // AWn_(a): casts the argument to AW1, then broadcasts it with AWn_x.
+  #define AW2_(a) AW2_x(AW1(a))
+  #define AW3_(a) AW3_x(AW1(a))
+  #define AW4_(a) AW4_x(AW1(a))
+//==============================================================================================================================
+  AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));} // AAbsSW*: absolute value of the bits read as signed 16-bit int, returned as 16-bit uint.
+  AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));}
+  AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));}
+  AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AClampH1(AH1 x,AH1 n,AH1 m){return clamp(x,n,m);} // AClampH*: clamp x to the range [n,m], component-wise.
+  AH2 AClampH2(AH2 x,AH2 n,AH2 m){return clamp(x,n,m);}
+  AH3 AClampH3(AH3 x,AH3 n,AH3 m){return clamp(x,n,m);}
+  AH4 AClampH4(AH4 x,AH4 n,AH4 m){return clamp(x,n,m);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AFractH1(AH1 x){return fract(x);} // AFractH*: fractional part via GLSL fract().
+  AH2 AFractH2(AH2 x){return fract(x);}
+  AH3 AFractH3(AH3 x){return fract(x);}
+  AH4 AFractH4(AH4 x){return fract(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return mix(x,y,a);} // ALerpH*: linear blend via mix(): x when a=0, y when a=1.
+  AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return mix(x,y,a);}
+  AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return mix(x,y,a);}
+  AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return mix(x,y,a);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Largest of three halves, component-wise. No packed version of max3.
+  AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));}
+  AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));}
+  AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));}
+  AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASW1(a),ASW1(b)));} // AMaxSW*: larger of two, comparing the bits as signed 16-bit ints.
+  AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASW2(a),ASW2(b)));} // Operands are cast with ASW* (16-bit) so bit 15 acts as the sign;
+  AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASW3(a),ASW3(b)));} // casting with the 32-bit ASU* would zero-extend the value and
+  AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASW4(a),ASW4(b)));} // turn this into an unsigned max (0xFFFF, i.e. -1, would beat 1).
+//------------------------------------------------------------------------------------------------------------------------------
+  // Smallest of three halves, component-wise. No packed version of min3.
+  AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));}
+  AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));}
+  AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));}
+  AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASW1(a),ASW1(b)));} // AMinSW*: smaller of two, comparing the bits as signed 16-bit ints.
+  AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASW2(a),ASW2(b)));} // Operands are cast with ASW* (16-bit) so bit 15 acts as the sign;
+  AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASW3(a),ASW3(b)));} // casting with the 32-bit ASU* would zero-extend the value and
+  AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASW4(a),ASW4(b)));} // turn this into an unsigned min (1 would beat 0xFFFF, i.e. -1).
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ARcpH1(AH1 v){return AH1_(1.0)/v;} // Reciprocal, written as a plain 1.0/v division.
+  AH2 ARcpH2(AH2 v){return AH2_(1.0)/v;}
+  AH3 ARcpH3(AH3 v){return AH3_(1.0)/v;}
+  AH4 ARcpH4(AH4 v){return AH4_(1.0)/v;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ARsqH1(AH1 v){return AH1_(1.0)/sqrt(v);} // Reciprocal square root, written as 1.0/sqrt(v).
+  AH2 ARsqH2(AH2 v){return AH2_(1.0)/sqrt(v);}
+  AH3 ARsqH3(AH3 v){return AH3_(1.0)/sqrt(v);}
+  AH4 ARsqH4(AH4 v){return AH4_(1.0)/sqrt(v);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ASatH1(AH1 v){return clamp(v,AH1_(0.0),AH1_(1.0));} // Saturate: clamp each component to [0,1].
+  AH2 ASatH2(AH2 v){return clamp(v,AH2_(0.0),AH2_(1.0));}
+  AH3 ASatH3(AH3 v){return clamp(v,AH3_(0.0),AH3_(1.0));}
+  AH4 ASatH4(AH4 v){return clamp(v,AH4_(0.0),AH4_(1.0));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AShrSW1(AW1 v,AW1 n){return AW1(ASW1(v)>>ASW1(n));} // Right shift performed on the signed (ASW) view of both operands; result returned as unsigned AW.
+  AW2 AShrSW2(AW2 v,AW2 n){return AW2(ASW2(v)>>ASW2(n));}
+  AW3 AShrSW3(AW3 v,AW3 n){return AW3(ASW3(v)>>ASW3(n));}
+  AW4 AShrSW4(AW4 v,AW4 n){return AW4(ASW4(v)>>ASW4(n));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                         GLSL DOUBLE
+//==============================================================================================================================
+ #ifdef A_DUBL
+  #define AD1 double // AD*: double-precision float scalar and 2/3/4-component vectors.
+  #define AD2 dvec2
+  #define AD3 dvec3
+  #define AD4 dvec4
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 AD1_x(AD1 v){return AD1(v);} // Broadcast helpers: replicate one scalar into every component of the result.
+  AD2 AD2_x(AD1 v){return AD2(v,v);}
+  AD3 AD3_x(AD1 v){return AD3(v,v,v);}
+  AD4 AD4_x(AD1 v){return AD4(v,v,v,v);}
+  #define AD1_(v) AD1_x(AD1(v)) // Macro front-ends convert the argument to AD1 before calling the helper.
+  #define AD2_(v) AD2_x(AD1(v))
+  #define AD3_(v) AD3_x(AD1(v))
+  #define AD4_(v) AD4_x(AD1(v))
+//==============================================================================================================================
+  AD1 AFractD1(AD1 v){return fract(v);} // Fractional part of each component (GLSL fract()).
+  AD2 AFractD2(AD2 v){return fract(v);}
+  AD3 AFractD3(AD3 v){return fract(v);}
+  AD4 AFractD4(AD4 v){return fract(v);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 ALerpD1(AD1 lo,AD1 hi,AD1 t){return mix(lo,hi,t);} // Linear interpolation from lo to hi by weight t (GLSL mix()).
+  AD2 ALerpD2(AD2 lo,AD2 hi,AD2 t){return mix(lo,hi,t);}
+  AD3 ALerpD3(AD3 lo,AD3 hi,AD3 t){return mix(lo,hi,t);}
+  AD4 ALerpD4(AD4 lo,AD4 hi,AD4 t){return mix(lo,hi,t);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 ARcpD1(AD1 v){return AD1_(1.0)/v;} // Reciprocal, written as a plain 1.0/v division.
+  AD2 ARcpD2(AD2 v){return AD2_(1.0)/v;}
+  AD3 ARcpD3(AD3 v){return AD3_(1.0)/v;}
+  AD4 ARcpD4(AD4 v){return AD4_(1.0)/v;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 ARsqD1(AD1 v){return AD1_(1.0)/sqrt(v);} // Reciprocal square root, written as 1.0/sqrt(v).
+  AD2 ARsqD2(AD2 v){return AD2_(1.0)/sqrt(v);}
+  AD3 ARsqD3(AD3 v){return AD3_(1.0)/sqrt(v);}
+  AD4 ARsqD4(AD4 v){return AD4_(1.0)/sqrt(v);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 ASatD1(AD1 v){return clamp(v,AD1_(0.0),AD1_(1.0));} // Saturate: clamp each component to [0,1].
+  AD2 ASatD2(AD2 v){return clamp(v,AD2_(0.0),AD2_(1.0));}
+  AD3 ASatD3(AD3 v){return clamp(v,AD3_(0.0),AD3_(1.0));}
+  AD4 ASatD4(AD4 v){return clamp(v,AD4_(0.0),AD4_(1.0));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                         GLSL LONG
+//==============================================================================================================================
+ #ifdef A_LONG
+  #define AL1 uint64_t // AL*: unsigned 64-bit integer scalar and 2/3/4-component vectors.
+  #define AL2 u64vec2
+  #define AL3 u64vec3
+  #define AL4 u64vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define ASL1 int64_t // ASL*: signed 64-bit counterparts of AL*, used below to run signed operations on AL values.
+  #define ASL2 i64vec2
+  #define ASL3 i64vec3
+  #define ASL4 i64vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AL1_AU2(v) packUint2x32(AU2(v)) // Combine a two-component 32-bit AU2 into one 64-bit AL1.
+  #define AU2_AL1(v) unpackUint2x32(AL1(v)) // Split a 64-bit AL1 back into a two-component 32-bit AU2.
+//------------------------------------------------------------------------------------------------------------------------------
+  AL1 AL1_x(AL1 v){return AL1(v);} // Broadcast helpers: replicate one scalar into every component of the result.
+  AL2 AL2_x(AL1 v){return AL2(v,v);}
+  AL3 AL3_x(AL1 v){return AL3(v,v,v);}
+  AL4 AL4_x(AL1 v){return AL4(v,v,v,v);}
+  #define AL1_(v) AL1_x(AL1(v)) // Macro front-ends convert the argument to AL1 before calling the helper.
+  #define AL2_(v) AL2_x(AL1(v))
+  #define AL3_(v) AL3_x(AL1(v))
+  #define AL4_(v) AL4_x(AL1(v))
+//==============================================================================================================================
+  AL1 AAbsSL1(AL1 v){return AL1(abs(ASL1(v)));} // Absolute value of the signed (ASL) view of v, returned as unsigned AL.
+  AL2 AAbsSL2(AL2 v){return AL2(abs(ASL2(v)));}
+  AL3 AAbsSL3(AL3 v){return AL3(abs(ASL3(v)));}
+  AL4 AAbsSL4(AL4 v){return AL4(abs(ASL4(v)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AL1 AMaxSL1(AL1 a,AL1 b){return AL1(max(ASL1(a),ASL1(b)));} // Signed max of 64-bit values: compared through the 64-bit signed ASL types so all 64 bits take part.
+  AL2 AMaxSL2(AL2 a,AL2 b){return AL2(max(ASL2(a),ASL2(b)));} // Result is converted back to the unsigned AL type, matching AAbsSL*.
+  AL3 AMaxSL3(AL3 a,AL3 b){return AL3(max(ASL3(a),ASL3(b)));}
+  AL4 AMaxSL4(AL4 a,AL4 b){return AL4(max(ASL4(a),ASL4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AL1 AMinSL1(AL1 a,AL1 b){return AL1(min(ASL1(a),ASL1(b)));} // Signed min of 64-bit values: compared through the 64-bit signed ASL types so all 64 bits take part.
+  AL2 AMinSL2(AL2 a,AL2 b){return AL2(min(ASL2(a),ASL2(b)));} // Result is converted back to the unsigned AL type, matching AAbsSL*.
+  AL3 AMinSL3(AL3 a,AL3 b){return AL3(min(ASL3(a),ASL3(b)));}
+  AL4 AMinSL4(AL4 a,AL4 b){return AL4(min(ASL4(a),ASL4(b)));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                      WAVE OPERATIONS
+//==============================================================================================================================
+ #ifdef A_WAVE
+  // Subgroup XOR shuffle wrappers around subgroupShuffleXor(); 'mask' must be a compile time literal.
+  AF1 AWaveXorF1(AF1 val,AU1 mask){return subgroupShuffleXor(val,mask);}
+  AF2 AWaveXorF2(AF2 val,AU1 mask){return subgroupShuffleXor(val,mask);}
+  AF3 AWaveXorF3(AF3 val,AU1 mask){return subgroupShuffleXor(val,mask);}
+  AF4 AWaveXorF4(AF4 val,AU1 mask){return subgroupShuffleXor(val,mask);}
+  AU1 AWaveXorU1(AU1 val,AU1 mask){return subgroupShuffleXor(val,mask);}
+  AU2 AWaveXorU2(AU2 val,AU1 mask){return subgroupShuffleXor(val,mask);}
+  AU3 AWaveXorU3(AU3 val,AU1 mask){return subgroupShuffleXor(val,mask);}
+  AU4 AWaveXorU4(AU4 val,AU1 mask){return subgroupShuffleXor(val,mask);}
+//------------------------------------------------------------------------------------------------------------------------------
+  #ifdef A_HALF
+   AH2 AWaveXorH2(AH2 val,AU1 mask){return AH2_AU1(subgroupShuffleXor(AU1_AH2(val),mask));} // 16-bit vectors are packed into 32-bit words for the shuffle, then unpacked again.
+   AH4 AWaveXorH4(AH4 val,AU1 mask){return AH4_AU2(subgroupShuffleXor(AU2_AH4(val),mask));}
+   AW2 AWaveXorW2(AW2 val,AU1 mask){return AW2_AU1(subgroupShuffleXor(AU1_AW2(val),mask));}
+   AW4 AWaveXorW4(AW4 val,AU1 mask){return AW4_AU2(subgroupShuffleXor(AU2_AW4(val),mask));}
+  #endif
+ #endif
+//==============================================================================================================================
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+//                                                            HLSL
+//
+//
+//==============================================================================================================================
+#if defined(A_HLSL) && defined(A_GPU)
+ #ifdef A_HLSL_6_2 // Explicitly sized 32-bit type names, selected when A_HLSL_6_2 is defined.
+  #define AP1 bool // AP*: bool scalar and 2/3/4-component vectors.
+  #define AP2 bool2
+  #define AP3 bool3
+  #define AP4 bool4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AF1 float32_t // AF*: 32-bit float scalar and vectors.
+  #define AF2 float32_t2
+  #define AF3 float32_t3
+  #define AF4 float32_t4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AU1 uint32_t // AU*: 32-bit unsigned integer scalar and vectors.
+  #define AU2 uint32_t2
+  #define AU3 uint32_t3
+  #define AU4 uint32_t4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define ASU1 int32_t // ASU*: 32-bit signed integer scalar and vectors.
+  #define ASU2 int32_t2
+  #define ASU3 int32_t3
+  #define ASU4 int32_t4
+ #else // Fallback: the same aliases spelled with the classic bool/float/uint/int type names.
+  #define AP1 bool
+  #define AP2 bool2
+  #define AP3 bool3
+  #define AP4 bool4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AF1 float
+  #define AF2 float2
+  #define AF3 float3
+  #define AF4 float4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AU1 uint
+  #define AU2 uint2
+  #define AU3 uint3
+  #define AU4 uint4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define ASU1 int
+  #define ASU2 int2
+  #define ASU3 int3
+  #define ASU4 int4
+ #endif
+//==============================================================================================================================
+ #define AF1_AU1(v) asfloat(AU1(v)) // Reinterpret unsigned integer bits as float via asfloat().
+ #define AF2_AU2(v) asfloat(AU2(v))
+ #define AF3_AU3(v) asfloat(AU3(v))
+ #define AF4_AU4(v) asfloat(AU4(v))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AU1_AF1(v) asuint(AF1(v)) // Reinterpret float bits as unsigned integer via asuint().
+ #define AU2_AF2(v) asuint(AF2(v))
+ #define AU3_AF3(v) asuint(AF3(v))
+ #define AU4_AF4(v) asuint(AF4(v))
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AU1_AH1_AF1_x(AF1 v){return f32tof16(v);} // Convert a float to half-precision bits, returned in an AU1 (f32tof16()).
+ #define AU1_AH1_AF1(v) AU1_AH1_AF1_x(AF1(v))
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AU1_AH2_AF2_x(AF2 v){return f32tof16(v.x)|(f32tof16(v.y)<<16);} // Pack two floats as halves into one word: .x in bits 0-15, .y in bits 16-31.
+ #define AU1_AH2_AF2(v) AU1_AH2_AF2_x(AF2(v))
+ #define AU1_AB4Unorm_AF4(v) D3DCOLORtoUBYTE4(AF4(v))
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 AF2_AH2_AU1_x(AU1 bits){return AF2(f16tof32(bits&0xFFFF),f16tof32(bits>>16));} // Unpack two halves from one word: low 16 bits -> .x, high 16 bits -> .y.
+ #define AF2_AH2_AU1(bits) AF2_AH2_AU1_x(AU1(bits))
+//==============================================================================================================================
+ AF1 AF1_x(AF1 v){return AF1(v);} // Broadcast helpers: replicate one scalar into every component of the result.
+ AF2 AF2_x(AF1 v){return AF2(v,v);}
+ AF3 AF3_x(AF1 v){return AF3(v,v,v);}
+ AF4 AF4_x(AF1 v){return AF4(v,v,v,v);}
+ #define AF1_(v) AF1_x(AF1(v)) // Macro front-ends convert the argument to AF1 before calling the helper.
+ #define AF2_(v) AF2_x(AF1(v))
+ #define AF3_(v) AF3_x(AF1(v))
+ #define AF4_(v) AF4_x(AF1(v))
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AU1_x(AU1 v){return AU1(v);} // Broadcast helpers: replicate one scalar into every component of the result.
+ AU2 AU2_x(AU1 v){return AU2(v,v);}
+ AU3 AU3_x(AU1 v){return AU3(v,v,v);}
+ AU4 AU4_x(AU1 v){return AU4(v,v,v,v);}
+ #define AU1_(v) AU1_x(AU1(v)) // Macro front-ends convert the argument to AU1 before calling the helper.
+ #define AU2_(v) AU2_x(AU1(v))
+ #define AU3_(v) AU3_x(AU1(v))
+ #define AU4_(v) AU4_x(AU1(v))
+//==============================================================================================================================
+ AU1 AAbsSU1(AU1 v){return AU1(abs(ASU1(v)));} // Absolute value of the signed (ASU) view of v, returned as unsigned AU.
+ AU2 AAbsSU2(AU2 v){return AU2(abs(ASU2(v)));}
+ AU3 AAbsSU3(AU3 v){return AU3(abs(ASU3(v)));}
+ AU4 AAbsSU4(AU4 v){return AU4(abs(ASU4(v)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 ABfe(AU1 src,AU1 off,AU1 bits){AU1 mask=(1u<<bits)-1;return (src>>off)&mask;} // Bit-field extract: the 'bits'-wide field of src starting at bit 'off'. NOTE(review): the mask is built from 1u<<bits, so bits==32 depends on how the target handles a shift by 32 -- confirm callers stay below 32.
+ AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} // Bit-field insert by mask: bits of ins where mask is 1, bits of src elsewhere.
+ AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){AU1 mask=(1u<<bits)-1;return (ins&mask)|(src&(~mask));} // Same as ABfi with the mask being the low 'bits' bits (same bits==32 caveat as ABfe).
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 AClampF1(AF1 v,AF1 lo,AF1 hi){return max(lo,min(v,hi));} // Clamp v to [lo,hi]: min against hi first, then max against lo.
+ AF2 AClampF2(AF2 v,AF2 lo,AF2 hi){return max(lo,min(v,hi));}
+ AF3 AClampF3(AF3 v,AF3 lo,AF3 hi){return max(lo,min(v,hi));}
+ AF4 AClampF4(AF4 v,AF4 lo,AF4 hi){return max(lo,min(v,hi));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 AFractF1(AF1 v){return v-floor(v);} // Fractional part computed as v-floor(v).
+ AF2 AFractF2(AF2 v){return v-floor(v);}
+ AF3 AFractF3(AF3 v){return v-floor(v);}
+ AF4 AFractF4(AF4 v){return v-floor(v);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ALerpF1(AF1 lo,AF1 hi,AF1 t){return lerp(lo,hi,t);} // Linear interpolation from lo to hi by weight t (HLSL lerp()).
+ AF2 ALerpF2(AF2 lo,AF2 hi,AF2 t){return lerp(lo,hi,t);}
+ AF3 ALerpF3(AF3 lo,AF3 hi,AF3 t){return lerp(lo,hi,t);}
+ AF4 ALerpF4(AF4 lo,AF4 hi,AF4 t){return lerp(lo,hi,t);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 AMax3F1(AF1 a,AF1 b,AF1 c){return max(a,max(b,c));} // Largest of three values via two nested max() calls.
+ AF2 AMax3F2(AF2 a,AF2 b,AF2 c){return max(a,max(b,c));}
+ AF3 AMax3F3(AF3 a,AF3 b,AF3 c){return max(a,max(b,c));}
+ AF4 AMax3F4(AF4 a,AF4 b,AF4 c){return max(a,max(b,c));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMax3SU1(AU1 a,AU1 b,AU1 c){return AU1(max(ASU1(a),max(ASU1(b),ASU1(c))));} // Largest of three, compared through the signed ASU view and returned as unsigned AU.
+ AU2 AMax3SU2(AU2 a,AU2 b,AU2 c){return AU2(max(ASU2(a),max(ASU2(b),ASU2(c))));}
+ AU3 AMax3SU3(AU3 a,AU3 b,AU3 c){return AU3(max(ASU3(a),max(ASU3(b),ASU3(c))));}
+ AU4 AMax3SU4(AU4 a,AU4 b,AU4 c){return AU4(max(ASU4(a),max(ASU4(b),ASU4(c))));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMax3U1(AU1 a,AU1 b,AU1 c){return max(a,max(b,c));} // Largest of three unsigned values via two nested max() calls.
+ AU2 AMax3U2(AU2 a,AU2 b,AU2 c){return max(a,max(b,c));}
+ AU3 AMax3U3(AU3 a,AU3 b,AU3 c){return max(a,max(b,c));}
+ AU4 AMax3U4(AU4 a,AU4 b,AU4 c){return max(a,max(b,c));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMaxSU1(AU1 p,AU1 q){return AU1(max(ASU1(p),ASU1(q)));} // Signed max: compared through the signed ASU view, returned as unsigned AU.
+ AU2 AMaxSU2(AU2 p,AU2 q){return AU2(max(ASU2(p),ASU2(q)));}
+ AU3 AMaxSU3(AU3 p,AU3 q){return AU3(max(ASU3(p),ASU3(q)));}
+ AU4 AMaxSU4(AU4 p,AU4 q){return AU4(max(ASU4(p),ASU4(q)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 AMed3F1(AF1 a,AF1 b,AF1 c){return max(min(a,b),min(max(a,b),c));} // Median (middle value) of three, built from min()/max() only.
+ AF2 AMed3F2(AF2 a,AF2 b,AF2 c){return max(min(a,b),min(max(a,b),c));}
+ AF3 AMed3F3(AF3 a,AF3 b,AF3 c){return max(min(a,b),min(max(a,b),c));}
+ AF4 AMed3F4(AF4 a,AF4 b,AF4 c){return max(min(a,b),min(max(a,b),c));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 AMin3F1(AF1 a,AF1 b,AF1 c){return min(a,min(b,c));} // Smallest of three values via two nested min() calls.
+ AF2 AMin3F2(AF2 a,AF2 b,AF2 c){return min(a,min(b,c));}
+ AF3 AMin3F3(AF3 a,AF3 b,AF3 c){return min(a,min(b,c));}
+ AF4 AMin3F4(AF4 a,AF4 b,AF4 c){return min(a,min(b,c));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMin3SU1(AU1 a,AU1 b,AU1 c){return AU1(min(ASU1(a),min(ASU1(b),ASU1(c))));} // Smallest of three, compared through the signed ASU view and returned as unsigned AU.
+ AU2 AMin3SU2(AU2 a,AU2 b,AU2 c){return AU2(min(ASU2(a),min(ASU2(b),ASU2(c))));}
+ AU3 AMin3SU3(AU3 a,AU3 b,AU3 c){return AU3(min(ASU3(a),min(ASU3(b),ASU3(c))));}
+ AU4 AMin3SU4(AU4 a,AU4 b,AU4 c){return AU4(min(ASU4(a),min(ASU4(b),ASU4(c))));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMin3U1(AU1 a,AU1 b,AU1 c){return min(a,min(b,c));} // Smallest of three unsigned values via two nested min() calls.
+ AU2 AMin3U2(AU2 a,AU2 b,AU2 c){return min(a,min(b,c));}
+ AU3 AMin3U3(AU3 a,AU3 b,AU3 c){return min(a,min(b,c));}
+ AU4 AMin3U4(AU4 a,AU4 b,AU4 c){return min(a,min(b,c));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMinSU1(AU1 p,AU1 q){return AU1(min(ASU1(p),ASU1(q)));} // Signed min: compared through the signed ASU view, returned as unsigned AU.
+ AU2 AMinSU2(AU2 p,AU2 q){return AU2(min(ASU2(p),ASU2(q)));}
+ AU3 AMinSU3(AU3 p,AU3 q){return AU3(min(ASU3(p),ASU3(q)));}
+ AU4 AMinSU4(AU4 p,AU4 q){return AU4(min(ASU4(p),ASU4(q)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ANCosF1(AF1 v){return cos(v*AF1_(A_2PI));} // cos() of the input after scaling it by A_2PI.
+ AF2 ANCosF2(AF2 v){return cos(v*AF2_(A_2PI));}
+ AF3 ANCosF3(AF3 v){return cos(v*AF3_(A_2PI));}
+ AF4 ANCosF4(AF4 v){return cos(v*AF4_(A_2PI));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ANSinF1(AF1 v){return sin(v*AF1_(A_2PI));} // sin() of the input after scaling it by A_2PI.
+ AF2 ANSinF2(AF2 v){return sin(v*AF2_(A_2PI));}
+ AF3 ANSinF3(AF3 v){return sin(v*AF3_(A_2PI));}
+ AF4 ANSinF4(AF4 v){return sin(v*AF4_(A_2PI));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ARcpF1(AF1 v){return rcp(v);} // Reciprocal via the HLSL rcp() intrinsic.
+ AF2 ARcpF2(AF2 v){return rcp(v);}
+ AF3 ARcpF3(AF3 v){return rcp(v);}
+ AF4 ARcpF4(AF4 v){return rcp(v);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ARsqF1(AF1 v){return rsqrt(v);} // Reciprocal square root via the HLSL rsqrt() intrinsic.
+ AF2 ARsqF2(AF2 v){return rsqrt(v);}
+ AF3 ARsqF3(AF3 v){return rsqrt(v);}
+ AF4 ARsqF4(AF4 v){return rsqrt(v);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ASatF1(AF1 v){return saturate(v);} // Clamp to [0,1] via the HLSL saturate() intrinsic.
+ AF2 ASatF2(AF2 v){return saturate(v);}
+ AF3 ASatF3(AF3 v){return saturate(v);}
+ AF4 ASatF4(AF4 v){return saturate(v);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AShrSU1(AU1 v,AU1 n){return AU1(ASU1(v)>>ASU1(n));} // Right shift performed on the signed (ASU) view of both operands; result returned as unsigned AU.
+ AU2 AShrSU2(AU2 v,AU2 n){return AU2(ASU2(v)>>ASU2(n));}
+ AU3 AShrSU3(AU3 v,AU3 n){return AU3(ASU3(v)>>ASU3(n));}
+ AU4 AShrSU4(AU4 v,AU4 n){return AU4(ASU4(v)>>ASU4(n));}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                          HLSL BYTE
+//==============================================================================================================================
+ #ifdef A_BYTE
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                          HLSL HALF
+//==============================================================================================================================
+ #ifdef A_HALF
+  #ifdef A_HLSL_6_2 // Explicitly sized 16-bit type names, selected when A_HLSL_6_2 is defined.
+   #define AH1 float16_t // AH*: 16-bit float scalar and 2/3/4-component vectors.
+   #define AH2 float16_t2
+   #define AH3 float16_t3
+   #define AH4 float16_t4
+//------------------------------------------------------------------------------------------------------------------------------
+   #define AW1 uint16_t // AW*: 16-bit unsigned integer scalar and vectors.
+   #define AW2 uint16_t2
+   #define AW3 uint16_t3
+   #define AW4 uint16_t4
+//------------------------------------------------------------------------------------------------------------------------------
+   #define ASW1 int16_t // ASW*: 16-bit signed integer scalar and vectors.
+   #define ASW2 int16_t2
+   #define ASW3 int16_t3
+   #define ASW4 int16_t4
+  #else // Fallback: the same aliases mapped to the min16float/min16uint/min16int minimum-precision types.
+   #define AH1 min16float
+   #define AH2 min16float2
+   #define AH3 min16float3
+   #define AH4 min16float4
+//------------------------------------------------------------------------------------------------------------------------------
+   #define AW1 min16uint
+   #define AW2 min16uint2
+   #define AW3 min16uint3
+   #define AW4 min16uint4
+//------------------------------------------------------------------------------------------------------------------------------
+   #define ASW1 min16int
+   #define ASW2 min16int2
+   #define ASW3 min16int3
+   #define ASW4 min16int4
+  #endif
+//==============================================================================================================================
+  // Packed 16-bit pairs are unpacked manually to get optimal execution (don't use packed types in buffers directly).
+  // The unpack must follow this exact pattern: https://gpuopen.com/first-steps-implementing-fp16/
+  AH2 AH2_AU1_x(AU1 x){AF2 t=f16tof32(AU2(x&0xFFFF,x>>16));return AH2(t);} // Low 16 bits -> .x, high 16 bits -> .y, each converted with f16tof32().
+  AH4 AH4_AU2_x(AU2 x){return AH4(AH2_AU1_x(x.x),AH2_AU1_x(x.y));} // Two packed words -> four halves (x.x gives .xy, x.y gives .zw).
+  AW2 AW2_AU1_x(AU1 x){AU2 t=AU2(x&0xFFFF,x>>16);return AW2(t);} // Same 16/16 split, kept as integers.
+  AW4 AW4_AU2_x(AU2 x){return AW4(AW2_AU1_x(x.x),AW2_AU1_x(x.y));}
+  #define AH2_AU1(x) AH2_AU1_x(AU1(x)) // Macro front-ends convert the argument to AU1/AU2 before calling the helper.
+  #define AH4_AU2(x) AH4_AU2_x(AU2(x))
+  #define AW2_AU1(x) AW2_AU1_x(AU1(x))
+  #define AW4_AU2(x) AW4_AU2_x(AU2(x))
+//------------------------------------------------------------------------------------------------------------------------------
+  AU1 AU1_AH2_x(AH2 h){return f32tof16(h.x)+(f32tof16(h.y)<<16);} // Pack two halves into one word: .x in the low 16 bits, .y shifted into the high 16 bits.
+  AU2 AU2_AH4_x(AH4 h){return AU2(AU1_AH2_x(h.xy),AU1_AH2_x(h.zw));}
+  AU1 AU1_AW2_x(AW2 p){return AU1(p.x)+(AU1(p.y)<<16);} // Integer variant of the same packing.
+  AU2 AU2_AW4_x(AW4 p){return AU2(AU1_AW2_x(p.xy),AU1_AW2_x(p.zw));}
+  #define AU1_AH2(v) AU1_AH2_x(AH2(v)) // Macro front-ends convert the argument to the 16-bit vector type first.
+  #define AU2_AH4(v) AU2_AH4_x(AH4(v))
+  #define AU1_AW2(v) AU1_AW2_x(AW2(v))
+  #define AU2_AW4(v) AU2_AW4_x(AW4(v))
+//==============================================================================================================================
+  #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST) // Half -> 16-bit uint: direct asuint16() when the 16-bit casts are enabled.
+   #define AW1_AH1(v) asuint16(v)
+   #define AW2_AH2(v) asuint16(v)
+   #define AW3_AH3(v) asuint16(v)
+   #define AW4_AH4(v) asuint16(v)
+  #else // Otherwise each component goes through f32tof16().
+   #define AW1_AH1(v) AW1(f32tof16(AF1(v)))
+   #define AW2_AH2(v) AW2(AW1_AH1((v).x),AW1_AH1((v).y))
+   #define AW3_AH3(v) AW3(AW1_AH1((v).x),AW1_AH1((v).y),AW1_AH1((v).z))
+   #define AW4_AH4(v) AW4(AW1_AH1((v).x),AW1_AH1((v).y),AW1_AH1((v).z),AW1_AH1((v).w))
+  #endif
+//------------------------------------------------------------------------------------------------------------------------------
+  #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST) // 16-bit uint -> half: direct asfloat16() when the 16-bit casts are enabled.
+   #define AH1_AW1(v) asfloat16(v)
+   #define AH2_AW2(v) asfloat16(v)
+   #define AH3_AW3(v) asfloat16(v)
+   #define AH4_AW4(v) asfloat16(v)
+  #else // Otherwise each component goes through f16tof32().
+   #define AH1_AW1(v) AH1(f16tof32(AU1(v)))
+   #define AH2_AW2(v) AH2(AH1_AW1((v).x),AH1_AW1((v).y))
+   #define AH3_AW3(v) AH3(AH1_AW1((v).x),AH1_AW1((v).y),AH1_AW1((v).z))
+   #define AH4_AW4(v) AH4(AH1_AW1((v).x),AH1_AW1((v).y),AH1_AW1((v).z),AH1_AW1((v).w))
+  #endif
+//==============================================================================================================================
+  AH1 AH1_x(AH1 v){return AH1(v);} // Broadcast helpers: replicate one scalar into every component of the result.
+  AH2 AH2_x(AH1 v){return AH2(v,v);}
+  AH3 AH3_x(AH1 v){return AH3(v,v,v);}
+  AH4 AH4_x(AH1 v){return AH4(v,v,v,v);}
+  #define AH1_(v) AH1_x(AH1(v)) // Macro front-ends convert the argument to AH1 before calling the helper.
+  #define AH2_(v) AH2_x(AH1(v))
+  #define AH3_(v) AH3_x(AH1(v))
+  #define AH4_(v) AH4_x(AH1(v))
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AW1_x(AW1 v){return AW1(v);} // Broadcast helpers: replicate one scalar into every component of the result.
+  AW2 AW2_x(AW1 v){return AW2(v,v);}
+  AW3 AW3_x(AW1 v){return AW3(v,v,v);}
+  AW4 AW4_x(AW1 v){return AW4(v,v,v,v);}
+  #define AW1_(v) AW1_x(AW1(v)) // Macro front-ends convert the argument to AW1 before calling the helper.
+  #define AW2_(v) AW2_x(AW1(v))
+  #define AW3_(v) AW3_x(AW1(v))
+  #define AW4_(v) AW4_x(AW1(v))
+//==============================================================================================================================
+  AW1 AAbsSW1(AW1 v){return AW1(abs(ASW1(v)));} // Absolute value of the signed (ASW) view of v, returned as unsigned AW.
+  AW2 AAbsSW2(AW2 v){return AW2(abs(ASW2(v)));}
+  AW3 AAbsSW3(AW3 v){return AW3(abs(ASW3(v)));}
+  AW4 AAbsSW4(AW4 v){return AW4(abs(ASW4(v)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AClampH1(AH1 v,AH1 lo,AH1 hi){return max(lo,min(v,hi));} // Clamp v to [lo,hi]: min against hi first, then max against lo.
+  AH2 AClampH2(AH2 v,AH2 lo,AH2 hi){return max(lo,min(v,hi));}
+  AH3 AClampH3(AH3 v,AH3 lo,AH3 hi){return max(lo,min(v,hi));}
+  AH4 AClampH4(AH4 v,AH4 lo,AH4 hi){return max(lo,min(v,hi));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Fractional part computed as v-floor(v): V_FRACT_F16 (note DX frac() is different).
+  AH1 AFractH1(AH1 v){return v-floor(v);}
+  AH2 AFractH2(AH2 v){return v-floor(v);}
+  AH3 AFractH3(AH3 v){return v-floor(v);}
+  AH4 AFractH4(AH4 v){return v-floor(v);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ALerpH1(AH1 lo,AH1 hi,AH1 t){return lerp(lo,hi,t);} // Linear interpolation from lo to hi by weight t (HLSL lerp()).
+  AH2 ALerpH2(AH2 lo,AH2 hi,AH2 t){return lerp(lo,hi,t);}
+  AH3 ALerpH3(AH3 lo,AH3 hi,AH3 t){return lerp(lo,hi,t);}
+  AH4 ALerpH4(AH4 lo,AH4 hi,AH4 t){return lerp(lo,hi,t);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AMax3H1(AH1 a,AH1 b,AH1 c){return max(a,max(b,c));} // Largest of three values via two nested max() calls.
+  AH2 AMax3H2(AH2 a,AH2 b,AH2 c){return max(a,max(b,c));}
+  AH3 AMax3H3(AH3 a,AH3 b,AH3 c){return max(a,max(b,c));}
+  AH4 AMax3H4(AH4 a,AH4 b,AH4 c){return max(a,max(b,c));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASW1(a),ASW1(b)));} // Signed max of 16-bit values: operands are viewed through the 16-bit signed ASW types so bit 15 acts as the sign bit.
+  AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASW2(a),ASW2(b)));} // Result is converted back to the unsigned AW type, matching AAbsSW* and AShrSW*.
+  AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASW3(a),ASW3(b)));}
+  AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASW4(a),ASW4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AMin3H1(AH1 a,AH1 b,AH1 c){return min(a,min(b,c));} // Smallest of three values via two nested min() calls.
+  AH2 AMin3H2(AH2 a,AH2 b,AH2 c){return min(a,min(b,c));}
+  AH3 AMin3H3(AH3 a,AH3 b,AH3 c){return min(a,min(b,c));}
+  AH4 AMin3H4(AH4 a,AH4 b,AH4 c){return min(a,min(b,c));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASW1(a),ASW1(b)));} // Signed min of 16-bit values: operands are viewed through the 16-bit signed ASW types so bit 15 acts as the sign bit.
+  AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASW2(a),ASW2(b)));} // Result is converted back to the unsigned AW type, matching AAbsSW* and AShrSW*.
+  AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASW3(a),ASW3(b)));}
+  AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASW4(a),ASW4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ARcpH1(AH1 v){return rcp(v);} // Reciprocal via the HLSL rcp() intrinsic.
+  AH2 ARcpH2(AH2 v){return rcp(v);}
+  AH3 ARcpH3(AH3 v){return rcp(v);}
+  AH4 ARcpH4(AH4 v){return rcp(v);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ARsqH1(AH1 v){return rsqrt(v);} // Reciprocal square root via the HLSL rsqrt() intrinsic.
+  AH2 ARsqH2(AH2 v){return rsqrt(v);}
+  AH3 ARsqH3(AH3 v){return rsqrt(v);}
+  AH4 ARsqH4(AH4 v){return rsqrt(v);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ASatH1(AH1 v){return saturate(v);} // Clamp to [0,1] via the HLSL saturate() intrinsic.
+  AH2 ASatH2(AH2 v){return saturate(v);}
+  AH3 ASatH3(AH3 v){return saturate(v);}
+  AH4 ASatH4(AH4 v){return saturate(v);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AShrSW1(AW1 v,AW1 n){return AW1(ASW1(v)>>ASW1(n));} // Right shift performed on the signed (ASW) view of both operands; result returned as unsigned AW.
+  AW2 AShrSW2(AW2 v,AW2 n){return AW2(ASW2(v)>>ASW2(n));}
+  AW3 AShrSW3(AW3 v,AW3 n){return AW3(ASW3(v)>>ASW3(n));}
+  AW4 AShrSW4(AW4 v,AW4 n){return AW4(ASW4(v)>>ASW4(n));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                         HLSL DOUBLE
+//==============================================================================================================================
+ #ifdef A_DUBL // Double precision support: AD1..AD4 type aliases, broadcast constructors and math wrappers.
+  #ifdef A_HLSL_6_2 // With A_HLSL_6_2 the float64_t spellings are used, otherwise double.
+   #define AD1 float64_t
+   #define AD2 float64_t2
+   #define AD3 float64_t3
+   #define AD4 float64_t4
+  #else
+   #define AD1 double
+   #define AD2 double2
+   #define AD3 double3
+   #define AD4 double4
+  #endif
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 AD1_x(AD1 a){AD1 v=AD1(a);return v;} // Broadcast helpers: replicate scalar 'a' into every component.
+  AD2 AD2_x(AD1 a){AD2 v=AD2(a,a);return v;}
+  AD3 AD3_x(AD1 a){AD3 v=AD3(a,a,a);return v;}
+  AD4 AD4_x(AD1 a){AD4 v=AD4(a,a,a,a);return v;}
+  #define AD1_(a) AD1_x(AD1(a))
+  #define AD2_(a) AD2_x(AD1(a))
+  #define AD3_(a) AD3_x(AD1(a))
+  #define AD4_(a) AD4_x(AD1(a))
+//==============================================================================================================================
+  AD1 AFractD1(AD1 a){AD1 whole=floor(a);return a-whole;} // Fractional part computed as a-floor(a).
+  AD2 AFractD2(AD2 a){AD2 whole=floor(a);return a-whole;}
+  AD3 AFractD3(AD3 a){AD3 whole=floor(a);return a-whole;}
+  AD4 AFractD4(AD4 a){AD4 whole=floor(a);return a-whole;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 ALerpD1(AD1 x,AD1 y,AD1 a){AD1 r=lerp(x,y,a);return r;} // Interpolation via the lerp() intrinsic.
+  AD2 ALerpD2(AD2 x,AD2 y,AD2 a){AD2 r=lerp(x,y,a);return r;}
+  AD3 ALerpD3(AD3 x,AD3 y,AD3 a){AD3 r=lerp(x,y,a);return r;}
+  AD4 ALerpD4(AD4 x,AD4 y,AD4 a){AD4 r=lerp(x,y,a);return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 ARcpD1(AD1 x){AD1 r=rcp(x);return r;} // Reciprocal via the rcp() intrinsic.
+  AD2 ARcpD2(AD2 x){AD2 r=rcp(x);return r;}
+  AD3 ARcpD3(AD3 x){AD3 r=rcp(x);return r;}
+  AD4 ARcpD4(AD4 x){AD4 r=rcp(x);return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 ARsqD1(AD1 x){AD1 r=rsqrt(x);return r;} // Reciprocal square root via the rsqrt() intrinsic.
+  AD2 ARsqD2(AD2 x){AD2 r=rsqrt(x);return r;}
+  AD3 ARsqD3(AD3 x){AD3 r=rsqrt(x);return r;}
+  AD4 ARsqD4(AD4 x){AD4 r=rsqrt(x);return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 ASatD1(AD1 x){AD1 r=saturate(x);return r;} // Clamp via the saturate() intrinsic.
+  AD2 ASatD2(AD2 x){AD2 r=saturate(x);return r;}
+  AD3 ASatD3(AD3 x){AD3 r=saturate(x);return r;}
+  AD4 ASatD4(AD4 x){AD4 r=saturate(x);return r;}
+ #endif
+//==============================================================================================================================
+//                                                         HLSL WAVE
+//==============================================================================================================================
+ #ifdef A_WAVE
+  // Returns 'v' as held by the lane whose index is this lane's index XOR 'x' (WaveReadLaneAt); 'x' must be a compile time literal.
+  AF1 AWaveXorF1(AF1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AF2 AWaveXorF2(AF2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AF3 AWaveXorF3(AF3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AF4 AWaveXorF4(AF4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AU1 AWaveXorU1(AU1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AU2 AWaveXorU1(AU2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} // NOTE: the AU2/AU3/AU4 variants are overloads named AWaveXorU1; no AWaveXorU2/3/4 names are defined here.
+  AU3 AWaveXorU1(AU3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AU4 AWaveXorU1(AU4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  #ifdef A_HALF // 16-bit vectors are packed into 32-bit words for the lane read, then unpacked again.
+   AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(WaveReadLaneAt(AU1_AH2(v),WaveGetLaneIndex()^x));}
+   AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(WaveReadLaneAt(AU2_AH4(v),WaveGetLaneIndex()^x));}
+   AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(WaveReadLaneAt(AU1_AW2(v),WaveGetLaneIndex()^x));}
+   AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU2(WaveReadLaneAt(AU2_AW4(v),WaveGetLaneIndex()^x));} // Four 16-bit words occupy two 32-bit words, so pack through AU2 exactly as AWaveXorH4 does.
+  #endif
+ #endif
+//==============================================================================================================================
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+//                                                          GPU COMMON
+//
+//
+//==============================================================================================================================
+#ifdef A_GPU
+ // Positive and negative infinity: the literals are the IEEE-754 single precision bit patterns for +INF and -INF, passed through AF1_AU1.
+ #define A_INFP_F AF1_AU1(0x7f800000u)
+ #define A_INFN_F AF1_AU1(0xff800000u)
+//------------------------------------------------------------------------------------------------------------------------------
+ // OR the sign bit (0x80000000) of 's' into the bits of 'd'; 'd' is expected to be positive (sign bit clear).
+ AF1 ACpySgnF1(AF1 d,AF1 s){AU1 sgn=AU1_AF1(s)&AU1_(0x80000000u);return AF1_AU1(AU1_AF1(d)|sgn);}
+ AF2 ACpySgnF2(AF2 d,AF2 s){AU2 sgn=AU2_AF2(s)&AU2_(0x80000000u);return AF2_AU2(AU2_AF2(d)|sgn);}
+ AF3 ACpySgnF3(AF3 d,AF3 s){AU3 sgn=AU3_AF3(s)&AU3_(0x80000000u);return AF3_AU3(AU3_AF3(d)|sgn);}
+ AF4 ACpySgnF4(AF4 d,AF4 s){AU4 sgn=AU4_AF4(s)&AU4_(0x80000000u);return AF4_AU4(AU4_AF4(d)|sgn);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Branch-free "is negative" mask (e.g. usable as a lerp weight), computed as saturate(m*(-INF)),
+ //  m=NaN := 0
+ //  m>=0  := 0
+ //  m<0   := 1
+ // This relies on how saturate() treats each possible product,
+ //  saturate(+a*(-INF)==-INF) := 0
+ //  saturate( 0*(-INF)== NaN) := 0
+ //  saturate(-a*(-INF)==+INF) := 1
+ AF1 ASignedF1(AF1 m){AF1 p=m*AF1_(A_INFN_F);return ASatF1(p);}
+ AF2 ASignedF2(AF2 m){AF2 p=m*AF2_(A_INFN_F);return ASatF2(p);}
+ AF3 ASignedF3(AF3 m){AF3 p=m*AF3_(A_INFN_F);return ASatF3(p);}
+ AF4 ASignedF4(AF4 m){AF4 p=m*AF4_(A_INFN_F);return ASatF4(p);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 AGtZeroF1(AF1 m){AF1 p=m*AF1_(A_INFP_F);return ASatF1(p);} // saturate(m*(+INF)): 1 for m>0, otherwise 0 (0*INF is NaN, which saturates to 0).
+ AF2 AGtZeroF2(AF2 m){AF2 p=m*AF2_(A_INFP_F);return ASatF2(p);}
+ AF3 AGtZeroF3(AF3 m){AF3 p=m*AF3_(A_INFP_F);return ASatF3(p);}
+ AF4 AGtZeroF4(AF4 m){AF4 p=m*AF4_(A_INFP_F);return ASatF4(p);}
+//==============================================================================================================================
+ #ifdef A_HALF // 16-bit counterparts of the infinity constants, copy-sign, signed-mask and greater-than-zero helpers.
+  #ifdef A_HLSL_6_2 // 0x7c00/0xfc00 are the half precision +INF/-INF bit patterns; this path casts the literal to uint16_t first.
+   #define A_INFP_H AH1_AW1((uint16_t)0x7c00u)
+   #define A_INFN_H AH1_AW1((uint16_t)0xfc00u)
+  #else
+   #define A_INFP_H AH1_AW1(0x7c00u)
+   #define A_INFN_H AH1_AW1(0xfc00u)
+  #endif
+
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ACpySgnH1(AH1 d,AH1 s){AW1 sgn=AW1_AH1(s)&AW1_(0x8000u);return AH1_AW1(AW1_AH1(d)|sgn);} // OR the sign bit (0x8000) of 's' into the bits of 'd'.
+  AH2 ACpySgnH2(AH2 d,AH2 s){AW2 sgn=AW2_AH2(s)&AW2_(0x8000u);return AH2_AW2(AW2_AH2(d)|sgn);}
+  AH3 ACpySgnH3(AH3 d,AH3 s){AW3 sgn=AW3_AH3(s)&AW3_(0x8000u);return AH3_AW3(AW3_AH3(d)|sgn);}
+  AH4 ACpySgnH4(AH4 d,AH4 s){AW4 sgn=AW4_AH4(s)&AW4_(0x8000u);return AH4_AW4(AW4_AH4(d)|sgn);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ASignedH1(AH1 m){AH1 p=m*AH1_(A_INFN_H);return ASatH1(p);} // saturate(m*(-INF)): the "is negative" mask.
+  AH2 ASignedH2(AH2 m){AH2 p=m*AH2_(A_INFN_H);return ASatH2(p);}
+  AH3 ASignedH3(AH3 m){AH3 p=m*AH3_(A_INFN_H);return ASatH3(p);}
+  AH4 ASignedH4(AH4 m){AH4 p=m*AH4_(A_INFN_H);return ASatH4(p);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AGtZeroH1(AH1 m){AH1 p=m*AH1_(A_INFP_H);return ASatH1(p);} // saturate(m*(+INF)): the "is greater than zero" mask.
+  AH2 AGtZeroH2(AH2 m){AH2 p=m*AH2_(A_INFP_H);return ASatH2(p);}
+  AH3 AGtZeroH3(AH3 m){AH3 p=m*AH3_(A_INFP_H);return ASatH3(p);}
+  AH4 AGtZeroH4(AH4 m){AH4 p=m*AH4_(A_INFP_H);return ASatH4(p);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                [FIS] FLOAT INTEGER SORTABLE
+//------------------------------------------------------------------------------------------------------------------------------
+// Float to integer sortable.
+//  - If sign bit=0, flip the sign bit (positives).
+//  - If sign bit=1, flip all bits     (negatives).
+// Integer sortable to float.
+//  - If sign bit=1, flip the sign bit (positives).
+//  - If sign bit=0, flip all bits     (negatives).
+// Has nice side effects.
+//  - Larger integers are more positive values.
+//  - Float zero is mapped to center of integers (so clear to integer zero is a nice default for atomic max usage).
+// Burns 3 ops for conversion {shift,or,xor}.
+//==============================================================================================================================
+ AU1 AFisToU1(AU1 x){AU1 flip=AShrSU1(x,AU1_(31))|AU1_(0x80000000);return x^flip;}
+ AU1 AFisFromU1(AU1 x){AU1 flip=(~AShrSU1(x,AU1_(31)))|AU1_(0x80000000);return x^flip;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Shift-by-15 variant for words whose upper 16 bits hold a 16-bit float. NOTE(review): assuming AShrSU1 is an arithmetic shift, bits 30..15 of 'x' land in the low half of the XOR mask, so the low 16 bits change too - confirm callers ignore them.
+ AU1 AFisToHiU1(AU1 x){AU1 flip=AShrSU1(x,AU1_(15))|AU1_(0x80000000);return x^flip;}
+ AU1 AFisFromHiU1(AU1 x){AU1 flip=(~AShrSU1(x,AU1_(15)))|AU1_(0x80000000);return x^flip;}
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_HALF // 16-bit word versions of the sortable transform: shift by 15 and use 0x8000 as the sign bit.
+  AW1 AFisToW1(AW1 x){AW1 flip=AShrSW1(x,AW1_(15))|AW1_(0x8000);return x^flip;}
+  AW1 AFisFromW1(AW1 x){AW1 flip=(~AShrSW1(x,AW1_(15)))|AW1_(0x8000);return x^flip;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW2 AFisToW2(AW2 x){AW2 flip=AShrSW2(x,AW2_(15))|AW2_(0x8000);return x^flip;}
+  AW2 AFisFromW2(AW2 x){AW2 flip=(~AShrSW2(x,AW2_(15)))|AW2_(0x8000);return x^flip;}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                      [PERM] V_PERM_B32
+//------------------------------------------------------------------------------------------------------------------------------
+// Support for V_PERM_B32 started in the 3rd generation of GCN.
+//------------------------------------------------------------------------------------------------------------------------------
+// yyyyxxxx - The 'i' input.
+// 76543210
+// ========
+// HGFEDCBA - Naming on permutation.
+//------------------------------------------------------------------------------------------------------------------------------
+// TODO
+// ====
+//  - Make sure compiler optimizes this.
+//==============================================================================================================================
+ #ifdef A_HALF // Shift/mask byte permutes. Names list result bytes high to low: A..D are bytes 0..3 of i.x, E..H are bytes 0..3 of i.y, '0' is a zero byte.
+  AU1 APerm0E0A(AU2 i){return((i.x    )&0xffu)|((i.y<<16)&0xff0000u);} // byte 0 of i.x in result byte 0, byte 0 of i.y in result byte 2
+  AU1 APerm0F0B(AU2 i){return((i.x>> 8)&0xffu)|((i.y<< 8)&0xff0000u);} // byte 1 of i.x in result byte 0, byte 1 of i.y in result byte 2
+  AU1 APerm0G0C(AU2 i){return((i.x>>16)&0xffu)|((i.y    )&0xff0000u);} // byte 2 of i.x in result byte 0, byte 2 of i.y in result byte 2
+  AU1 APerm0H0D(AU2 i){return((i.x>>24)&0xffu)|((i.y>> 8)&0xff0000u);} // byte 3 of i.x in result byte 0, byte 3 of i.y in result byte 2
+//------------------------------------------------------------------------------------------------------------------------------
+  AU1 APermHGFA(AU2 i){return((i.x    )&0x000000ffu)|(i.y&0xffffff00u);} // i.y with byte 0 replaced by byte 0 of i.x
+  AU1 APermHGFC(AU2 i){return((i.x>>16)&0x000000ffu)|(i.y&0xffffff00u);} // i.y with byte 0 replaced by byte 2 of i.x
+  AU1 APermHGAE(AU2 i){return((i.x<< 8)&0x0000ff00u)|(i.y&0xffff00ffu);} // i.y with byte 1 replaced by byte 0 of i.x
+  AU1 APermHGCE(AU2 i){return((i.x>> 8)&0x0000ff00u)|(i.y&0xffff00ffu);} // i.y with byte 1 replaced by byte 2 of i.x
+  AU1 APermHAFE(AU2 i){return((i.x<<16)&0x00ff0000u)|(i.y&0xff00ffffu);} // i.y with byte 2 replaced by byte 0 of i.x
+  AU1 APermHCFE(AU2 i){return((i.x    )&0x00ff0000u)|(i.y&0xff00ffffu);} // i.y with byte 2 replaced by byte 2 of i.x
+  AU1 APermAGFE(AU2 i){return((i.x<<24)&0xff000000u)|(i.y&0x00ffffffu);} // i.y with byte 3 replaced by byte 0 of i.x
+  AU1 APermCGFE(AU2 i){return((i.x<< 8)&0xff000000u)|(i.y&0x00ffffffu);} // i.y with byte 3 replaced by byte 2 of i.x
+//------------------------------------------------------------------------------------------------------------------------------
+  AU1 APermGCEA(AU2 i){return((i.x)&0x00ff00ffu)|((i.y<<8)&0xff00ff00u);} // bytes 0,2 of i.x stay in result bytes 0,2; bytes 0,2 of i.y move to result bytes 1,3
+  AU1 APermGECA(AU2 i){return(((i.x)&0xffu)|((i.x>>8)&0xff00u)|((i.y<<16)&0xff0000u)|((i.y<<8)&0xff000000u));} // bytes 0,2 of i.x to result bytes 0,1; bytes 0,2 of i.y to result bytes 2,3
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                               [BUC] BYTE UNSIGNED CONVERSION
+//------------------------------------------------------------------------------------------------------------------------------
+// Designed to use the optimal conversion, enables the scaling to possibly be factored into other computation.
+// Works on a range of {0 to A_BUC_<32,16>}, for <32-bit, and 16-bit> respectively.
+//------------------------------------------------------------------------------------------------------------------------------
+// OPCODE NOTES
+// ============
+// GCN does not do UNORM or SNORM for bytes in opcodes.
+//  - V_CVT_F32_UBYTE{0,1,2,3} - Unsigned byte to float.
+//  - V_CVT_PKACCUM_U8_F32 - Float to unsigned byte (does bit-field insert into 32-bit integer).
+// V_PERM_B32 does byte packing with ability to zero fill bytes as well.
+//  - Can pull out byte values from two sources, and zero fill upper 8-bits of packed hi and lo. 
+//------------------------------------------------------------------------------------------------------------------------------
+// BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U1() - Designed for V_CVT_F32_UBYTE* and V_CVT_PKACCUM_U8_F32 ops.
+// ====   =====
+//    0 : 0
+//    1 : 1
+//     ...
+//  255 : 255
+//      : 256 (just outside the encoding range)
+//------------------------------------------------------------------------------------------------------------------------------
+// BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32.
+// ====   =====
+//    0 : 0
+//    1 : 1/512
+//    2 : 1/256
+//     ...
+//   64 : 1/8
+//  128 : 1/4
+//  255 : 255/512
+//      : 1/2 (just outside the encoding range)
+//------------------------------------------------------------------------------------------------------------------------------
+// OPTIMAL IMPLEMENTATIONS ON AMD ARCHITECTURES
+// ============================================
+// r=ABuc0FromU1(i)
+//   V_CVT_F32_UBYTE0 r,i
+// --------------------------------------------
+// r=ABuc0ToU1(d,i)
+//   V_CVT_PKACCUM_U8_F32 r,i,0,d
+// --------------------------------------------
+// d=ABuc0FromU2(i)
+//   Where 'k0' is an SGPR with 0x0E0A
+//   Where 'k1' is an SGPR with {32768.0} packed into the lower 16-bits
+//   V_PERM_B32 d,i.x,i.y,k0
+//   V_PK_FMA_F16 d,d,k1.x,0
+// --------------------------------------------
+// r=ABuc0ToU2(d,i)
+//   Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits
+//   Where 'k1' is an SGPR with 0x????
+//   Where 'k2' is an SGPR with 0x????
+//   V_PK_FMA_F16 i,i,k0.x,0
+//   V_PERM_B32 r.x,i,i,k1
+//   V_PERM_B32 r.y,i,i,k2
+//==============================================================================================================================
+ // Peak range (largest encodable value) for the 32-bit and 16-bit conversions.
+ #define A_BUC_32 (255.0)
+ #define A_BUC_16 (255.0/512.0)
+//==============================================================================================================================
+ #if 1
+  // Designed to be one V_CVT_PKACCUM_U8_F32.
+  // The extra min is required to pattern match to V_CVT_PKACCUM_U8_F32.
+  AU1 ABuc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i),255u)    )&(0x000000ffu));} // converts 'i' to uint, clamps to 255, inserts it as byte 0 of 'd' (other bytes of 'd' kept)
+  AU1 ABuc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i),255u)<< 8)&(0x0000ff00u));} // same, inserted as byte 1
+  AU1 ABuc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i),255u)<<16)&(0x00ff0000u));} // same, inserted as byte 2
+  AU1 ABuc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i),255u)<<24)&(0xff000000u));} // same, inserted as byte 3
+//------------------------------------------------------------------------------------------------------------------------------
+  // Designed to be one V_CVT_F32_UBYTE*.
+  AF1 ABuc0FromU1(AU1 i){return AF1((i    )&255u);} // byte 0 of 'i' converted to float
+  AF1 ABuc1FromU1(AU1 i){return AF1((i>> 8)&255u);} // byte 1 of 'i' converted to float
+  AF1 ABuc2FromU1(AU1 i){return AF1((i>>16)&255u);} // byte 2 of 'i' converted to float
+  AF1 ABuc3FromU1(AU1 i){return AF1((i>>24)&255u);} // byte 3 of 'i' converted to float
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Scales both inputs by 1/32768, then interleaves {x0,x1} and {y0,y1} into {{x0,y0},{x1,y1}} with APermGCEA.
+  AW2 ABuc01ToW2(AH2 x,AH2 y){x=x*AH2_(1.0/32768.0);y=y*AH2_(1.0/32768.0);AU2 packed=AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)));
+   return AW2_AU1(APermGCEA(packed));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // SOA to AOS plus conversion, designed for 3 ops: scale 'i' by 1/32768, then insert one byte into d.x and one into d.y.
+  AU2 ABuc0ToU2(AU2 d,AH2 i){AU1 bits=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   AU2 r;r.x=APermHGFA(AU2(d.x,bits));r.y=APermHGFC(AU2(d.y,bits));return r;}
+  AU2 ABuc1ToU2(AU2 d,AH2 i){AU1 bits=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   AU2 r;r.x=APermHGAE(AU2(d.x,bits));r.y=APermHGCE(AU2(d.y,bits));return r;}
+  AU2 ABuc2ToU2(AU2 d,AH2 i){AU1 bits=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   AU2 r;r.x=APermHAFE(AU2(d.x,bits));r.y=APermHCFE(AU2(d.y,bits));return r;}
+  AU2 ABuc3ToU2(AU2 d,AH2 i){AU1 bits=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   AU2 r;r.x=APermAGFE(AU2(d.x,bits));r.y=APermCGFE(AU2(d.y,bits));return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // AOS to SOA plus conversion, designed for 2 ops: gather one byte from each of i.x and i.y, then scale by 32768.
+  AH2 ABuc0FromU2(AU2 i){AH2 raw=AH2_AW2(AW2_AU1(APerm0E0A(i)));return raw*AH2_(32768.0);}
+  AH2 ABuc1FromU2(AU2 i){AH2 raw=AH2_AW2(AW2_AU1(APerm0F0B(i)));return raw*AH2_(32768.0);}
+  AH2 ABuc2FromU2(AU2 i){AH2 raw=AH2_AW2(AW2_AU1(APerm0G0C(i)));return raw*AH2_(32768.0);}
+  AH2 ABuc3FromU2(AU2 i){AH2 raw=AH2_AW2(AW2_AU1(APerm0H0D(i)));return raw*AH2_(32768.0);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                 [BSC] BYTE SIGNED CONVERSION
+//------------------------------------------------------------------------------------------------------------------------------
+// Similar to [BUC].
+// Works on a range of {-/+ A_BSC_<32,16>}, for <32-bit, and 16-bit> respectively.
+//------------------------------------------------------------------------------------------------------------------------------
+// ENCODING (without zero-based encoding)
+// ========
+//   0 = unused (can be used to mean something else)
+//   1 = lowest value 
+// 128 = exact zero center (becomes 0 with zero-based encoding)
+// 255 = highest value
+//------------------------------------------------------------------------------------------------------------------------------
+// Zero-based [Zb] flips the MSB bit of the byte (making 128 "exact zero" actually zero).
+// This is useful if there is a desire for cleared values to decode as zero.
+//------------------------------------------------------------------------------------------------------------------------------
+// BYTE : FLOAT - ABsc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32.
+// ====   =====
+//    0 : -128/512 (unused)
+//    1 : -127/512
+//    2 : -126/512
+//     ...
+//  128 : 0 
+//     ... 
+//  255 : 127/512
+//      : 1/4 (just outside the encoding range)
+//==============================================================================================================================
+ // Peak range (largest encodable magnitude) for the 32-bit and 16-bit conversions.
+ #define A_BSC_32 (127.0)
+ #define A_BSC_16 (127.0/512.0)
+//==============================================================================================================================
+ #if 1
+  AU1 ABsc0ToU1(AU1 d,AF1 i){AU1 q=min(AU1(i+128.0),255u);return (d&0xffffff00u)|(q&0x000000ffu);} // Bias 'i' by 128, clamp to 255, insert as byte 0..3 of 'd'.
+  AU1 ABsc1ToU1(AU1 d,AF1 i){AU1 q=min(AU1(i+128.0),255u)<< 8;return (d&0xffff00ffu)|(q&0x0000ff00u);}
+  AU1 ABsc2ToU1(AU1 d,AF1 i){AU1 q=min(AU1(i+128.0),255u)<<16;return (d&0xff00ffffu)|(q&0x00ff0000u);}
+  AU1 ABsc3ToU1(AU1 d,AF1 i){AU1 q=min(AU1(i+128.0),255u)<<24;return (d&0x00ffffffu)|(q&0xff000000u);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AU1 ABsc0ToZbU1(AU1 d,AF1 i){AU1 q=min(AU1(trunc(i)+128.0),255u);AU1 r=(d&0xffffff00u)|(q&0x000000ffu);return r^0x00000080u;} // Zero-based form: truncates 'i' first and flips the MSB of the inserted byte.
+  AU1 ABsc1ToZbU1(AU1 d,AF1 i){AU1 q=min(AU1(trunc(i)+128.0),255u)<< 8;AU1 r=(d&0xffff00ffu)|(q&0x0000ff00u);return r^0x00008000u;}
+  AU1 ABsc2ToZbU1(AU1 d,AF1 i){AU1 q=min(AU1(trunc(i)+128.0),255u)<<16;AU1 r=(d&0xff00ffffu)|(q&0x00ff0000u);return r^0x00800000u;}
+  AU1 ABsc3ToZbU1(AU1 d,AF1 i){AU1 q=min(AU1(trunc(i)+128.0),255u)<<24;AU1 r=(d&0x00ffffffu)|(q&0xff000000u);return r^0x80000000u;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 ABsc0FromU1(AU1 i){AU1 b=(i    )&255u;return AF1(b)-128.0;} // Extract byte 0..3 of 'i' and remove the 128 bias.
+  AF1 ABsc1FromU1(AU1 i){AU1 b=(i>> 8)&255u;return AF1(b)-128.0;}
+  AF1 ABsc2FromU1(AU1 i){AU1 b=(i>>16)&255u;return AF1(b)-128.0;}
+  AF1 ABsc3FromU1(AU1 i){AU1 b=(i>>24)&255u;return AF1(b)-128.0;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 ABsc0FromZbU1(AU1 i){AU1 b=((i    )&255u)^0x80u;return AF1(b)-128.0;} // Zero-based form: flip the byte's MSB back before removing the bias.
+  AF1 ABsc1FromZbU1(AU1 i){AU1 b=((i>> 8)&255u)^0x80u;return AF1(b)-128.0;}
+  AF1 ABsc2FromZbU1(AU1 i){AU1 b=((i>>16)&255u)^0x80u;return AF1(b)-128.0;}
+  AF1 ABsc3FromZbU1(AU1 i){AU1 b=((i>>24)&255u)^0x80u;return AF1(b)-128.0;}
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Scales by 1/32768 and adds the 0.25/32768 bias, then interleaves {x0,x1} and {y0,y1} into {{x0,y0},{x1,y1}} with APermGCEA.
+  AW2 ABsc01ToW2(AH2 x,AH2 y){x=x*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);y=y*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);AU2 packed=AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)));
+   return AW2_AU1(APermGCEA(packed));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AU2 ABsc0ToU2(AU2 d,AH2 i){AU1 bits=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0))); // Biased encode, then one byte inserted into d.x and one into d.y.
+   AU2 r;r.x=APermHGFA(AU2(d.x,bits));r.y=APermHGFC(AU2(d.y,bits));return r;}
+  AU2 ABsc1ToU2(AU2 d,AH2 i){AU1 bits=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   AU2 r;r.x=APermHGAE(AU2(d.x,bits));r.y=APermHGCE(AU2(d.y,bits));return r;}
+  AU2 ABsc2ToU2(AU2 d,AH2 i){AU1 bits=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   AU2 r;r.x=APermHAFE(AU2(d.x,bits));r.y=APermHCFE(AU2(d.y,bits));return r;}
+  AU2 ABsc3ToU2(AU2 d,AH2 i){AU1 bits=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   AU2 r;r.x=APermAGFE(AU2(d.x,bits));r.y=APermCGFE(AU2(d.y,bits));return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AU2 ABsc0ToZbU2(AU2 d,AH2 i){AU1 bits=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));bits^=0x00800080u; // Zero-based form: flips the MSB of both encoded bytes before insertion.
+   AU2 r;r.x=APermHGFA(AU2(d.x,bits));r.y=APermHGFC(AU2(d.y,bits));return r;}
+  AU2 ABsc1ToZbU2(AU2 d,AH2 i){AU1 bits=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));bits^=0x00800080u;
+   AU2 r;r.x=APermHGAE(AU2(d.x,bits));r.y=APermHGCE(AU2(d.y,bits));return r;}
+  AU2 ABsc2ToZbU2(AU2 d,AH2 i){AU1 bits=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));bits^=0x00800080u;
+   AU2 r;r.x=APermHAFE(AU2(d.x,bits));r.y=APermHCFE(AU2(d.y,bits));return r;}
+  AU2 ABsc3ToZbU2(AU2 d,AH2 i){AU1 bits=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));bits^=0x00800080u;
+   AU2 r;r.x=APermAGFE(AU2(d.x,bits));r.y=APermCGFE(AU2(d.y,bits));return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH2 ABsc0FromU2(AU2 i){AH2 raw=AH2_AW2(AW2_AU1(APerm0E0A(i)));return raw*AH2_(32768.0)-AH2_(0.25);} // Gather one byte from each of i.x and i.y, scale by 32768, remove the 0.25 bias.
+  AH2 ABsc1FromU2(AU2 i){AH2 raw=AH2_AW2(AW2_AU1(APerm0F0B(i)));return raw*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc2FromU2(AU2 i){AH2 raw=AH2_AW2(AW2_AU1(APerm0G0C(i)));return raw*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc3FromU2(AU2 i){AH2 raw=AH2_AW2(AW2_AU1(APerm0H0D(i)));return raw*AH2_(32768.0)-AH2_(0.25);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH2 ABsc0FromZbU2(AU2 i){AU1 bits=APerm0E0A(i)^0x00800080u;return AH2_AW2(AW2_AU1(bits))*AH2_(32768.0)-AH2_(0.25);} // Zero-based form: flip both byte MSBs back before decoding.
+  AH2 ABsc1FromZbU2(AU2 i){AU1 bits=APerm0F0B(i)^0x00800080u;return AH2_AW2(AW2_AU1(bits))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc2FromZbU2(AU2 i){AU1 bits=APerm0G0C(i)^0x00800080u;return AH2_AW2(AW2_AU1(bits))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc3FromZbU2(AU2 i){AU1 bits=APerm0H0D(i)^0x00800080u;return AH2_AW2(AW2_AU1(bits))*AH2_(32768.0)-AH2_(0.25);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     HALF APPROXIMATIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// These support only positive inputs.
+// Did not see value yet in specialization for range.
+// Using quick testing, ended up mostly getting the same "best" approximation for various ranges.
+// With hardware that can co-execute transcendentals, the value in approximations could be less than expected.
+// However from a latency perspective, if execution of a transcendental is 4 clk, with no packed support, -> 8 clk total.
+// And co-execution would require a compiler interleaving a lot of independent work for packed usage.
+//------------------------------------------------------------------------------------------------------------------------------
+// The one Newton Raphson iteration form of rsq() was skipped (requires 6 ops total).
+// Same with sqrt(), as this could be x*rsq() (7 ops).
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Square root estimate from integer math on the half's bits (shift right by one, add 0x1de2), 2 ops; minimizes squared error across the full positive range.
+  // The 0x1de2 based approximation maps {0 to 1} input to < 1 output.
+  AH1 APrxLoSqrtH1(AH1 a){AW1 bits=(AW1_AH1(a)>>AW1_(1))+AW1_(0x1de2);return AH1_AW1(bits);}
+  AH2 APrxLoSqrtH2(AH2 a){AW2 bits=(AW2_AH2(a)>>AW2_(1))+AW2_(0x1de2);return AH2_AW2(bits);}
+  AH3 APrxLoSqrtH3(AH3 a){AW3 bits=(AW3_AH3(a)>>AW3_(1))+AW3_(0x1de2);return AH3_AW3(bits);}
+  AH4 APrxLoSqrtH4(AH4 a){AW4 bits=(AW4_AH4(a)>>AW4_(1))+AW4_(0x1de2);return AH4_AW4(bits);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Low precision reciprocal estimate, 1 op: subtract the half's bits from 0x7784.
+  // Minimize squared error across {smallest normal to 16384.0}.
+  AH1 APrxLoRcpH1(AH1 a){AW1 bits=AW1_(0x7784)-AW1_AH1(a);return AH1_AW1(bits);}
+  AH2 APrxLoRcpH2(AH2 a){AW2 bits=AW2_(0x7784)-AW2_AH2(a);return AH2_AW2(bits);}
+  AH3 APrxLoRcpH3(AH3 a){AW3 bits=AW3_(0x7784)-AW3_AH3(a);return AH3_AW3(bits);}
+  AH4 APrxLoRcpH4(AH4 a){AW4 bits=AW4_(0x7784)-AW4_AH4(a);return AH4_AW4(bits);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Medium precision reciprocal, 3 ops: bit-trick seed from 0x778d refined by one Newton Raphson iteration est*(2-est*a).
+  AH1 APrxMedRcpH1(AH1 a){AH1 est=AH1_AW1(AW1_(0x778d)-AW1_AH1(a));return est*(-est*a+AH1_(2.0));}
+  AH2 APrxMedRcpH2(AH2 a){AH2 est=AH2_AW2(AW2_(0x778d)-AW2_AH2(a));return est*(-est*a+AH2_(2.0));}
+  AH3 APrxMedRcpH3(AH3 a){AH3 est=AH3_AW3(AW3_(0x778d)-AW3_AH3(a));return est*(-est*a+AH3_(2.0));}
+  AH4 APrxMedRcpH4(AH4 a){AH4 est=AH4_AW4(AW4_(0x778d)-AW4_AH4(a));return est*(-est*a+AH4_(2.0));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Reciprocal square root estimate, 2 ops (0x59a3 minus the half's bits shifted right by one); minimizes squared error across {smallest normal to 16384.0}.
+  AH1 APrxLoRsqH1(AH1 a){AW1 bits=AW1_(0x59a3)-(AW1_AH1(a)>>AW1_(1));return AH1_AW1(bits);}
+  AH2 APrxLoRsqH2(AH2 a){AW2 bits=AW2_(0x59a3)-(AW2_AH2(a)>>AW2_(1));return AH2_AW2(bits);}
+  AH3 APrxLoRsqH3(AH3 a){AW3 bits=AW3_(0x59a3)-(AW3_AH3(a)>>AW3_(1));return AH3_AW3(bits);}
+  AH4 APrxLoRsqH4(AH4 a){AW4 bits=AW4_(0x59a3)-(AW4_AH4(a)>>AW4_(1));return AH4_AW4(bits);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                    FLOAT APPROXIMATIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// Michal Drobot has an excellent presentation on these: "Low Level Optimizations For GCN",
+//  - Idea dates back to SGI, then to Quake 3, etc.
+//  - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf
+//     - sqrt(x)=rsqrt(x)*x
+//     - rcp(x)=rsqrt(x)*rsqrt(x) for positive x
+//  - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h
+//------------------------------------------------------------------------------------------------------------------------------
+// These below are from perhaps less complete searching for optimal.
+// Used FP16 normal range for testing with +4096 32-bit step size for sampling error.
+// So these match up well with the half approximations.
+//==============================================================================================================================
+ AF1 APrxLoSqrtF1(AF1 a){return AF1_AU1((AU1_AF1(a)>>AU1_(1))+AU1_(0x1fbc4639));} // sqrt estimate: (AU1_AF1(a)>>1) plus a constant.
+ AF1 APrxLoRcpF1(AF1 a){return AF1_AU1(AU1_(0x7ef07ebb)-AU1_AF1(a));} // 1/a estimate: constant minus AU1_AF1(a).
+ AF1 APrxMedRcpF1(AF1 a){AF1 b=AF1_AU1(AU1_(0x7ef19fff)-AU1_AF1(a));return b*(-b*a+AF1_(2.0));} // Seed b, then one refinement: b*(2-a*b).
+ AF1 APrxLoRsqF1(AF1 a){return AF1_AU1(AU1_(0x5f347d74)-(AU1_AF1(a)>>AU1_(1)));} // rsqrt estimate: constant minus (AU1_AF1(a)>>1).
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 APrxLoSqrtF2(AF2 a){return AF2_AU2((AU2_AF2(a)>>AU2_(1))+AU2_(0x1fbc4639));} // AF2 forms: same constants and operations as the AF1 ones.
+ AF2 APrxLoRcpF2(AF2 a){return AF2_AU2(AU2_(0x7ef07ebb)-AU2_AF2(a));} // 1/a estimate.
+ AF2 APrxMedRcpF2(AF2 a){AF2 b=AF2_AU2(AU2_(0x7ef19fff)-AU2_AF2(a));return b*(-b*a+AF2_(2.0));} // Seed b, then b*(2-a*b).
+ AF2 APrxLoRsqF2(AF2 a){return AF2_AU2(AU2_(0x5f347d74)-(AU2_AF2(a)>>AU2_(1)));} // rsqrt estimate.
+//------------------------------------------------------------------------------------------------------------------------------
+ AF3 APrxLoSqrtF3(AF3 a){return AF3_AU3((AU3_AF3(a)>>AU3_(1))+AU3_(0x1fbc4639));} // AF3 forms: same constants and operations as the AF1 ones.
+ AF3 APrxLoRcpF3(AF3 a){return AF3_AU3(AU3_(0x7ef07ebb)-AU3_AF3(a));} // 1/a estimate.
+ AF3 APrxMedRcpF3(AF3 a){AF3 b=AF3_AU3(AU3_(0x7ef19fff)-AU3_AF3(a));return b*(-b*a+AF3_(2.0));} // Seed b, then b*(2-a*b).
+ AF3 APrxLoRsqF3(AF3 a){return AF3_AU3(AU3_(0x5f347d74)-(AU3_AF3(a)>>AU3_(1)));} // rsqrt estimate.
+//------------------------------------------------------------------------------------------------------------------------------
+ AF4 APrxLoSqrtF4(AF4 a){return AF4_AU4((AU4_AF4(a)>>AU4_(1))+AU4_(0x1fbc4639));} // AF4 forms: same constants and operations as the AF1 ones.
+ AF4 APrxLoRcpF4(AF4 a){return AF4_AU4(AU4_(0x7ef07ebb)-AU4_AF4(a));} // 1/a estimate.
+ AF4 APrxMedRcpF4(AF4 a){AF4 b=AF4_AU4(AU4_(0x7ef19fff)-AU4_AF4(a));return b*(-b*a+AF4_(2.0));} // Seed b, then b*(2-a*b).
+ AF4 APrxLoRsqF4(AF4 a){return AF4_AU4(AU4_(0x5f347d74)-(AU4_AF4(a)>>AU4_(1)));} // rsqrt estimate.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                    PQ APPROXIMATIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// PQ is very close to x^(1/8). The functions below use the fast float approximation method to do
+// PQ<~>Gamma2 (4th power and fast 4th root) and PQ<~>Linear (8th power and fast 8th root). Maximum error is ~0.2%.
+//==============================================================================================================================
+// Helpers
+ AF1 Quart(AF1 a) { AF1 sq = a * a; return sq * sq; } // a^4 as (a^2)^2.
+ AF1 Oct(AF1 a) { AF1 sq = a * a; AF1 qd = sq * sq; return qd * qd; } // a^8 as ((a^2)^2)^2.
+ AF2 Quart(AF2 a) { AF2 sq = a * a; return sq * sq; } // Per-lane a^4.
+ AF2 Oct(AF2 a) { AF2 sq = a * a; AF2 qd = sq * sq; return qd * qd; } // Per-lane a^8.
+ AF3 Quart(AF3 a) { AF3 sq = a * a; return sq * sq; } // Per-lane a^4.
+ AF3 Oct(AF3 a) { AF3 sq = a * a; AF3 qd = sq * sq; return qd * qd; } // Per-lane a^8.
+ AF4 Quart(AF4 a) { AF4 sq = a * a; return sq * sq; } // Per-lane a^4.
+ AF4 Oct(AF4 a) { AF4 sq = a * a; AF4 qd = sq * sq; return qd * qd; } // Per-lane a^8.
+ //------------------------------------------------------------------------------------------------------------------------------
+ AF1 APrxPQToGamma2(AF1 a) { return Quart(a); } // 4th power.
+ AF1 APrxPQToLinear(AF1 a) { return Oct(a); } // 8th power.
+ AF1 APrxLoGamma2ToPQ(AF1 a) { return AF1_AU1((AU1_AF1(a) >> AU1_(2)) + AU1_(0x2F9A4E46)); } // Fast 4th-root estimate.
+ AF1 APrxMedGamma2ToPQ(AF1 a) { AF1 b = AF1_AU1((AU1_AF1(a) >> AU1_(2)) + AU1_(0x2F9A4E46)); AF1 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); } // Lo estimate + one Newton step on b^4=a.
+ AF1 APrxHighGamma2ToPQ(AF1 a) { return sqrt(sqrt(a)); } // 4th root via two sqrt.
+ AF1 APrxLoLinearToPQ(AF1 a) { return AF1_AU1((AU1_AF1(a) >> AU1_(3)) + AU1_(0x378D8723)); } // Fast 8th-root estimate.
+ AF1 APrxMedLinearToPQ(AF1 a) { AF1 b = AF1_AU1((AU1_AF1(a) >> AU1_(3)) + AU1_(0x378D8723)); AF1 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); } // Lo estimate + one Newton step on b^8=a.
+ AF1 APrxHighLinearToPQ(AF1 a) { return sqrt(sqrt(sqrt(a))); } // 8th root via three sqrt.
+ //------------------------------------------------------------------------------------------------------------------------------
+ AF2 APrxPQToGamma2(AF2 a) { return Quart(a); } // AF2 overloads: per-lane 4th power.
+ AF2 APrxPQToLinear(AF2 a) { return Oct(a); } // Per-lane 8th power.
+ AF2 APrxLoGamma2ToPQ(AF2 a) { return AF2_AU2((AU2_AF2(a) >> AU2_(2)) + AU2_(0x2F9A4E46)); } // Fast 4th-root estimate.
+ AF2 APrxMedGamma2ToPQ(AF2 a) { AF2 b = AF2_AU2((AU2_AF2(a) >> AU2_(2)) + AU2_(0x2F9A4E46)); AF2 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); } // Lo estimate + one Newton step; AF1_(4.0) scales every lane.
+ AF2 APrxHighGamma2ToPQ(AF2 a) { return sqrt(sqrt(a)); } // 4th root via two sqrt.
+ AF2 APrxLoLinearToPQ(AF2 a) { return AF2_AU2((AU2_AF2(a) >> AU2_(3)) + AU2_(0x378D8723)); } // Fast 8th-root estimate.
+ AF2 APrxMedLinearToPQ(AF2 a) { AF2 b = AF2_AU2((AU2_AF2(a) >> AU2_(3)) + AU2_(0x378D8723)); AF2 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); } // Lo estimate + one Newton step; AF1_(8.0) scales every lane.
+ AF2 APrxHighLinearToPQ(AF2 a) { return sqrt(sqrt(sqrt(a))); } // 8th root via three sqrt.
+ //------------------------------------------------------------------------------------------------------------------------------
+ AF3 APrxPQToGamma2(AF3 a) { return Quart(a); } // AF3 overloads: per-lane 4th power.
+ AF3 APrxPQToLinear(AF3 a) { return Oct(a); } // Per-lane 8th power.
+ AF3 APrxLoGamma2ToPQ(AF3 a) { return AF3_AU3((AU3_AF3(a) >> AU3_(2)) + AU3_(0x2F9A4E46)); } // Fast 4th-root estimate.
+ AF3 APrxMedGamma2ToPQ(AF3 a) { AF3 b = AF3_AU3((AU3_AF3(a) >> AU3_(2)) + AU3_(0x2F9A4E46)); AF3 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); } // Lo estimate + one Newton step; AF1_(4.0) scales every lane.
+ AF3 APrxHighGamma2ToPQ(AF3 a) { return sqrt(sqrt(a)); } // 4th root via two sqrt.
+ AF3 APrxLoLinearToPQ(AF3 a) { return AF3_AU3((AU3_AF3(a) >> AU3_(3)) + AU3_(0x378D8723)); } // Fast 8th-root estimate.
+ AF3 APrxMedLinearToPQ(AF3 a) { AF3 b = AF3_AU3((AU3_AF3(a) >> AU3_(3)) + AU3_(0x378D8723)); AF3 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); } // Lo estimate + one Newton step; AF1_(8.0) scales every lane.
+ AF3 APrxHighLinearToPQ(AF3 a) { return sqrt(sqrt(sqrt(a))); } // 8th root via three sqrt.
+ //------------------------------------------------------------------------------------------------------------------------------
+ AF4 APrxPQToGamma2(AF4 a) { return Quart(a); } // AF4 overloads: per-lane 4th power.
+ AF4 APrxPQToLinear(AF4 a) { return Oct(a); } // Per-lane 8th power.
+ AF4 APrxLoGamma2ToPQ(AF4 a) { return AF4_AU4((AU4_AF4(a) >> AU4_(2)) + AU4_(0x2F9A4E46)); } // Fast 4th-root estimate.
+ AF4 APrxMedGamma2ToPQ(AF4 a) { AF4 b = AF4_AU4((AU4_AF4(a) >> AU4_(2)) + AU4_(0x2F9A4E46)); AF4 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); } // Lo estimate + one Newton step; AF1_(4.0) scales every lane.
+ AF4 APrxHighGamma2ToPQ(AF4 a) { return sqrt(sqrt(a)); } // 4th root via two sqrt.
+ AF4 APrxLoLinearToPQ(AF4 a) { return AF4_AU4((AU4_AF4(a) >> AU4_(3)) + AU4_(0x378D8723)); } // Fast 8th-root estimate.
+ AF4 APrxMedLinearToPQ(AF4 a) { AF4 b = AF4_AU4((AU4_AF4(a) >> AU4_(3)) + AU4_(0x378D8723)); AF4 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); } // Lo estimate + one Newton step; AF1_(8.0) scales every lane.
+ AF4 APrxHighLinearToPQ(AF4 a) { return sqrt(sqrt(sqrt(a))); } // 8th root via three sqrt.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                    PARABOLIC SIN & COS
+//------------------------------------------------------------------------------------------------------------------------------
+// Approximate answers to transcendental questions.
+//------------------------------------------------------------------------------------------------------------------------------
+//==============================================================================================================================
+ #if 1
+  // Valid input range is {-1 to 1} representing {0 to 2 pi}.
+  // Output range is {-1/4 to 1/4} representing {-1 to 1}.
+  AF1 APSinF1(AF1 x){return x*abs(x)-x;} // MAD. Parabola x*|x|-x, zero at x = -1, 0 and 1.
+  AF2 APSinF2(AF2 x){return x*abs(x)-x;} // Per-lane form of APSinF1.
+  AF1 APCosF1(AF1 x){x=AFractF1(x*AF1_(0.5)+AF1_(0.75));x=x*AF1_(2.0)-AF1_(1.0);return APSinF1(x);} // 3x MAD, FRACT. Phase is remapped into {-1 to 1}, then fed to APSinF1.
+  AF2 APCosF2(AF2 x){x=AFractF2(x*AF2_(0.5)+AF2_(0.75));x=x*AF2_(2.0)-AF2_(1.0);return APSinF2(x);} // Per-lane form of APCosF1.
+  AF2 APSinCosF1(AF1 x){AF1 y=AFractF1(x*AF1_(0.5)+AF1_(0.75));y=y*AF1_(2.0)-AF1_(1.0);return APSinF2(AF2(x,y));} // Returns {APSin(x), APCos(x)}: y is the cosine phase remap.
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_HALF
+  // For a packed {sin,cos} pair,
+  //  - Native takes 16 clocks and 4 issue slots (no packed transcendentals).
+  //  - Parabolic takes 8 clocks and 8 issue slots (only fract is non-packed).
+  AH1 APSinH1(AH1 x){return x*abs(x)-x;} // Parabola x*|x|-x, same expression as the float form.
+  AH2 APSinH2(AH2 x){return x*abs(x)-x;} // AND,FMA
+  AH1 APCosH1(AH1 x){x=AFractH1(x*AH1_(0.5)+AH1_(0.75));x=x*AH1_(2.0)-AH1_(1.0);return APSinH1(x);} // Phase is remapped into {-1 to 1}, then fed to APSinH1.
+  AH2 APCosH2(AH2 x){x=AFractH2(x*AH2_(0.5)+AH2_(0.75));x=x*AH2_(2.0)-AH2_(1.0);return APSinH2(x);} // 3x FMA, 2xFRACT, AND
+  AH2 APSinCosH1(AH1 x){AH1 y=AFractH1(x*AH1_(0.5)+AH1_(0.75));y=y*AH1_(2.0)-AH1_(1.0);return APSinH2(AH2(x,y));} // Returns {APSin(x), APCos(x)}: y is the cosine phase remap.
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     [ZOL] ZERO ONE LOGIC
+//------------------------------------------------------------------------------------------------------------------------------
+// Conditional free logic designed for easy 16-bit packing, and backwards porting to 32-bit.
+//------------------------------------------------------------------------------------------------------------------------------
+// 0 := false
+// 1 := true
+//------------------------------------------------------------------------------------------------------------------------------
+// AndNot(x,y)   -> !(x&y) .... One op.
+// AndOr(x,y,z)  -> (x&y)|z ... One op.
+// GtZero(x)     -> x>0.0 ..... One op.
+// Sel(x,y,z)    -> x?y:z ..... Two ops, has no precision loss.
+// Signed(x)     -> x<0.0 ..... One op.
+// ZeroPass(x,y) -> x?0:y ..... Two ops, 'y' is a pass through safe for aliasing as integer.
+//------------------------------------------------------------------------------------------------------------------------------
+// OPTIMIZATION NOTES
+// ==================
+// - On Vega to use 2 constants in a packed op, pass in as one AW2 or one AH2 'k.xy' and use as 'k.xx' and 'k.yy'.
+//   For example 'a.xy*k.xx+k.yy'.
+//==============================================================================================================================
+ #if 1
+  AU1 AZolAndU1(AU1 x,AU1 y){return min(x,y);} // AND of 0/1 values is their minimum.
+  AU2 AZolAndU2(AU2 x,AU2 y){return min(x,y);}
+  AU3 AZolAndU3(AU3 x,AU3 y){return min(x,y);}
+  AU4 AZolAndU4(AU4 x,AU4 y){return min(x,y);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AU1 AZolNotU1(AU1 x){return x^AU1_(1);} // NOT flips the low bit, swapping 0 and 1.
+  AU2 AZolNotU2(AU2 x){return x^AU2_(1);}
+  AU3 AZolNotU3(AU3 x){return x^AU3_(1);}
+  AU4 AZolNotU4(AU4 x){return x^AU4_(1);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AU1 AZolOrU1(AU1 x,AU1 y){return max(x,y);} // OR of 0/1 values is their maximum.
+  AU2 AZolOrU2(AU2 x,AU2 y){return max(x,y);}
+  AU3 AZolOrU3(AU3 x,AU3 y){return max(x,y);}
+  AU4 AZolOrU4(AU4 x,AU4 y){return max(x,y);}
+//==============================================================================================================================
+  AU1 AZolF1ToU1(AF1 x){return AU1(x);} // Constructor-style conversion of a 0.0/1.0 float to an unsigned 0/1.
+  AU2 AZolF2ToU2(AF2 x){return AU2(x);}
+  AU3 AZolF3ToU3(AF3 x){return AU3(x);}
+  AU4 AZolF4ToU4(AF4 x){return AU4(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // 2 ops, denormals don't work in 32-bit on PC (and if they are enabled, OMOD is disabled).
+  AU1 AZolNotF1ToU1(AF1 x){return AU1(AF1_(1.0)-x);} // Converts 1-x, i.e. the inverted truth value.
+  AU2 AZolNotF2ToU2(AF2 x){return AU2(AF2_(1.0)-x);}
+  AU3 AZolNotF3ToU3(AF3 x){return AU3(AF3_(1.0)-x);}
+  AU4 AZolNotF4ToU4(AF4 x){return AU4(AF4_(1.0)-x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AZolU1ToF1(AU1 x){return AF1(x);} // Constructor-style conversion of an unsigned 0/1 to a 0.0/1.0 float.
+  AF2 AZolU2ToF2(AU2 x){return AF2(x);}
+  AF3 AZolU3ToF3(AU3 x){return AF3(x);}
+  AF4 AZolU4ToF4(AU4 x){return AF4(x);}
+//==============================================================================================================================
+  AF1 AZolAndF1(AF1 x,AF1 y){return min(x,y);} // AND of 0.0/1.0 values is their minimum.
+  AF2 AZolAndF2(AF2 x,AF2 y){return min(x,y);}
+  AF3 AZolAndF3(AF3 x,AF3 y){return min(x,y);}
+  AF4 AZolAndF4(AF4 x,AF4 y){return min(x,y);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AZolAndNotF1(AF1 x,AF1 y){return (-x)*y+AF1_(1.0);} AF1 ASolAndNotF1(AF1 x,AF1 y){return AZolAndNotF1(x,y);} // !(x&y) as 1-x*y; 'ASol' spelling kept for existing callers.
+  AF2 AZolAndNotF2(AF2 x,AF2 y){return (-x)*y+AF2_(1.0);} AF2 ASolAndNotF2(AF2 x,AF2 y){return AZolAndNotF2(x,y);} // Per-lane form.
+  AF3 AZolAndNotF3(AF3 x,AF3 y){return (-x)*y+AF3_(1.0);} AF3 ASolAndNotF3(AF3 x,AF3 y){return AZolAndNotF3(x,y);} // Per-lane form.
+  AF4 AZolAndNotF4(AF4 x,AF4 y){return (-x)*y+AF4_(1.0);} AF4 ASolAndNotF4(AF4 x,AF4 y){return AZolAndNotF4(x,y);} // Per-lane form.
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AZolAndOrF1(AF1 x,AF1 y,AF1 z){return ASatF1(x*y+z);} // (x&y)|z: multiply-add, then ASatF1 on the sum.
+  AF2 AZolAndOrF2(AF2 x,AF2 y,AF2 z){return ASatF2(x*y+z);}
+  AF3 AZolAndOrF3(AF3 x,AF3 y,AF3 z){return ASatF3(x*y+z);}
+  AF4 AZolAndOrF4(AF4 x,AF4 y,AF4 z){return ASatF4(x*y+z);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AZolGtZeroF1(AF1 x){return ASatF1(x*AF1_(A_INFP_F));} // x>0 test: scale by A_INFP_F (presumably +INF), then ASatF1. x==0 gives 0*INF; result depends on ASatF1 - TODO confirm.
+  AF2 AZolGtZeroF2(AF2 x){return ASatF2(x*AF2_(A_INFP_F));}
+  AF3 AZolGtZeroF3(AF3 x){return ASatF3(x*AF3_(A_INFP_F));}
+  AF4 AZolGtZeroF4(AF4 x){return ASatF4(x*AF4_(A_INFP_F));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AZolNotF1(AF1 x){return AF1_(1.0)-x;} // 1-x swaps 0.0 and 1.0.
+  AF2 AZolNotF2(AF2 x){return AF2_(1.0)-x;}
+  AF3 AZolNotF3(AF3 x){return AF3_(1.0)-x;}
+  AF4 AZolNotF4(AF4 x){return AF4_(1.0)-x;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AZolOrF1(AF1 x,AF1 y){return max(x,y);} // OR of 0.0/1.0 values is their maximum.
+  AF2 AZolOrF2(AF2 x,AF2 y){return max(x,y);}
+  AF3 AZolOrF3(AF3 x,AF3 y){return max(x,y);}
+  AF4 AZolOrF4(AF4 x,AF4 y){return max(x,y);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AZolSelF1(AF1 x,AF1 y,AF1 z){AF1 r=(-x)*z+z;return x*y+r;} // x?y:z as x*y+(z-x*z); for x of 0.0 or 1.0 and finite y,z this returns z or y unchanged.
+  AF2 AZolSelF2(AF2 x,AF2 y,AF2 z){AF2 r=(-x)*z+z;return x*y+r;}
+  AF3 AZolSelF3(AF3 x,AF3 y,AF3 z){AF3 r=(-x)*z+z;return x*y+r;}
+  AF4 AZolSelF4(AF4 x,AF4 y,AF4 z){AF4 r=(-x)*z+z;return x*y+r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AZolSignedF1(AF1 x){return ASatF1(x*AF1_(A_INFN_F));} // x<0 test: scale by A_INFN_F (presumably -INF), then ASatF1.
+  AF2 AZolSignedF2(AF2 x){return ASatF2(x*AF2_(A_INFN_F));}
+  AF3 AZolSignedF3(AF3 x){return ASatF3(x*AF3_(A_INFN_F));}
+  AF4 AZolSignedF4(AF4 x){return ASatF4(x*AF4_(A_INFN_F));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AZolZeroPassF1(AF1 x,AF1 y){return AF1_AU1((AU1_AF1(x)!=AU1_(0))?AU1_(0):AU1_AF1(y));} // x?0:y, with x tested and y passed through AU1_AF1.
+  AF2 AZolZeroPassF2(AF2 x,AF2 y){return AF2(AZolZeroPassF1(x.x,y.x),AZolZeroPassF1(x.y,y.y));} // Built per lane: GLSL vector '!=' yields one scalar bool, not a per-lane mask.
+  AF3 AZolZeroPassF3(AF3 x,AF3 y){return AF3(AZolZeroPassF1(x.x,y.x),AZolZeroPassF1(x.y,y.y),AZolZeroPassF1(x.z,y.z));} // Per lane, as above.
+  AF4 AZolZeroPassF4(AF4 x,AF4 y){return AF4(AZolZeroPassF1(x.x,y.x),AZolZeroPassF1(x.y,y.y),AZolZeroPassF1(x.z,y.z),AZolZeroPassF1(x.w,y.w));} // Per lane, as above.
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+  AW1 AZolAndW1(AW1 x,AW1 y){return min(x,y);} // AND of 0/1 values is their minimum.
+  AW2 AZolAndW2(AW2 x,AW2 y){return min(x,y);}
+  AW3 AZolAndW3(AW3 x,AW3 y){return min(x,y);}
+  AW4 AZolAndW4(AW4 x,AW4 y){return min(x,y);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AZolNotW1(AW1 x){return x^AW1_(1);} // NOT flips the low bit, swapping 0 and 1.
+  AW2 AZolNotW2(AW2 x){return x^AW2_(1);}
+  AW3 AZolNotW3(AW3 x){return x^AW3_(1);}
+  AW4 AZolNotW4(AW4 x){return x^AW4_(1);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AZolOrW1(AW1 x,AW1 y){return max(x,y);} // OR of 0/1 values is their maximum.
+  AW2 AZolOrW2(AW2 x,AW2 y){return max(x,y);}
+  AW3 AZolOrW3(AW3 x,AW3 y){return max(x,y);}
+  AW4 AZolOrW4(AW4 x,AW4 y){return max(x,y);}
+//==============================================================================================================================
+  // Uses denormal trick.
+  AW1 AZolH1ToW1(AH1 x){return AW1_AH1(x*AH1_AW1(AW1_(1)));} // Multiplies x by the half whose integer alias is 1, then reads the product back as an integer.
+  AW2 AZolH2ToW2(AH2 x){return AW2_AH2(x*AH2_AW2(AW2_(1)));}
+  AW3 AZolH3ToW3(AH3 x){return AW3_AH3(x*AH3_AW3(AW3_(1)));}
+  AW4 AZolH4ToW4(AH4 x){return AW4_AH4(x*AH4_AW4(AW4_(1)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // AMD arch lacks a packed conversion opcode.
+  AH1 AZolW1ToH1(AW1 x){return AH1_AW1(x*AW1_AH1(AH1_(1.0)));} // Integer 0/1 times the integer alias of 1.0, read back as a half.
+  AH2 AZolW2ToH2(AW2 x){return AH2_AW2(x*AW2_AH2(AH2_(1.0)));}
+  AH3 AZolW3ToH3(AW3 x){return AH3_AW3(x*AW3_AH3(AH3_(1.0)));} AH3 AZolW1ToH3(AW3 x){return AZolW3ToH3(x);} // Misnumbered 'W1' name kept for existing callers.
+  AH4 AZolW4ToH4(AW4 x){return AH4_AW4(x*AW4_AH4(AH4_(1.0)));} AH4 AZolW2ToH4(AW4 x){return AZolW4ToH4(x);} // Misnumbered 'W2' name kept for existing callers.
+//==============================================================================================================================
+  AH1 AZolAndH1(AH1 x,AH1 y){return min(x,y);} // AND of 0.0/1.0 values is their minimum.
+  AH2 AZolAndH2(AH2 x,AH2 y){return min(x,y);}
+  AH3 AZolAndH3(AH3 x,AH3 y){return min(x,y);}
+  AH4 AZolAndH4(AH4 x,AH4 y){return min(x,y);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AZolAndNotH1(AH1 x,AH1 y){return (-x)*y+AH1_(1.0);} AH1 ASolAndNotH1(AH1 x,AH1 y){return AZolAndNotH1(x,y);} // !(x&y) as 1-x*y; 'ASol' spelling kept for existing callers.
+  AH2 AZolAndNotH2(AH2 x,AH2 y){return (-x)*y+AH2_(1.0);} AH2 ASolAndNotH2(AH2 x,AH2 y){return AZolAndNotH2(x,y);} // Per-lane form.
+  AH3 AZolAndNotH3(AH3 x,AH3 y){return (-x)*y+AH3_(1.0);} AH3 ASolAndNotH3(AH3 x,AH3 y){return AZolAndNotH3(x,y);} // Per-lane form.
+  AH4 AZolAndNotH4(AH4 x,AH4 y){return (-x)*y+AH4_(1.0);} AH4 ASolAndNotH4(AH4 x,AH4 y){return AZolAndNotH4(x,y);} // Per-lane form.
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AZolAndOrH1(AH1 x,AH1 y,AH1 z){return ASatH1(x*y+z);} // (x&y)|z: multiply-add, then ASatH1 on the sum.
+  AH2 AZolAndOrH2(AH2 x,AH2 y,AH2 z){return ASatH2(x*y+z);}
+  AH3 AZolAndOrH3(AH3 x,AH3 y,AH3 z){return ASatH3(x*y+z);}
+  AH4 AZolAndOrH4(AH4 x,AH4 y,AH4 z){return ASatH4(x*y+z);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AZolGtZeroH1(AH1 x){return ASatH1(x*AH1_(A_INFP_H));} // x>0 test: scale by A_INFP_H (presumably +INF), then ASatH1.
+  AH2 AZolGtZeroH2(AH2 x){return ASatH2(x*AH2_(A_INFP_H));}
+  AH3 AZolGtZeroH3(AH3 x){return ASatH3(x*AH3_(A_INFP_H));}
+  AH4 AZolGtZeroH4(AH4 x){return ASatH4(x*AH4_(A_INFP_H));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AZolNotH1(AH1 x){return AH1_(1.0)-x;} // 1-x swaps 0.0 and 1.0.
+  AH2 AZolNotH2(AH2 x){return AH2_(1.0)-x;}
+  AH3 AZolNotH3(AH3 x){return AH3_(1.0)-x;}
+  AH4 AZolNotH4(AH4 x){return AH4_(1.0)-x;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AZolOrH1(AH1 x,AH1 y){return max(x,y);} // OR of 0.0/1.0 values is their maximum.
+  AH2 AZolOrH2(AH2 x,AH2 y){return max(x,y);}
+  AH3 AZolOrH3(AH3 x,AH3 y){return max(x,y);}
+  AH4 AZolOrH4(AH4 x,AH4 y){return max(x,y);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AZolSelH1(AH1 x,AH1 y,AH1 z){AH1 r=(-x)*z+z;return x*y+r;} // x?y:z as x*y+(z-x*z); for x of 0.0 or 1.0 and finite y,z this returns z or y unchanged.
+  AH2 AZolSelH2(AH2 x,AH2 y,AH2 z){AH2 r=(-x)*z+z;return x*y+r;}
+  AH3 AZolSelH3(AH3 x,AH3 y,AH3 z){AH3 r=(-x)*z+z;return x*y+r;}
+  AH4 AZolSelH4(AH4 x,AH4 y,AH4 z){AH4 r=(-x)*z+z;return x*y+r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AZolSignedH1(AH1 x){return ASatH1(x*AH1_(A_INFN_H));} // x<0 test: scale by A_INFN_H (presumably -INF), then ASatH1.
+  AH2 AZolSignedH2(AH2 x){return ASatH2(x*AH2_(A_INFN_H));}
+  AH3 AZolSignedH3(AH3 x){return ASatH3(x*AH3_(A_INFN_H));}
+  AH4 AZolSignedH4(AH4 x){return ASatH4(x*AH4_(A_INFN_H));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                      COLOR CONVERSIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// These are all linear to/from some other space (where 'linear' has been shortened out of the function name).
+// So 'ToGamma' is 'LinearToGamma', and 'FromGamma' is 'LinearFromGamma'.
+// These are branch free implementations.
+// The AToSrgbF1() function is useful for stores for compute shaders for GPUs without hardware linear->sRGB store conversion.
+//------------------------------------------------------------------------------------------------------------------------------
+// TRANSFER FUNCTIONS
+// ==================
+// 709 ..... Rec709 used for some HDTVs
+// Gamma ... Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native
+// Pq ...... PQ native for HDR10
+// Srgb .... The sRGB output, typical of PC displays, useful for 10-bit output, or storing to 8-bit UNORM without SRGB type
+// Two ..... Gamma 2.0, fastest conversion (useful for intermediate pass approximations)
+// Three ... Gamma 3.0, less fast, but good for HDR.
+//------------------------------------------------------------------------------------------------------------------------------
+// KEEPING TO SPEC
+// ===============
+// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times.
+//  (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range).
+//  (b.) For 8-bit  709, steps {0 to 20.7} are in the linear region (8% of the encoding range).
+// Also there is a slight step in the transition regions.
+// Precision of the coefficients in the spec being the likely cause.
+// Main usage case of the sRGB code is to do the linear->sRGB conversion in a compute shader before store.
+// This is to work around lack of hardware (typically only ROP does the conversion for free).
+// To "correct" the linear segment, would be to introduce error, because hardware decode of sRGB->linear is fixed (and free).
+// So this header keeps with the spec.
+// For linear->sRGB transforms, the linear segment in some respects reduces error, because rounding in that region is linear.
+// Rounding in the curved region in hardware (and fast software code) introduces error due to rounding in non-linear.
+//------------------------------------------------------------------------------------------------------------------------------
+// FOR PQ
+// ======
+// Both input and output are {0.0-1.0}, where output 1.0 represents 10000.0 cd/m^2.
+// All constants are only specified to FP32 precision.
+// External PQ source reference,
+//  - https://github.com/ampas/aces-dev/blob/master/transforms/ctl/utilities/ACESlib.Utilities_Color.a1.0.1.ctl
+//------------------------------------------------------------------------------------------------------------------------------
+// PACKED VERSIONS
+// ===============
+// These are the A*H2() functions.
+// There are no PQ functions, as FP16 did not seem to have enough precision for the conversion.
+// The remaining functions are "good enough" for 8-bit, and maybe 10-bit if not concerned about a few 1-bit errors.
+// Precision is lowest in the 709 conversion, higher in sRGB, higher still in Two and Gamma (when using 2.2 at least).
+//------------------------------------------------------------------------------------------------------------------------------
+// NOTES
+// =====
+// Could be faster for PQ conversions to be in ALU or a texture lookup depending on usage case.
+//==============================================================================================================================
+ #if 1
+  AF1 ATo709F1(AF1 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);AF1 l=c*j.y;AF1 p=pow(c,j.z)*k.x+k.y; // Linear->Rec.709: l is the linear segment, p the power curve.
+   return max(min(l,p),min(max(l,p),j.x));} // Explicit median of {knee j.x, l, p}; clamp(j.x,l,p) is undefined below the knee, where l>p.
+  AF2 ATo709F2(AF2 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);AF2 l=c*j.yy;AF2 p=pow(c,j.zz)*k.xx+k.yy; // Per-lane form.
+   return max(min(l,p),min(max(l,p),j.xx));} // Per-lane median of {knee, l, p}.
+  AF3 ATo709F3(AF3 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);AF3 l=c*j.yyy;AF3 p=pow(c,j.zzz)*k.xxx+k.yyy; // Per-lane form.
+   return max(min(l,p),min(max(l,p),j.xxx));} // Per-lane median of {knee, l, p}.
+//------------------------------------------------------------------------------------------------------------------------------
+  // Note 'rcpX' is '1/x', where the 'x' is what would be used in AFromGamma().
+  AF1 AToGammaF1(AF1 c,AF1 rcpX){return pow(c,AF1_(rcpX));} // c raised to rcpX.
+  AF2 AToGammaF2(AF2 c,AF1 rcpX){return pow(c,AF2_(rcpX));} // Scalar rcpX applied to both lanes.
+  AF3 AToGammaF3(AF3 c,AF1 rcpX){return pow(c,AF3_(rcpX));} // Scalar rcpX applied to all three lanes.
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AToPqF1(AF1 x){AF1 p=pow(x,AF1_(0.159302)); // Linear->PQ: p=x^0.159302, then a rational function of p raised to 78.8438.
+   return pow((AF1_(0.835938)+AF1_(18.8516)*p)/(AF1_(1.0)+AF1_(18.6875)*p),AF1_(78.8438));}
+  AF2 AToPqF2(AF2 x){AF2 p=pow(x,AF2_(0.159302)); // Per-lane form, named to match its AF2 argument.
+   return pow((AF2_(0.835938)+AF2_(18.8516)*p)/(AF2_(1.0)+AF2_(18.6875)*p),AF2_(78.8438));} AF2 AToPqF1(AF2 x){return AToPqF2(x);} // 'F1'-suffixed overload kept for existing callers.
+  AF3 AToPqF3(AF3 x){AF3 p=pow(x,AF3_(0.159302)); // Per-lane form, named to match its AF3 argument.
+   return pow((AF3_(0.835938)+AF3_(18.8516)*p)/(AF3_(1.0)+AF3_(18.6875)*p),AF3_(78.8438));} AF3 AToPqF1(AF3 x){return AToPqF3(x);} // 'F1'-suffixed overload kept for existing callers.
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AToSrgbF1(AF1 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);AF1 l=c*j.y;AF1 p=pow(c,j.z)*k.x+k.y; // Linear->sRGB: l is the linear segment, p the power curve.
+   return max(min(l,p),min(max(l,p),j.x));} // Explicit median of {knee j.x, l, p}; clamp(j.x,l,p) is undefined below the knee, where l>p.
+  AF2 AToSrgbF2(AF2 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);AF2 l=c*j.yy;AF2 p=pow(c,j.zz)*k.xx+k.yy; // Per-lane form.
+   return max(min(l,p),min(max(l,p),j.xx));} // Per-lane median of {knee, l, p}.
+  AF3 AToSrgbF3(AF3 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);AF3 l=c*j.yyy;AF3 p=pow(c,j.zzz)*k.xxx+k.yyy; // Per-lane form.
+   return max(min(l,p),min(max(l,p),j.xxx));} // Per-lane median of {knee, l, p}.
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AToTwoF1(AF1 c){return sqrt(c);} // Gamma 2.0 encode is a square root.
+  AF2 AToTwoF2(AF2 c){return sqrt(c);}
+  AF3 AToTwoF3(AF3 c){return sqrt(c);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AToThreeF1(AF1 c){return pow(c,AF1_(1.0/3.0));} // Gamma 3.0 encode is a cube root via pow.
+  AF2 AToThreeF2(AF2 c){return pow(c,AF2_(1.0/3.0));}
+  AF3 AToThreeF3(AF3 c){return pow(c,AF3_(1.0/3.0));}
+ #endif
+//==============================================================================================================================
+ #if 1
+  // Unfortunately median won't work here.
+  AF1 AFrom709F1(AF1 c){AF3 j=AF3(0.081,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099); // Rec.709->linear. j.x is the knee in the encoded domain (0.018*4.5), since c is encoded.
+   return AZolSelF1(AZolSignedF1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));} // Below the knee: c/4.5; otherwise ((c+0.099)/1.099)^(1/0.45).
+  AF2 AFrom709F2(AF2 c){AF3 j=AF3(0.081,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099); // Per-lane form, encoded-domain knee.
+   return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
+  AF3 AFrom709F3(AF3 c){AF3 j=AF3(0.081,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099); // Per-lane form, encoded-domain knee.
+   return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AFromGammaF1(AF1 c,AF1 x){return pow(c,AF1_(x));} // c raised to x.
+  AF2 AFromGammaF2(AF2 c,AF1 x){return pow(c,AF2_(x));} // Scalar x applied to both lanes.
+  AF3 AFromGammaF3(AF3 c,AF1 x){return pow(c,AF3_(x));} // Scalar x applied to all three lanes.
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AFromPqF1(AF1 x){AF1 p=pow(x,AF1_(0.0126833)); // PQ->linear: p=x^0.0126833, then a rational function of p raised to 6.27739.
+   return pow(ASatF1(p-AF1_(0.835938))/(AF1_(18.8516)-AF1_(18.6875)*p),AF1_(6.27739));}
+  AF2 AFromPqF2(AF2 x){AF2 p=pow(x,AF2_(0.0126833)); // Per-lane form, named to match its AF2 argument.
+   return pow(ASatF2(p-AF2_(0.835938))/(AF2_(18.8516)-AF2_(18.6875)*p),AF2_(6.27739));} AF2 AFromPqF1(AF2 x){return AFromPqF2(x);} // 'F1'-suffixed overload kept for existing callers.
+  AF3 AFromPqF3(AF3 x){AF3 p=pow(x,AF3_(0.0126833)); // Per-lane form, named to match its AF3 argument.
+   return pow(ASatF3(p-AF3_(0.835938))/(AF3_(18.8516)-AF3_(18.6875)*p),AF3_(6.27739));} AF3 AFromPqF1(AF3 x){return AFromPqF3(x);} // 'F1'-suffixed overload kept for existing callers.
+//------------------------------------------------------------------------------------------------------------------------------
+  // Unfortunately median won't work here.
+  AF1 AFromSrgbF1(AF1 c){AF3 j=AF3(0.04045,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055); // sRGB->linear. j.x is the knee in the encoded domain, since c is encoded.
+   return AZolSelF1(AZolSignedF1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));} // Below the knee: c/12.92; otherwise ((c+0.055)/1.055)^2.4.
+  AF2 AFromSrgbF2(AF2 c){AF3 j=AF3(0.04045,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055); // Per-lane form, encoded-domain knee.
+   return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
+  AF3 AFromSrgbF3(AF3 c){AF3 j=AF3(0.04045,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055); // Per-lane form, encoded-domain knee.
+   return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AFromTwoF1(AF1 c){AF1 sq=c*c;return sq;} // Gamma 2.0 decode is a square.
+  AF2 AFromTwoF2(AF2 c){AF2 sq=c*c;return sq;}
+  AF3 AFromTwoF3(AF3 c){AF3 sq=c*c;return sq;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 AFromThreeF1(AF1 c){AF1 sq=c*c;return sq*c;} // Gamma 3.0 decode is a cube, evaluated as (c*c)*c.
+  AF2 AFromThreeF2(AF2 c){AF2 sq=c*c;return sq*c;}
+  AF3 AFromThreeF3(AF3 c){AF3 sq=c*c;return sq*c;}
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+  AH1 ATo709H1(AH1 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);AH1 l=c*j.y;AH1 p=pow(c,j.z)*k.x+k.y; // Linear->Rec.709: l is the linear segment, p the power curve.
+   return max(min(l,p),min(max(l,p),j.x));} // Explicit median of {knee j.x, l, p}; clamp(j.x,l,p) is undefined below the knee, where l>p.
+  AH2 ATo709H2(AH2 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);AH2 l=c*j.yy;AH2 p=pow(c,j.zz)*k.xx+k.yy; // Per-lane form.
+   return max(min(l,p),min(max(l,p),j.xx));} // Per-lane median of {knee, l, p}.
+  AH3 ATo709H3(AH3 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);AH3 l=c*j.yyy;AH3 p=pow(c,j.zzz)*k.xxx+k.yyy; // Per-lane form.
+   return max(min(l,p),min(max(l,p),j.xxx));} // Per-lane median of {knee, l, p}.
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AToGammaH1(AH1 c,AH1 rcpX){return pow(c,AH1_(rcpX));} // c raised to rcpX.
+  AH2 AToGammaH2(AH2 c,AH1 rcpX){return pow(c,AH2_(rcpX));} // Scalar rcpX applied to both lanes.
+  AH3 AToGammaH3(AH3 c,AH1 rcpX){return pow(c,AH3_(rcpX));} // Scalar rcpX applied to all three lanes.
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AToSrgbH1(AH1 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);AH1 l=c*j.y;AH1 p=pow(c,j.z)*k.x+k.y; // Linear->sRGB: l is the linear segment, p the power curve.
+   return max(min(l,p),min(max(l,p),j.x));} // Explicit median of {knee j.x, l, p}; clamp(j.x,l,p) is undefined below the knee, where l>p.
+  AH2 AToSrgbH2(AH2 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);AH2 l=c*j.yy;AH2 p=pow(c,j.zz)*k.xx+k.yy; // Per-lane form.
+   return max(min(l,p),min(max(l,p),j.xx));} // Per-lane median of {knee, l, p}.
+  AH3 AToSrgbH3(AH3 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);AH3 l=c*j.yyy;AH3 p=pow(c,j.zzz)*k.xxx+k.yyy; // Per-lane form.
+   return max(min(l,p),min(max(l,p),j.xxx));} // Per-lane median of {knee, l, p}.
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AToTwoH1(AH1 c){AH1 r=sqrt(c);return r;} // Returns sqrt(c) (component-wise for the H2/H3 forms).
+  AH2 AToTwoH2(AH2 c){AH2 r=sqrt(c);return r;}
+  AH3 AToTwoH3(AH3 c){AH3 r=sqrt(c);return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AToThreeH1(AH1 c){return pow(c,AH1_(1.0/3.0));} AH1 AToThreeF1(AH1 c){return AToThreeH1(c);} // Cube root; H-suffixed name matches AFromThreeH*.
+  AH2 AToThreeH2(AH2 c){return pow(c,AH2_(1.0/3.0));} AH2 AToThreeF2(AH2 c){return AToThreeH2(c);} // F-suffixed half overloads are kept only
+  AH3 AToThreeH3(AH3 c){return pow(c,AH3_(1.0/3.0));} AH3 AToThreeF3(AH3 c){return AToThreeH3(c);} // as aliases for existing callers.
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+  AH1 AFrom709H1(AH1 c){AH3 j=AH3(0.081,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099); // j.x: knee in the ENCODED domain.
+   return AZolSelH1(AZolSignedH1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));} // Selects c/4.5 or the power segment from sign(c-knee).
+  AH2 AFrom709H2(AH2 c){AH3 j=AH3(0.081,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099); // Was 0.081/4.5 (=0.018), the linear-domain
+   return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} // knee, which is the wrong domain for the encoded input c.
+  AH3 AFrom709H3(AH3 c){AH3 j=AH3(0.081,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
+   return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AFromGammaH1(AH1 c,AH1 x){AH1 e=AH1_(x);return pow(c,e);} // Returns pow(c,x); the exponent goes through AH*_() first.
+  AH2 AFromGammaH2(AH2 c,AH1 x){AH2 e=AH2_(x);return pow(c,e);}
+  AH3 AFromGammaH3(AH3 c,AH1 x){AH3 e=AH3_(x);return pow(c,e);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AFromSrgbH1(AH1 c){AH3 j=AH3(0.04045,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055); // j.x: knee in the ENCODED domain (was 0.04045/12.92).
+   return AZolSelH1(AZolSignedH1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));} AH1 AHromSrgbF1(AH1 c){return AFromSrgbH1(c);}
+  AH2 AFromSrgbH2(AH2 c){AH3 j=AH3(0.04045,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055); // AHromSrgbF* are the original misspelt names,
+   return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));} AH2 AHromSrgbF2(AH2 c){return AFromSrgbH2(c);}
+  AH3 AFromSrgbH3(AH3 c){AH3 j=AH3(0.04045,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055); // kept as forwarding aliases for existing callers.
+   return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));} AH3 AHromSrgbF3(AH3 c){return AFromSrgbH3(c);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AFromTwoH1(AH1 c){AH1 sq=c*c;return sq;} // Returns c squared (component-wise for the H2/H3 forms).
+  AH2 AFromTwoH2(AH2 c){AH2 sq=c*c;return sq;}
+  AH3 AFromTwoH3(AH3 c){AH3 sq=c*c;return sq;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AFromThreeH1(AH1 c){AH1 sq=c*c;return sq*c;} // Returns c cubed, evaluated as (c*c)*c.
+  AH2 AFromThreeH2(AH2 c){AH2 sq=c*c;return sq*c;}
+  AH3 AFromThreeH3(AH3 c){AH3 sq=c*c;return sq*c;}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                          CS REMAP
+//==============================================================================================================================
+ // Simple remap 64x1 to 8x8 with rotated 2x2 pixel quads in quad linear.
+ //  543210
+ //  ======
+ //  ..xxx.
+ //  yy...y
+ AU2 ARmp8x8(AU1 a){AU1 x=ABfe(a,1u,3u);AU1 y=ABfiM(ABfe(a,3u,3u),a,1u);return AU2(x,y);} // x,y built per the bit diagram above.
+//==============================================================================================================================
+ // More complex remap 64x1 to 8x8 which is necessary for 2D wave reductions.
+ //  543210
+ //  ======
+ //  .xx..x
+ //  y..yy.
+ // Details,
+ //  LANE TO 8x8 MAPPING
+ //  ===================
+ //  00 01 08 09 10 11 18 19 
+ //  02 03 0a 0b 12 13 1a 1b
+ //  04 05 0c 0d 14 15 1c 1d
+ //  06 07 0e 0f 16 17 1e 1f 
+ //  20 21 28 29 30 31 38 39 
+ //  22 23 2a 2b 32 33 3a 3b
+ //  24 25 2c 2d 34 35 3c 3d
+ //  26 27 2e 2f 36 37 3e 3f 
+ AU2 ARmpRed8x8(AU1 a){AU1 x=ABfiM(ABfe(a,2u,3u),a,1u);AU1 y=ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u);return AU2(x,y);} // See lane table above.
+//==============================================================================================================================
+ #ifdef A_HALF
+  AW2 ARmp8x8H(AU1 a){AU1 x=ABfe(a,1u,3u);AU1 y=ABfiM(ABfe(a,3u,3u),a,1u);return AW2(x,y);} // Same bit fields as ARmp8x8, returned as AW2.
+  AW2 ARmpRed8x8H(AU1 a){AU1 x=ABfiM(ABfe(a,2u,3u),a,1u);AU1 y=ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u);return AW2(x,y);} // As ARmpRed8x8, as AW2.
+ #endif
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                                          REFERENCE
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// IEEE FLOAT RULES
+// ================
+//  - saturate(NaN)=0, saturate(-INF)=0, saturate(+INF)=1
+//  - {+/-}0 * {+/-}INF = NaN
+//  - -INF + (+INF) = NaN
+//  - {+/-}0 / {+/-}0 = NaN
+//  - {+/-}INF / {+/-}INF = NaN
+//  - a<(-0) := sqrt(a) = NaN (a=-0.0 won't NaN)
+//  - 0 == -0
+//  - 4/0 = +INF
+//  - 4/-0 = -INF
+//  - 4+INF = +INF
+//  - 4-INF = -INF
+//  - 4*(+INF) = +INF
+//  - 4*(-INF) = -INF
+//  - -4*(+INF) = -INF
+//  - sqrt(+INF) = +INF
+//------------------------------------------------------------------------------------------------------------------------------
+// FP16 ENCODING
+// =============
+// fedcba9876543210
+// ----------------
+// ......mmmmmmmmmm  10-bit mantissa (encodes 11-bit 0.5 to 1.0 except for denormals)
+// .eeeee..........  5-bit exponent
+// .00000..........  denormals
+// .00001..........  -14 exponent
+// .11110..........   15 exponent
+// .111110000000000  infinity
+// .11111nnnnnnnnnn  NaN with n!=0
+// s...............  sign
+//------------------------------------------------------------------------------------------------------------------------------
+// FP16/INT16 ALIASING DENORMAL
+// ============================
+// 11-bit unsigned integers alias with half float denormal/normal values,
+//     1 = 2^(-24) = 1/16777216 ....................... first denormal value
+//     2 = 2^(-23)
+//   ...
+//  1023 = 2^(-14)*(1-2^(-10)) = 2^(-14)*(1-1/1024) ... last denormal value
+//  1024 = 2^(-14) = 1/16384 .......................... first normal value that still maps to integers
+//  2047 .............................................. last normal value that still maps to integers 
+// Scaling limits,
+//  2^15 = 32768 ...................................... largest power of 2 scaling
+// Largest pow2 conversion mapping is at *32768,
+//     1 : 2^(-9) = 1/512
+//     2 : 1/256
+//     4 : 1/128
+//     8 : 1/64
+//    16 : 1/32
+//    32 : 1/16
+//    64 : 1/8
+//   128 : 1/4
+//   256 : 1/2
+//   512 : 1
+//  1024 : 2
+//  2047 : a little less than 4
+//==============================================================================================================================
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+//                                                     GPU/CPU PORTABILITY
+//
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// This is the GPU implementation.
+// See the CPU implementation for docs.
+//==============================================================================================================================
+#ifdef A_GPU
+ #define A_TRUE true // GPU-path spellings: boolean literals map to the shader keywords; A_STATIC expands to nothing.
+ #define A_FALSE false
+ #define A_STATIC
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                     VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
+//==============================================================================================================================
+ #define retAD2 AD2 // ret<T>: return-type spelling; on the GPU path it is just the vector type itself.
+ #define retAD3 AD3
+ #define retAD4 AD4
+ #define retAF2 AF2
+ #define retAF3 AF3
+ #define retAF4 AF4
+ #define retAL2 AL2
+ #define retAL3 AL3
+ #define retAL4 AL4
+ #define retAU2 AU2
+ #define retAU3 AU3
+ #define retAU4 AU4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define inAD2 in AD2 // in<T>: parameter spelling that expands to the 'in' qualifier followed by the vector type.
+ #define inAD3 in AD3
+ #define inAD4 in AD4
+ #define inAF2 in AF2
+ #define inAF3 in AF3
+ #define inAF4 in AF4
+ #define inAL2 in AL2
+ #define inAL3 in AL3
+ #define inAL4 in AL4
+ #define inAU2 in AU2
+ #define inAU3 in AU3
+ #define inAU4 in AU4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define inoutAD2 inout AD2 // inout<T>: parameter spelling that expands to the 'inout' qualifier followed by the vector type.
+ #define inoutAD3 inout AD3
+ #define inoutAD4 inout AD4
+ #define inoutAF2 inout AF2
+ #define inoutAF3 inout AF3
+ #define inoutAF4 inout AF4
+ #define inoutAL2 inout AL2
+ #define inoutAL3 inout AL3
+ #define inoutAL4 inout AL4
+ #define inoutAU2 inout AU2
+ #define inoutAU3 inout AU3
+ #define inoutAU4 inout AU4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define outAD2 out AD2 // out<T>: parameter spelling that expands to the 'out' qualifier followed by the vector type.
+ #define outAD3 out AD3
+ #define outAD4 out AD4
+ #define outAF2 out AF2
+ #define outAF3 out AF3
+ #define outAF4 out AF4
+ #define outAL2 out AL2
+ #define outAL3 out AL3
+ #define outAL4 out AL4
+ #define outAU2 out AU2
+ #define outAU3 out AU3
+ #define outAU4 out AU4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define varAD2(x) AD2 x // var<T>(x): declares a variable named x of the given vector type.
+ #define varAD3(x) AD3 x
+ #define varAD4(x) AD4 x
+ #define varAF2(x) AF2 x
+ #define varAF3(x) AF3 x
+ #define varAF4(x) AF4 x
+ #define varAL2(x) AL2 x
+ #define varAL3(x) AL3 x
+ #define varAL4(x) AL4 x
+ #define varAU2(x) AU2 x
+ #define varAU3(x) AU3 x
+ #define varAU4(x) AU4 x
+//------------------------------------------------------------------------------------------------------------------------------
+ #define initAD2(x,y) AD2(x,y) // init<T>(...): initializer spelling; expands to the vector constructor with the same arguments.
+ #define initAD3(x,y,z) AD3(x,y,z)
+ #define initAD4(x,y,z,w) AD4(x,y,z,w)
+ #define initAF2(x,y) AF2(x,y)
+ #define initAF3(x,y,z) AF3(x,y,z)
+ #define initAF4(x,y,z,w) AF4(x,y,z,w)
+ #define initAL2(x,y) AL2(x,y)
+ #define initAL3(x,y,z) AL3(x,y,z)
+ #define initAL4(x,y,z,w) AL4(x,y,z,w)
+ #define initAU2(x,y) AU2(x,y)
+ #define initAU3(x,y,z) AU3(x,y,z)
+ #define initAU4(x,y,z,w) AU4(x,y,z,w)
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     SCALAR RETURN OPS
+//==============================================================================================================================
+ #define AAbsD1(a) abs(AD1(a)) // Scalar wrappers: each casts its argument(s) to the named type, then calls the built-in.
+ #define AAbsF1(a) abs(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ACosD1(a) cos(AD1(a))
+ #define ACosF1(a) cos(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ADotD2(a,b) dot(AD2(a),AD2(b))
+ #define ADotD3(a,b) dot(AD3(a),AD3(b))
+ #define ADotD4(a,b) dot(AD4(a),AD4(b))
+ #define ADotF2(a,b) dot(AF2(a),AF2(b))
+ #define ADotF3(a,b) dot(AF3(a),AF3(b))
+ #define ADotF4(a,b) dot(AF4(a),AF4(b))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AExp2D1(a) exp2(AD1(a))
+ #define AExp2F1(a) exp2(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AFloorD1(a) floor(AD1(a))
+ #define AFloorF1(a) floor(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ALog2D1(a) log2(AD1(a))
+ #define ALog2F1(a) log2(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AMaxD1(a,b) max(a,b) // Unlike the other wrappers, the max/min forms pass their arguments through without a cast.
+ #define AMaxF1(a,b) max(a,b)
+ #define AMaxL1(a,b) max(a,b)
+ #define AMaxU1(a,b) max(a,b)
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AMinD1(a,b) min(a,b)
+ #define AMinF1(a,b) min(a,b)
+ #define AMinL1(a,b) min(a,b)
+ #define AMinU1(a,b) min(a,b)
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ASinD1(a) sin(AD1(a))
+ #define ASinF1(a) sin(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ASqrtD1(a) sqrt(AD1(a))
+ #define ASqrtF1(a) sqrt(AF1(a))
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                               SCALAR RETURN OPS - DEPENDENT
+//==============================================================================================================================
+ #define APowD1(a,b) pow(AD1(a),AF1(b)) // NOTE(review): exponent is cast to AF1 even in the D1 form - confirm this mix is intended.
+ #define APowF1(a,b) pow(AF1(a),AF1(b))
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                         VECTOR OPS
+//------------------------------------------------------------------------------------------------------------------------------
+// These are added as needed for production or prototyping, so not necessarily a complete set.
+// They follow a convention of taking in a destination and also returning the destination value to increase utility.
+//==============================================================================================================================
+ #ifdef A_DUBL
+  AD2 opAAbsD2(outAD2 d,inAD2 a){AD2 r=abs(a);d=r;return r;} // d=abs(a); the stored value is also returned.
+  AD3 opAAbsD3(outAD3 d,inAD3 a){AD3 r=abs(a);d=r;return r;}
+  AD4 opAAbsD4(outAD4 d,inAD4 a){AD4 r=abs(a);d=r;return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){AD2 r=a+b;d=r;return r;} // d=a+b; the stored value is also returned.
+  AD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){AD3 r=a+b;d=r;return r;}
+  AD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){AD4 r=a+b;d=r;return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){AD2 r=a+AD2_(b);d=r;return r;} // d=a+AD*_(b) for a scalar b; also returned.
+  AD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){AD3 r=a+AD3_(b);d=r;return r;}
+  AD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){AD4 r=a+AD4_(b);d=r;return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD2 opACpyD2(outAD2 d,inAD2 a){AD2 r=a;d=r;return r;} // d=a; the copied value is also returned.
+  AD3 opACpyD3(outAD3 d,inAD3 a){AD3 r=a;d=r;return r;}
+  AD4 opACpyD4(outAD4 d,inAD4 a){AD4 r=a;d=r;return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){AD2 r=ALerpD2(a,b,c);d=r;return r;} // d=ALerpD*(a,b,c); also returned.
+  AD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){AD3 r=ALerpD3(a,b,c);d=r;return r;}
+  AD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){AD4 r=ALerpD4(a,b,c);d=r;return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){AD2 r=ALerpD2(a,b,AD2_(c));d=r;return r;} // d=ALerpD*(a,b,AD*_(c)), scalar c.
+  AD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){AD3 r=ALerpD3(a,b,AD3_(c));d=r;return r;}
+  AD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){AD4 r=ALerpD4(a,b,AD4_(c));d=r;return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){AD2 r=max(a,b);d=r;return r;} // d=max(a,b); the stored value is also returned.
+  AD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){AD3 r=max(a,b);d=r;return r;}
+  AD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){AD4 r=max(a,b);d=r;return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){AD2 r=min(a,b);d=r;return r;} // d=min(a,b); the stored value is also returned.
+  AD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){AD3 r=min(a,b);d=r;return r;}
+  AD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){AD4 r=min(a,b);d=r;return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){AD2 r=a*b;d=r;return r;} // d=a*b (component-wise); also returned.
+  AD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){AD3 r=a*b;d=r;return r;}
+  AD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){AD4 r=a*b;d=r;return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){AD2 r=a*AD2_(b);d=r;return r;} // d=a*AD*_(b) for a scalar b; also returned.
+  AD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){AD3 r=a*AD3_(b);d=r;return r;}
+  AD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){AD4 r=a*AD4_(b);d=r;return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD2 opANegD2(outAD2 d,inAD2 a){AD2 r=-a;d=r;return r;} // d=-a; the stored value is also returned.
+  AD3 opANegD3(outAD3 d,inAD3 a){AD3 r=-a;d=r;return r;}
+  AD4 opANegD4(outAD4 d,inAD4 a){AD4 r=-a;d=r;return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD2 opARcpD2(outAD2 d,inAD2 a){AD2 r=ARcpD2(a);d=r;return r;} // d=ARcpD*(a); the stored value is also returned.
+  AD3 opARcpD3(outAD3 d,inAD3 a){AD3 r=ARcpD3(a);d=r;return r;}
+  AD4 opARcpD4(outAD4 d,inAD4 a){AD4 r=ARcpD4(a);d=r;return r;}
+ #endif
+//==============================================================================================================================
+ AF2 opAAbsF2(outAF2 d,inAF2 a){AF2 r=abs(a);d=r;return r;} // d=abs(a); the stored value is also returned.
+ AF3 opAAbsF3(outAF3 d,inAF3 a){AF3 r=abs(a);d=r;return r;}
+ AF4 opAAbsF4(outAF4 d,inAF4 a){AF4 r=abs(a);d=r;return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){AF2 r=a+b;d=r;return r;} // d=a+b; the stored value is also returned.
+ AF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){AF3 r=a+b;d=r;return r;}
+ AF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){AF4 r=a+b;d=r;return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){AF2 r=a+AF2_(b);d=r;return r;} // d=a+AF*_(b) for a scalar b; also returned.
+ AF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){AF3 r=a+AF3_(b);d=r;return r;}
+ AF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){AF4 r=a+AF4_(b);d=r;return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 opACpyF2(outAF2 d,inAF2 a){AF2 r=a;d=r;return r;} // d=a; the copied value is also returned.
+ AF3 opACpyF3(outAF3 d,inAF3 a){AF3 r=a;d=r;return r;}
+ AF4 opACpyF4(outAF4 d,inAF4 a){AF4 r=a;d=r;return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){AF2 r=ALerpF2(a,b,c);d=r;return r;} // d=ALerpF*(a,b,c); also returned.
+ AF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){AF3 r=ALerpF3(a,b,c);d=r;return r;}
+ AF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){AF4 r=ALerpF4(a,b,c);d=r;return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){AF2 r=ALerpF2(a,b,AF2_(c));d=r;return r;} // d=ALerpF*(a,b,AF*_(c)), scalar c.
+ AF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){AF3 r=ALerpF3(a,b,AF3_(c));d=r;return r;}
+ AF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){AF4 r=ALerpF4(a,b,AF4_(c));d=r;return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){AF2 r=max(a,b);d=r;return r;} // d=max(a,b); the stored value is also returned.
+ AF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){AF3 r=max(a,b);d=r;return r;}
+ AF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){AF4 r=max(a,b);d=r;return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){AF2 r=min(a,b);d=r;return r;} // d=min(a,b); the stored value is also returned.
+ AF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){AF3 r=min(a,b);d=r;return r;}
+ AF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){AF4 r=min(a,b);d=r;return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){AF2 r=a*b;d=r;return r;} // d=a*b (component-wise); also returned.
+ AF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){AF3 r=a*b;d=r;return r;}
+ AF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){AF4 r=a*b;d=r;return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){AF2 r=a*AF2_(b);d=r;return r;} // d=a*AF*_(b) for a scalar b; also returned.
+ AF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){AF3 r=a*AF3_(b);d=r;return r;}
+ AF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){AF4 r=a*AF4_(b);d=r;return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 opANegF2(outAF2 d,inAF2 a){AF2 r=-a;d=r;return r;} // d=-a; the stored value is also returned.
+ AF3 opANegF3(outAF3 d,inAF3 a){AF3 r=-a;d=r;return r;}
+ AF4 opANegF4(outAF4 d,inAF4 a){AF4 r=-a;d=r;return r;}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 opARcpF2(outAF2 d,inAF2 a){AF2 r=ARcpF2(a);d=r;return r;} // d=ARcpF*(a); the stored value is also returned.
+ AF3 opARcpF3(outAF3 d,inAF3 a){AF3 r=ARcpF3(a);d=r;return r;}
+ AF4 opARcpF4(outAF4 d,inAF4 a){AF4 r=ARcpF4(a);d=r;return r;}
+#endif
+
+
+#define FSR_RCAS_F 1 // Set before the FSR header text that follows; presumably enables its 32-bit (*F) RCAS code - confirm.
+AU4 con0; // Shader-global AU4; presumably holds the RCAS constants filled in by the entry point - TODO confirm.
+
+AF4 FsrRcasLoadF(ASU2 p) { AF4 texel = AF4(texelFetch(source, p, 0)); return texel; } // Fetches texel p of 'source' at LOD 0.
+void FsrRcasInputF(inout AF1 r, inout AF1 g, inout AF1 b) {} // Intentionally empty: r, g and b are left unchanged.
+
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+//                    AMD FidelityFX SUPER RESOLUTION [FSR 1] ::: SPATIAL SCALING & EXTRAS - v1.20210629
+//
+//
+//------------------------------------------------------------------------------------------------------------------------------
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//------------------------------------------------------------------------------------------------------------------------------
+// FidelityFX Super Resolution Sample
+//
+// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files(the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions :
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//------------------------------------------------------------------------------------------------------------------------------
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//------------------------------------------------------------------------------------------------------------------------------
+// ABOUT
+// =====
+// FSR is a collection of algorithms relating to generating a higher resolution image.
+// This specific header focuses on single-image non-temporal image scaling, and related tools.
+// 
+// The core functions are EASU and RCAS:
+//  [EASU] Edge Adaptive Spatial Upsampling ....... 1x to 4x area range spatial scaling, clamped adaptive elliptical filter.
+//  [RCAS] Robust Contrast Adaptive Sharpening .... A non-scaling variation on CAS.
+// RCAS needs to be applied after EASU as a separate pass.
+// 
+// Optional utility functions are:
+//  [LFGA] Linear Film Grain Applicator ........... Tool to apply film grain after scaling.
+//  [SRTM] Simple Reversible Tone-Mapper .......... Linear HDR {0 to FP16_MAX} to {0 to 1} and back.
+//  [TEPD] Temporal Energy Preserving Dither ...... Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion.
+// See each individual sub-section for inline documentation.
+//------------------------------------------------------------------------------------------------------------------------------
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//------------------------------------------------------------------------------------------------------------------------------
+// FUNCTION PERMUTATIONS
+// =====================
+// *F() ..... Single item computation with 32-bit.
+// *H() ..... Single item computation with 16-bit, with packing (aka two 16-bit ops in parallel) when possible.
+// *Hx2() ... Processing two items in parallel with 16-bit, easier packing.
+//            Not all interfaces in this file have a *Hx2() form.
+//==============================================================================================================================
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                        FSR - [EASU] EDGE ADAPTIVE SPATIAL UPSAMPLING
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// EASU provides a high quality spatial-only scaling at relatively low cost.
+// Meaning EASU is appropriate for laptops and other low-end GPUs.
+// Quality from 1x to 4x area scaling is good.
+//------------------------------------------------------------------------------------------------------------------------------
+// The scaler uses a modified fast approximation to the standard lanczos(size=2) kernel.
+// EASU runs in a single pass, so it applies a directionally and anisotropically adaptive radial lanczos.
+// This is also kept as simple as possible to have minimum runtime.
+//------------------------------------------------------------------------------------------------------------------------------
+// The lanczos filter has negative lobes, so by itself it will introduce ringing.
+// To remove all ringing, the algorithm uses the nearest 2x2 input texels as a neighborhood,
+// and limits output to the minimum and maximum of that neighborhood.
+//------------------------------------------------------------------------------------------------------------------------------
+// Input image requirements:
+// 
+// Color needs to be encoded as 3 channels [red, green, blue] (e.g. XYZ is not supported)
+// Each channel needs to be in the range [0, 1]
+// Any color primaries are supported
+// Display / tonemapping curve needs to be as if presenting to an sRGB display or similar (e.g. gamma 2.0)
+// There should be no banding in the input
+// There should be no high amplitude noise in the input
+// There should be no noise in the input that is not at input pixel granularity
+// For performance purposes, use 32bpp formats
+//------------------------------------------------------------------------------------------------------------------------------
+// Best to apply EASU at the end of the frame after tonemapping 
+// but before film grain or composite of the UI.
+//------------------------------------------------------------------------------------------------------------------------------
+// Example of including this header for D3D HLSL :
+// 
+//  #define A_GPU 1
+//  #define A_HLSL 1
+//  #define A_HALF 1
+//  #include "ffx_a.h"
+//  #define FSR_EASU_H 1
+//  #define FSR_RCAS_H 1
+//  //declare input callbacks
+//  #include "ffx_fsr1.h"
+// 
+// Example of including this header for Vulkan GLSL :
+// 
+//  #define A_GPU 1
+//  #define A_GLSL 1
+//  #define A_HALF 1
+//  #include "ffx_a.h"
+//  #define FSR_EASU_H 1
+//  #define FSR_RCAS_H 1
+//  //declare input callbacks
+//  #include "ffx_fsr1.h"
+// 
+// Example of including this header for Vulkan HLSL :
+// 
+//  #define A_GPU 1
+//  #define A_HLSL 1
+//  #define A_HLSL_6_2 1
+//  #define A_NO_16_BIT_CAST 1
+//  #define A_HALF 1
+//  #include "ffx_a.h"
+//  #define FSR_EASU_H 1
+//  #define FSR_RCAS_H 1
+//  //declare input callbacks
+//  #include "ffx_fsr1.h"
+// 
+//  Example of declaring the required input callbacks for GLSL :
+//  The callbacks need to gather4 for each color channel using the specified texture coordinate 'p'.
+//  EASU uses gather4 to reduce position computation logic and for free Arrays of Structures to Structures of Arrays conversion.
+// 
+//  AH4 FsrEasuRH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,0));}
+//  AH4 FsrEasuGH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,1));}
+//  AH4 FsrEasuBH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,2));}
+//  ...
+//  The FsrEasuCon function needs to be called from the CPU or GPU to set up constants.
+//  The difference in viewport and input image size is there to support Dynamic Resolution Scaling.
+//  To use FsrEasuCon() on the CPU, define A_CPU before including ffx_a and ffx_fsr1.
+//  Including a GPU example here, the 'con0' through 'con3' values would be stored out to a constant buffer.
+//  AU4 con0,con1,con2,con3;
+//  FsrEasuCon(con0,con1,con2,con3,
+//    1920.0,1080.0,  // Viewport size (top left aligned) in the input image which is to be scaled.
+//    3840.0,2160.0,  // The size of the input image.
+//    2560.0,1440.0); // The output resolution.
+//==============================================================================================================================
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                      CONSTANT SETUP
+//==============================================================================================================================
+// Builds the four EASU constant vectors read by FsrEasuF()/FsrEasuH(); each float is stored through AU1_AF1() (works on CPU or GPU).
+A_STATIC void FsrEasuCon(
+outAU4 con0, // Out: {viewport/output scale x,y ; 0.5*scale-0.5 offset x,y}.
+outAU4 con1, // Out: {1/inputSize x,y ; offset from 'F' to gather center (0)}.
+outAU4 con2, // Out: {offset from (0) to (1) ; offset from (0) to (2)}.
+outAU4 con3, // Out: {offset from (0) to (3) ; two zeroed lanes}.
+// This is the rendered image resolution being upscaled
+AF1 inputViewportInPixelsX,
+AF1 inputViewportInPixelsY,
+// This is the resolution of the resource containing the input image (useful for dynamic resolution)
+AF1 inputSizeInPixelsX,
+AF1 inputSizeInPixelsY,
+// This is the display resolution which the input image gets upscaled to
+AF1 outputSizeInPixelsX,
+AF1 outputSizeInPixelsY){
+ // Output integer position to a pixel position in viewport (scale in xy, offset in zw).
+ con0[0]=AU1_AF1(inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX));
+ con0[1]=AU1_AF1(inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY));
+ con0[2]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)-AF1_(0.5));
+ con0[3]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY)-AF1_(0.5));
+ // Viewport pixel position to normalized image space (one input texel in x and y).
+ // This is used to get upper-left of 'F' tap.
+ con1[0]=AU1_AF1(ARcpF1(inputSizeInPixelsX));
+ con1[1]=AU1_AF1(ARcpF1(inputSizeInPixelsY));
+ // Centers of gather4, first offset from upper-left of 'F'; (0)..(3) mark the four gather centers.
+ //      +---+---+
+ //      |   |   |
+ //      +--(0)--+
+ //      | b | c |
+ //  +---F---+---+---+
+ //  | e | f | g | h |
+ //  +--(1)--+--(2)--+
+ //  | i | j | k | l |
+ //  +---+---+---+---+
+ //      | n | o |
+ //      +--(3)--+
+ //      |   |   |
+ //      +---+---+
+ con1[2]=AU1_AF1(AF1_( 1.0)*ARcpF1(inputSizeInPixelsX));
+ con1[3]=AU1_AF1(AF1_(-1.0)*ARcpF1(inputSizeInPixelsY));
+ // These are from (0) instead of 'F': con2.xy -> (1), con2.zw -> (2), con3.xy -> (3).
+ con2[0]=AU1_AF1(AF1_(-1.0)*ARcpF1(inputSizeInPixelsX));
+ con2[1]=AU1_AF1(AF1_( 2.0)*ARcpF1(inputSizeInPixelsY));
+ con2[2]=AU1_AF1(AF1_( 1.0)*ARcpF1(inputSizeInPixelsX));
+ con2[3]=AU1_AF1(AF1_( 2.0)*ARcpF1(inputSizeInPixelsY));
+ con3[0]=AU1_AF1(AF1_( 0.0)*ARcpF1(inputSizeInPixelsX));
+ con3[1]=AU1_AF1(AF1_( 4.0)*ARcpF1(inputSizeInPixelsY));
+ con3[2]=con3[3]=0;} // Remaining two lanes are not read by FsrEasuF()/FsrEasuH(); zero them.
+
+// Variant of FsrEasuCon() for an input image that starts at a pixel offset inside the input image resource.
+A_STATIC void FsrEasuConOffset(
+    outAU4 con0, // Out: same layout as FsrEasuCon(), with the offset added to lanes 2 and 3.
+    outAU4 con1, // Out: filled by FsrEasuCon() unchanged.
+    outAU4 con2, // Out: filled by FsrEasuCon() unchanged.
+    outAU4 con3, // Out: filled by FsrEasuCon() unchanged.
+    // This is the rendered image resolution being upscaled
+    AF1 inputViewportInPixelsX,
+    AF1 inputViewportInPixelsY,
+    // This is the resolution of the resource containing the input image (useful for dynamic resolution)
+    AF1 inputSizeInPixelsX,
+    AF1 inputSizeInPixelsY,
+    // This is the display resolution which the input image gets upscaled to
+    AF1 outputSizeInPixelsX,
+    AF1 outputSizeInPixelsY,
+    // This is the input image offset into the resource containing it (useful for dynamic resolution)
+    AF1 inputOffsetInPixelsX,
+    AF1 inputOffsetInPixelsY) {
+    FsrEasuCon(con0, con1, con2, con3, inputViewportInPixelsX, inputViewportInPixelsY, inputSizeInPixelsX, inputSizeInPixelsY, outputSizeInPixelsX, outputSizeInPixelsY);
+    con0[2] = AU1_AF1(AF1_(0.5) * inputViewportInPixelsX * ARcpF1(outputSizeInPixelsX) - AF1_(0.5) + inputOffsetInPixelsX); // Recompute the x offset with the pixel offset folded in.
+    con0[3] = AU1_AF1(AF1_(0.5) * inputViewportInPixelsY * ARcpF1(outputSizeInPixelsY) - AF1_(0.5) + inputOffsetInPixelsY); // Same for y.
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                   NON-PACKED 32-BIT VERSION
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(FSR_EASU_F)
+ // Input callback prototypes, need to be implemented by calling shader
+ AF4 FsrEasuRF(AF2 p);
+ AF4 FsrEasuGF(AF2 p);
+ AF4 FsrEasuBF(AF2 p);
+//------------------------------------------------------------------------------------------------------------------------------
+ // Filtering for a given tap for the scaler: computes the tap's kernel weight and accumulates weighted color and weight.
+ void FsrEasuTapF(
+ inout AF3 aC, // Accumulated color, with negative lobe.
+ inout AF1 aW, // Accumulated weight.
+ AF2 off, // Pixel offset from resolve position to tap.
+ AF2 dir, // Gradient direction.
+ AF2 len, // Length: per-axis scale applied to the rotated offset.
+ AF1 lob, // Negative lobe strength (coefficient of the window term below).
+ AF1 clp, // Clipping point: upper bound applied to distance^2.
+ AF3 c){ // Tap color.
+  // Rotate offset by direction.
+  AF2 v;
+  v.x=(off.x*( dir.x))+(off.y*dir.y);
+  v.y=(off.x*(-dir.y))+(off.y*dir.x);
+  // Anisotropy.
+  v*=len;
+  // Compute distance^2.
+  AF1 d2=v.x*v.x+v.y*v.y;
+  // Limit to the window as at corner, 2 taps can easily be outside.
+  d2=min(d2,clp);
+  // Approximation of lanczos2 without sin() or rcp(), or sqrt() to get x.
+  //  (25/16 * (2/5 * x^2 - 1)^2 - (25/16 - 1)) * (1/4 * x^2 - 1)^2
+  //  |_______________________________________|   |_______________|
+  //                   base                             window
+  // The general form of the 'base' is,
+  //  (a*(b*x^2-1)^2-(a-1))
+  // Where 'a=1/(2*b-b^2)' and 'b' moves around the negative lobe.
+  AF1 wB=AF1_(2.0/5.0)*d2+AF1_(-1.0); // Inner part of 'base', with x^2 = d2.
+  AF1 wA=lob*d2+AF1_(-1.0); // Inner part of 'window', using 'lob' where the formula above shows 1/4.
+  wB*=wB;
+  wA*=wA;
+  wB=AF1_(25.0/16.0)*wB+AF1_(-(25.0/16.0-1.0));
+  AF1 w=wB*wA;
+  // Accumulate weighted color and weight; FsrEasuF() divides aC by aW at the end.
+  aC+=c*w;aW+=w;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Accumulate direction and length for one of the four texels around the resolve position, weighted bilinearly.
+ void FsrEasuSetF(
+ inout AF2 dir, // Accumulated gradient direction.
+ inout AF1 len, // Accumulated edge length term.
+ AF2 pp, // Fractional position used for the bilinear weights.
+ AP1 biS,AP1 biT,AP1 biU,AP1 biV, // Selects which corner weight (s,t,u,v) this call uses; FsrEasuF() sets exactly one.
+ AF1 lA,AF1 lB,AF1 lC,AF1 lD,AF1 lE){ // Lumas of the '+' pattern a,b,c,d,e shown below.
+  // Compute bilinear weight, branches factor out as predicates are compile-time immediates.
+  //  s t
+  //  u v
+  AF1 w = AF1_(0.0);
+  if(biS)w=(AF1_(1.0)-pp.x)*(AF1_(1.0)-pp.y);
+  if(biT)w=           pp.x *(AF1_(1.0)-pp.y);
+  if(biU)w=(AF1_(1.0)-pp.x)*           pp.y ;
+  if(biV)w=           pp.x *           pp.y ;
+  // Direction is the '+' diff.
+  //    a
+  //  b c d
+  //    e
+  // Then takes magnitude from the larger absolute difference on either side of 'c'.
+  // Length converts gradient reversal to 0, smoothly to non-reversal at 1, shaped, then adding horz and vert terms.
+  AF1 dc=lD-lC;
+  AF1 cb=lC-lB;
+  AF1 lenX=max(abs(dc),abs(cb));
+  lenX=APrxLoRcpF1(lenX);
+  AF1 dirX=lD-lB;
+  dir.x+=dirX*w;
+  lenX=ASatF1(abs(dirX)*lenX);
+  lenX*=lenX;
+  len+=lenX*w;
+  // Repeat for the y axis.
+  AF1 ec=lE-lC;
+  AF1 ca=lC-lA;
+  AF1 lenY=max(abs(ec),abs(ca));
+  lenY=APrxLoRcpF1(lenY);
+  AF1 dirY=lE-lA;
+  dir.y+=dirY*w;
+  lenY=ASatF1(abs(dirY)*lenY);
+  lenY*=lenY;
+  len+=lenY*w;}
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrEasuF( // 32-bit EASU entry point: resolves one output pixel from a 12-tap input neighborhood.
+ out AF3 pix, // Resulting color for output pixel 'ip'.
+ AU2 ip, // Integer pixel position in output.
+ AU4 con0, // Constants generated by FsrEasuCon().
+ AU4 con1,
+ AU4 con2,
+ AU4 con3){
+//------------------------------------------------------------------------------------------------------------------------------
+  // Get position of 'f': 'fp' is the integer texel, 'pp' becomes the fractional part.
+  AF2 pp=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw);
+  AF2 fp=floor(pp);
+  pp-=fp;
+//------------------------------------------------------------------------------------------------------------------------------
+  // 12-tap kernel.
+  //    b c
+  //  e f g h
+  //  i j k l
+  //    n o
+  // Gather 4 ordering.
+  //  a b
+  //  r g
+  // For packed FP16, need either {rg} or {ab} so using the following setup for gather in all versions,
+  //    a b    <- unused (z)
+  //    r g
+  //  a b a b
+  //  r g r g
+  //    a b
+  //    r g    <- unused (z)
+  // Allowing dead-code removal to remove the 'z's.
+  AF2 p0=fp*AF2_AU2(con1.xy)+AF2_AU2(con1.zw);
+  // These are from p0 to avoid pulling two constants on pre-Navi hardware.
+  AF2 p1=p0+AF2_AU2(con2.xy);
+  AF2 p2=p0+AF2_AU2(con2.zw);
+  AF2 p3=p0+AF2_AU2(con3.xy);
+  AF4 bczzR=FsrEasuRF(p0);
+  AF4 bczzG=FsrEasuGF(p0);
+  AF4 bczzB=FsrEasuBF(p0);
+  AF4 ijfeR=FsrEasuRF(p1);
+  AF4 ijfeG=FsrEasuGF(p1);
+  AF4 ijfeB=FsrEasuBF(p1);
+  AF4 klhgR=FsrEasuRF(p2);
+  AF4 klhgG=FsrEasuGF(p2);
+  AF4 klhgB=FsrEasuBF(p2);
+  AF4 zzonR=FsrEasuRF(p3);
+  AF4 zzonG=FsrEasuGF(p3);
+  AF4 zzonB=FsrEasuBF(p3);
+//------------------------------------------------------------------------------------------------------------------------------
+  // Simplest multi-channel approximate luma possible (luma times 2, in 2 FMA/MAD): 0.5*B + 0.5*R + G.
+  AF4 bczzL=bczzB*AF4_(0.5)+(bczzR*AF4_(0.5)+bczzG);
+  AF4 ijfeL=ijfeB*AF4_(0.5)+(ijfeR*AF4_(0.5)+ijfeG);
+  AF4 klhgL=klhgB*AF4_(0.5)+(klhgR*AF4_(0.5)+klhgG);
+  AF4 zzonL=zzonB*AF4_(0.5)+(zzonR*AF4_(0.5)+zzonG);
+  // Rename: one scalar luma per tap letter, following the lane order in the gather variable names.
+  AF1 bL=bczzL.x;
+  AF1 cL=bczzL.y;
+  AF1 iL=ijfeL.x;
+  AF1 jL=ijfeL.y;
+  AF1 fL=ijfeL.z;
+  AF1 eL=ijfeL.w;
+  AF1 kL=klhgL.x;
+  AF1 lL=klhgL.y;
+  AF1 hL=klhgL.z;
+  AF1 gL=klhgL.w;
+  AF1 oL=zzonL.z;
+  AF1 nL=zzonL.w;
+  // Accumulate for bilinear interpolation: one call per inner texel (f, g, j, k as the '+' center).
+  AF2 dir=AF2_(0.0);
+  AF1 len=AF1_(0.0);
+  FsrEasuSetF(dir,len,pp,true, false,false,false,bL,eL,fL,gL,jL);
+  FsrEasuSetF(dir,len,pp,false,true ,false,false,cL,fL,gL,hL,kL);
+  FsrEasuSetF(dir,len,pp,false,false,true ,false,fL,iL,jL,kL,nL);
+  FsrEasuSetF(dir,len,pp,false,false,false,true ,gL,jL,kL,lL,oL);
+//------------------------------------------------------------------------------------------------------------------------------
+  // Normalize with approximation, and cleanup close to zero.
+  AF2 dir2=dir*dir;
+  AF1 dirR=dir2.x+dir2.y;
+  AP1 zro=dirR<AF1_(1.0/32768.0); // True when the squared gradient length is below the threshold.
+  dirR=APrxLoRsqF1(dirR);
+  dirR=zro?AF1_(1.0):dirR; // Near-zero gradient: skip the normalization scale...
+  dir.x=zro?AF1_(1.0):dir.x; // ...and force dir.x to 1.
+  dir*=AF2_(dirR);
+  // Transform from {0 to 2} to {0 to 1} range, and shape with square.
+  len=len*AF1_(0.5);
+  len*=len;
+  // Stretch kernel {1.0 vert|horz, to sqrt(2.0) on diagonal}.
+  AF1 stretch=(dir.x*dir.x+dir.y*dir.y)*APrxLoRcpF1(max(abs(dir.x),abs(dir.y)));
+  // Anisotropic length after rotation (these scale the tap offset in FsrEasuTapF()),
+  //  x := 1.0 lerp to 'stretch' on edges
+  //  y := 1.0 lerp to 0.5 on edges (offset scale of 0.5; presumably the '2x' kernel length the original note meant)
+  AF2 len2=AF2(AF1_(1.0)+(stretch-AF1_(1.0))*len,AF1_(1.0)+AF1_(-0.5)*len);
+  // Based on the amount of 'edge',
+  // the window shifts from +/-{sqrt(2.0) to slightly beyond 2.0}.
+  AF1 lob=AF1_(0.5)+AF1_((1.0/4.0-0.04)-0.5)*len;
+  // Set distance^2 clipping point to the end of the adjustable window.
+  AF1 clp=APrxLoRcpF1(lob);
+//------------------------------------------------------------------------------------------------------------------------------
+  // Accumulation mixed with min/max of 4 nearest (f, g, j, k), used below to clamp the result.
+  //    b c
+  //  e f g h
+  //  i j k l
+  //    n o
+  AF3 min4=min(AMin3F3(AF3(ijfeR.z,ijfeG.z,ijfeB.z),AF3(klhgR.w,klhgG.w,klhgB.w),AF3(ijfeR.y,ijfeG.y,ijfeB.y)),
+               AF3(klhgR.x,klhgG.x,klhgB.x));
+  AF3 max4=max(AMax3F3(AF3(ijfeR.z,ijfeG.z,ijfeB.z),AF3(klhgR.w,klhgG.w,klhgB.w),AF3(ijfeR.y,ijfeG.y,ijfeB.y)),
+               AF3(klhgR.x,klhgG.x,klhgB.x));
+  // Accumulation: one FsrEasuTapF() call per tap, offsets are relative to 'f'.
+  AF3 aC=AF3_(0.0);
+  AF1 aW=AF1_(0.0);
+  FsrEasuTapF(aC,aW,AF2( 0.0,-1.0)-pp,dir,len2,lob,clp,AF3(bczzR.x,bczzG.x,bczzB.x)); // b
+  FsrEasuTapF(aC,aW,AF2( 1.0,-1.0)-pp,dir,len2,lob,clp,AF3(bczzR.y,bczzG.y,bczzB.y)); // c
+  FsrEasuTapF(aC,aW,AF2(-1.0, 1.0)-pp,dir,len2,lob,clp,AF3(ijfeR.x,ijfeG.x,ijfeB.x)); // i
+  FsrEasuTapF(aC,aW,AF2( 0.0, 1.0)-pp,dir,len2,lob,clp,AF3(ijfeR.y,ijfeG.y,ijfeB.y)); // j
+  FsrEasuTapF(aC,aW,AF2( 0.0, 0.0)-pp,dir,len2,lob,clp,AF3(ijfeR.z,ijfeG.z,ijfeB.z)); // f
+  FsrEasuTapF(aC,aW,AF2(-1.0, 0.0)-pp,dir,len2,lob,clp,AF3(ijfeR.w,ijfeG.w,ijfeB.w)); // e
+  FsrEasuTapF(aC,aW,AF2( 1.0, 1.0)-pp,dir,len2,lob,clp,AF3(klhgR.x,klhgG.x,klhgB.x)); // k
+  FsrEasuTapF(aC,aW,AF2( 2.0, 1.0)-pp,dir,len2,lob,clp,AF3(klhgR.y,klhgG.y,klhgB.y)); // l
+  FsrEasuTapF(aC,aW,AF2( 2.0, 0.0)-pp,dir,len2,lob,clp,AF3(klhgR.z,klhgG.z,klhgB.z)); // h
+  FsrEasuTapF(aC,aW,AF2( 1.0, 0.0)-pp,dir,len2,lob,clp,AF3(klhgR.w,klhgG.w,klhgB.w)); // g
+  FsrEasuTapF(aC,aW,AF2( 1.0, 2.0)-pp,dir,len2,lob,clp,AF3(zzonR.z,zzonG.z,zzonB.z)); // o
+  FsrEasuTapF(aC,aW,AF2( 0.0, 2.0)-pp,dir,len2,lob,clp,AF3(zzonR.w,zzonG.w,zzonB.w)); // n
+//------------------------------------------------------------------------------------------------------------------------------
+  // Normalize and dering: divide by the total weight, then clamp to the min/max of the 4 nearest texels.
+  pix=min(max4,max(min4,aC*AF3_(ARcpF1(aW))));}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                    PACKED 16-BIT VERSION
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_EASU_H)
+// Input callback prototypes, need to be implemented by calling shader
+ AH4 FsrEasuRH(AF2 p);
+ AH4 FsrEasuGH(AF2 p);
+ AH4 FsrEasuBH(AF2 p);
+//------------------------------------------------------------------------------------------------------------------------------
+ // This runs 2 taps in parallel: packed counterpart of FsrEasuTapF(), one tap per lane of each AH2.
+ void FsrEasuTapH(
+ inout AH2 aCR,inout AH2 aCG,inout AH2 aCB, // Accumulated red/green/blue, one partial sum per lane.
+ inout AH2 aW, // Accumulated weight, one partial sum per lane.
+ AH2 offX,AH2 offY, // X and Y offsets of the two taps.
+ AH2 dir, // Gradient direction (shared by both taps).
+ AH2 len, // Per-axis scale applied to the rotated offsets.
+ AH1 lob, // Negative lobe strength.
+ AH1 clp, // Upper bound applied to distance^2.
+ AH2 cR,AH2 cG,AH2 cB){ // Red/green/blue of the two taps.
+  AH2 vX,vY; // Rotated offsets, x and y components for both taps.
+  vX=offX*  dir.xx +offY*dir.yy;
+  vY=offX*(-dir.yy)+offY*dir.xx;
+  vX*=len.x;vY*=len.y;
+  AH2 d2=vX*vX+vY*vY; // Distance^2 per tap.
+  d2=min(d2,AH2_(clp));
+  AH2 wB=AH2_(2.0/5.0)*d2+AH2_(-1.0); // Same weight computation as FsrEasuTapF(), on both lanes.
+  AH2 wA=AH2_(lob)*d2+AH2_(-1.0);
+  wB*=wB;
+  wA*=wA;
+  wB=AH2_(25.0/16.0)*wB+AH2_(-(25.0/16.0-1.0));
+  AH2 w=wB*wA;
+  aCR+=cR*w;aCG+=cG*w;aCB+=cB*w;aW+=w;} // Lanes stay separate here; FsrEasuH() adds them together.
+//------------------------------------------------------------------------------------------------------------------------------
+ // This runs 2 taps in parallel: packed counterpart of FsrEasuSetF(), handling one row (s,t or u,v) per call.
+ void FsrEasuSetH(
+ inout AH2 dirPX,inout AH2 dirPY, // Accumulated direction x and y, one partial sum per lane.
+ inout AH2 lenP, // Accumulated length term, one partial sum per lane.
+ AH2 pp, // Fractional position used for the bilinear weights.
+ AP1 biST,AP1 biUV, // Selects the top (s,t) or bottom (u,v) pair of weights.
+ AH2 lA,AH2 lB,AH2 lC,AH2 lD,AH2 lE){ // Lumas of the '+' pattern a,b,c,d,e for both texels.
+  AH2 w = AH2_(0.0);
+  if(biST)w=(AH2(1.0,0.0)+AH2(-pp.x,pp.x))*AH2_(AH1_(1.0)-pp.y); // Lanes are {1-pp.x, pp.x} times (1-pp.y).
+  if(biUV)w=(AH2(1.0,0.0)+AH2(-pp.x,pp.x))*AH2_(          pp.y); // Lanes are {1-pp.x, pp.x} times pp.y.
+  // ABS is not free in the packed FP16 path.
+  AH2 dc=lD-lC;
+  AH2 cb=lC-lB;
+  AH2 lenX=max(abs(dc),abs(cb));
+  lenX=ARcpH2(lenX);
+  AH2 dirX=lD-lB;
+  dirPX+=dirX*w;
+  lenX=ASatH2(abs(dirX)*lenX);
+  lenX*=lenX;
+  lenP+=lenX*w;
+  AH2 ec=lE-lC; // Same steps for the y axis.
+  AH2 ca=lC-lA;
+  AH2 lenY=max(abs(ec),abs(ca));
+  lenY=ARcpH2(lenY);
+  AH2 dirY=lE-lA;
+  dirPY+=dirY*w;
+  lenY=ASatH2(abs(dirY)*lenY);
+  lenY*=lenY;
+  lenP+=lenY*w;}
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrEasuH( // Packed 16-bit EASU entry point; same steps as FsrEasuF() with taps processed in pairs.
+ out AH3 pix, // Resulting color for output pixel 'ip'.
+ AU2 ip, // Integer pixel position in output.
+ AU4 con0, // Constants generated by FsrEasuCon().
+ AU4 con1,
+ AU4 con2,
+ AU4 con3){
+//------------------------------------------------------------------------------------------------------------------------------
+  AF2 pp=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw); // Position math stays in 32-bit.
+  AF2 fp=floor(pp);
+  pp-=fp;
+  AH2 ppp=AH2(pp); // Fractional position converted to the 16-bit type.
+//------------------------------------------------------------------------------------------------------------------------------
+  AF2 p0=fp*AF2_AU2(con1.xy)+AF2_AU2(con1.zw); // Four gather positions, as in FsrEasuF().
+  AF2 p1=p0+AF2_AU2(con2.xy);
+  AF2 p2=p0+AF2_AU2(con2.zw);
+  AF2 p3=p0+AF2_AU2(con3.xy);
+  AH4 bczzR=FsrEasuRH(p0);
+  AH4 bczzG=FsrEasuGH(p0);
+  AH4 bczzB=FsrEasuBH(p0);
+  AH4 ijfeR=FsrEasuRH(p1);
+  AH4 ijfeG=FsrEasuGH(p1);
+  AH4 ijfeB=FsrEasuBH(p1);
+  AH4 klhgR=FsrEasuRH(p2);
+  AH4 klhgG=FsrEasuGH(p2);
+  AH4 klhgB=FsrEasuBH(p2);
+  AH4 zzonR=FsrEasuRH(p3);
+  AH4 zzonG=FsrEasuGH(p3);
+  AH4 zzonB=FsrEasuBH(p3);
+//------------------------------------------------------------------------------------------------------------------------------
+  AH4 bczzL=bczzB*AH4_(0.5)+(bczzR*AH4_(0.5)+bczzG); // Approximate luma times 2: 0.5*B + 0.5*R + G.
+  AH4 ijfeL=ijfeB*AH4_(0.5)+(ijfeR*AH4_(0.5)+ijfeG);
+  AH4 klhgL=klhgB*AH4_(0.5)+(klhgR*AH4_(0.5)+klhgG);
+  AH4 zzonL=zzonB*AH4_(0.5)+(zzonR*AH4_(0.5)+zzonG);
+  AH1 bL=bczzL.x; // One scalar luma per tap letter.
+  AH1 cL=bczzL.y;
+  AH1 iL=ijfeL.x;
+  AH1 jL=ijfeL.y;
+  AH1 fL=ijfeL.z;
+  AH1 eL=ijfeL.w;
+  AH1 kL=klhgL.x;
+  AH1 lL=klhgL.y;
+  AH1 hL=klhgL.z;
+  AH1 gL=klhgL.w;
+  AH1 oL=zzonL.z;
+  AH1 nL=zzonL.w;
+  // This part is different, accumulating 2 taps in parallel (first call: centers f,g; second call: centers j,k).
+  AH2 dirPX=AH2_(0.0);
+  AH2 dirPY=AH2_(0.0);
+  AH2 lenP=AH2_(0.0);
+  FsrEasuSetH(dirPX,dirPY,lenP,ppp,true, false,AH2(bL,cL),AH2(eL,fL),AH2(fL,gL),AH2(gL,hL),AH2(jL,kL));
+  FsrEasuSetH(dirPX,dirPY,lenP,ppp,false,true ,AH2(fL,gL),AH2(iL,jL),AH2(jL,kL),AH2(kL,lL),AH2(nL,oL));
+  AH2 dir=AH2(dirPX.r+dirPX.g,dirPY.r+dirPY.g); // Sum the two lanes into a single direction.
+  AH1 len=lenP.r+lenP.g; // Sum the two lanes into a single length.
+//------------------------------------------------------------------------------------------------------------------------------
+  AH2 dir2=dir*dir; // Normalization and kernel shaping, same steps as FsrEasuF().
+  AH1 dirR=dir2.x+dir2.y;
+  AP1 zro=dirR<AH1_(1.0/32768.0);
+  dirR=APrxLoRsqH1(dirR);
+  dirR=zro?AH1_(1.0):dirR;
+  dir.x=zro?AH1_(1.0):dir.x;
+  dir*=AH2_(dirR);
+  len=len*AH1_(0.5);
+  len*=len;
+  AH1 stretch=(dir.x*dir.x+dir.y*dir.y)*APrxLoRcpH1(max(abs(dir.x),abs(dir.y)));
+  AH2 len2=AH2(AH1_(1.0)+(stretch-AH1_(1.0))*len,AH1_(1.0)+AH1_(-0.5)*len);
+  AH1 lob=AH1_(0.5)+AH1_((1.0/4.0-0.04)-0.5)*len;
+  AH1 clp=APrxLoRcpH1(lob);
+//------------------------------------------------------------------------------------------------------------------------------
+  // FP16 is different, using packed trick to do min and max in same operation: lane x holds -min, lane y holds max of f,g,j,k.
+  AH2 bothR=max(max(AH2(-ijfeR.z,ijfeR.z),AH2(-klhgR.w,klhgR.w)),max(AH2(-ijfeR.y,ijfeR.y),AH2(-klhgR.x,klhgR.x)));
+  AH2 bothG=max(max(AH2(-ijfeG.z,ijfeG.z),AH2(-klhgG.w,klhgG.w)),max(AH2(-ijfeG.y,ijfeG.y),AH2(-klhgG.x,klhgG.x)));
+  AH2 bothB=max(max(AH2(-ijfeB.z,ijfeB.z),AH2(-klhgB.w,klhgB.w)),max(AH2(-ijfeB.y,ijfeB.y),AH2(-klhgB.x,klhgB.x)));
+  // This part is different for FP16, working pairs of taps at a time.
+  AH2 pR=AH2_(0.0);
+  AH2 pG=AH2_(0.0);
+  AH2 pB=AH2_(0.0);
+  AH2 pW=AH2_(0.0);
+  FsrEasuTapH(pR,pG,pB,pW,AH2( 0.0, 1.0)-ppp.xx,AH2(-1.0,-1.0)-ppp.yy,dir,len2,lob,clp,bczzR.xy,bczzG.xy,bczzB.xy); // b,c
+  FsrEasuTapH(pR,pG,pB,pW,AH2(-1.0, 0.0)-ppp.xx,AH2( 1.0, 1.0)-ppp.yy,dir,len2,lob,clp,ijfeR.xy,ijfeG.xy,ijfeB.xy); // i,j
+  FsrEasuTapH(pR,pG,pB,pW,AH2( 0.0,-1.0)-ppp.xx,AH2( 0.0, 0.0)-ppp.yy,dir,len2,lob,clp,ijfeR.zw,ijfeG.zw,ijfeB.zw); // f,e
+  FsrEasuTapH(pR,pG,pB,pW,AH2( 1.0, 2.0)-ppp.xx,AH2( 1.0, 1.0)-ppp.yy,dir,len2,lob,clp,klhgR.xy,klhgG.xy,klhgB.xy); // k,l
+  FsrEasuTapH(pR,pG,pB,pW,AH2( 2.0, 1.0)-ppp.xx,AH2( 0.0, 0.0)-ppp.yy,dir,len2,lob,clp,klhgR.zw,klhgG.zw,klhgB.zw); // h,g
+  FsrEasuTapH(pR,pG,pB,pW,AH2( 1.0, 0.0)-ppp.xx,AH2( 2.0, 2.0)-ppp.yy,dir,len2,lob,clp,zzonR.zw,zzonG.zw,zzonB.zw); // o,n
+  AH3 aC=AH3(pR.x+pR.y,pG.x+pG.y,pB.x+pB.y); // Combine the two lanes of each channel.
+  AH1 aW=pW.x+pW.y;
+//------------------------------------------------------------------------------------------------------------------------------
+  // Slightly different for FP16 version due to combined min and max: lane x is negated back to recover the minimum.
+  pix=min(AH3(bothR.y,bothG.y,bothB.y),max(-AH3(bothR.x,bothG.x,bothB.x),aC*AH3_(ARcpH1(aW))));}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                      FSR - [RCAS] ROBUST CONTRAST ADAPTIVE SHARPENING
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// CAS uses a simplified mechanism to convert local contrast into a variable amount of sharpness.
+// RCAS uses a more exact mechanism, solving for the maximum local sharpness possible before clipping.
+// RCAS also has a built in process to limit sharpening of what it detects as possible noise.
+// The RCAS sharpener does not support scaling, as it should be applied after EASU scaling.
+// Pass EASU output straight into RCAS, no color conversions necessary.
+//------------------------------------------------------------------------------------------------------------------------------
+// RCAS is based on the following logic.
+// RCAS uses a 5 tap filter in a cross pattern (same as CAS),
+//    w                n
+//  w 1 w  for taps  w m e 
+//    w                s
+// Where 'w' is the negative lobe weight.
+//  output = (w*(n+e+w+s)+m)/(4*w+1)
+// RCAS solves for 'w' by seeing where the signal might clip out of the {0 to 1} input range,
+//  0 == (w*(n+e+w+s)+m)/(4*w+1) -> w = -m/(n+e+w+s)
+//  1 == (w*(n+e+w+s)+m)/(4*w+1) -> w = (1-m)/(n+e+w+s-4*1)
+// Then chooses the 'w' which results in no clipping, limits 'w', and multiplies by the 'sharp' amount.
+// This solution above has issues with MSAA input as the steps along the gradient cause edge detection issues.
+// So RCAS uses 4x the maximum and 4x the minimum (depending on equation) in place of the individual taps.
+// As well as switching from 'm' to either the minimum or maximum (depending on side), to help in energy conservation.
+// This stabilizes RCAS.
+// RCAS does a simple highpass which is normalized against the local contrast then shaped,
+//       0.25
+//  0.25  -1  0.25
+//       0.25
+// This is used as a noise detection filter, to reduce the effect of RCAS on grain, and focus on real edges.
+//
+//  GLSL example for the required callbacks :
+// 
+//  AH4 FsrRcasLoadH(ASW2 p){return AH4(imageLoad(imgSrc,ASU2(p)));}
+//  void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b)
+//  {
+//    //do any simple input color conversions here or leave empty if none needed
+//  }
+//  
+//  FsrRcasCon needs to be called from the CPU or GPU to set up constants.
+//  Including a GPU example here, the 'con' value would be stored out to a constant buffer.
+// 
+//  AU4 con;
+//  FsrRcasCon(con,
+//   0.0); // The scale is {0.0 := maximum sharpness, to N>0, where N is the number of stops (halving) of the reduction of sharpness}.
+// ---------------
+// RCAS sharpening supports a CAS-like pass-through alpha via,
+//  #define FSR_RCAS_PASSTHROUGH_ALPHA 1
+// RCAS also supports a define to enable a more expensive path to avoid some sharpening of noise.
+// Would suggest it is better to apply film grain after RCAS sharpening (and after scaling) instead of using this define,
+//  #define FSR_RCAS_DENOISE 1
+//==============================================================================================================================
+// This is set at the limit of providing unnatural results for sharpening.
+#define FSR_RCAS_LIMIT (0.25-(1.0/16.0))
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                      CONSTANT SETUP
+//==============================================================================================================================
+// Builds the RCAS constant vector from a sharpness value given in stops (works on CPU or GPU).
+A_STATIC void FsrRcasCon(
+outAU4 con, // Out: {linear sharpness via AU1_AF1, the same value twice via AU1_AH2_AF2, 0, 0}.
+// The scale is {0.0 := maximum, to N>0, where N is the number of stops (halving) of the reduction of sharpness}.
+AF1 sharpness){
+ // Transform from stops to linear value.
+ sharpness=AExp2F1(-sharpness);
+ varAF2(hSharp)=initAF2(sharpness,sharpness);
+ con[0]=AU1_AF1(sharpness); // Read back by FsrRcasF() through AF1_AU1(con.x).
+ con[1]=AU1_AH2_AF2(hSharp); // Presumably the packed half pair for the 16-bit path; its reader is not visible here.
+ con[2]=0;
+ con[3]=0;}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                   NON-PACKED 32-BIT VERSION
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(FSR_RCAS_F)
+ // Input callback prototypes that need to be implemented by calling shader
+ AF4 FsrRcasLoadF(ASU2 p);
+ void FsrRcasInputF(inout AF1 r,inout AF1 g,inout AF1 b);
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrRcasF( // 32-bit RCAS entry point: sharpens one pixel from a 5-tap cross of loaded neighbors.
+ out AF1 pixR, // Output values, non-vector so porting between FsrRcasF() and FsrRcasH() is easy.
+ out AF1 pixG,
+ out AF1 pixB,
+ #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
+  out AF1 pixA,
+ #endif
+ AU2 ip, // Integer pixel position in output.
+ AU4 con){ // Constant generated by FsrRcasCon(); only con.x is read here.
+  // Algorithm reads the 5-tap cross of the 3x3 pixel neighborhood.
+  //    b 
+  //  d e f
+  //    h
+  ASU2 sp=ASU2(ip);
+  AF3 b=FsrRcasLoadF(sp+ASU2( 0,-1)).rgb;
+  AF3 d=FsrRcasLoadF(sp+ASU2(-1, 0)).rgb;
+  #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
+   AF4 ee=FsrRcasLoadF(sp);
+   AF3 e=ee.rgb;pixA=ee.a; // Alpha of the center tap is passed through unchanged.
+  #else
+   AF3 e=FsrRcasLoadF(sp).rgb;
+  #endif
+  AF3 f=FsrRcasLoadF(sp+ASU2( 1, 0)).rgb;
+  AF3 h=FsrRcasLoadF(sp+ASU2( 0, 1)).rgb;
+  // Rename (32-bit) or regroup (16-bit).
+  AF1 bR=b.r;
+  AF1 bG=b.g;
+  AF1 bB=b.b;
+  AF1 dR=d.r;
+  AF1 dG=d.g;
+  AF1 dB=d.b;
+  AF1 eR=e.r;
+  AF1 eG=e.g;
+  AF1 eB=e.b;
+  AF1 fR=f.r;
+  AF1 fG=f.g;
+  AF1 fB=f.b;
+  AF1 hR=h.r;
+  AF1 hG=h.g;
+  AF1 hB=h.b;
+  // Run optional input transform (callback supplied by the including shader).
+  FsrRcasInputF(bR,bG,bB);
+  FsrRcasInputF(dR,dG,dB);
+  FsrRcasInputF(eR,eG,eB);
+  FsrRcasInputF(fR,fG,fB);
+  FsrRcasInputF(hR,hG,hB);
+  // Luma times 2: 0.5*B + 0.5*R + G.
+  AF1 bL=bB*AF1_(0.5)+(bR*AF1_(0.5)+bG);
+  AF1 dL=dB*AF1_(0.5)+(dR*AF1_(0.5)+dG);
+  AF1 eL=eB*AF1_(0.5)+(eR*AF1_(0.5)+eG);
+  AF1 fL=fB*AF1_(0.5)+(fR*AF1_(0.5)+fG);
+  AF1 hL=hB*AF1_(0.5)+(hR*AF1_(0.5)+hG);
+  // Noise detection: highpass of luma, normalized by the local luma range; only applied under FSR_RCAS_DENOISE.
+  AF1 nz=AF1_(0.25)*bL+AF1_(0.25)*dL+AF1_(0.25)*fL+AF1_(0.25)*hL-eL;
+  nz=ASatF1(abs(nz)*APrxMedRcpF1(AMax3F1(AMax3F1(bL,dL,eL),fL,hL)-AMin3F1(AMin3F1(bL,dL,eL),fL,hL)));
+  nz=AF1_(-0.5)*nz+AF1_(1.0);
+  // Min and max of ring (the four outer taps b, d, f, h), per channel.
+  AF1 mn4R=min(AMin3F1(bR,dR,fR),hR);
+  AF1 mn4G=min(AMin3F1(bG,dG,fG),hG);
+  AF1 mn4B=min(AMin3F1(bB,dB,fB),hB);
+  AF1 mx4R=max(AMax3F1(bR,dR,fR),hR);
+  AF1 mx4G=max(AMax3F1(bG,dG,fG),hG);
+  AF1 mx4B=max(AMax3F1(bB,dB,fB),hB);
+  // Immediate constants for peak range: x is the upper limit 1.0, y is -4.0 times that limit.
+  AF2 peakC=AF2(1.0,-1.0*4.0);
+  // Limiters, these need to be high precision RCPs. NOTE(review): 4*mx4 is 0 when the whole ring is 0 - confirm ARcpF1() behaviour there.
+  AF1 hitMinR=min(mn4R,eR)*ARcpF1(AF1_(4.0)*mx4R);
+  AF1 hitMinG=min(mn4G,eG)*ARcpF1(AF1_(4.0)*mx4G);
+  AF1 hitMinB=min(mn4B,eB)*ARcpF1(AF1_(4.0)*mx4B);
+  AF1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpF1(AF1_(4.0)*mn4R+peakC.y);
+  AF1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpF1(AF1_(4.0)*mn4G+peakC.y);
+  AF1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpF1(AF1_(4.0)*mn4B+peakC.y);
+  AF1 lobeR=max(-hitMinR,hitMaxR);
+  AF1 lobeG=max(-hitMinG,hitMaxG);
+  AF1 lobeB=max(-hitMinB,hitMaxB);
+  AF1 lobe=max(AF1_(-FSR_RCAS_LIMIT),min(AMax3F1(lobeR,lobeG,lobeB),AF1_(0.0)))*AF1_AU1(con.x); // Clamp to [-FSR_RCAS_LIMIT, 0], then scale by sharpness.
+  // Apply noise removal.
+  #ifdef FSR_RCAS_DENOISE
+   lobe*=nz;
+  #endif
+  // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
+  AF1 rcpL=APrxMedRcpF1(AF1_(4.0)*lobe+AF1_(1.0));
+  pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL;
+  pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL;
+  pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;
+  return;} 
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                  NON-PACKED 16-BIT VERSION
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_H)
+ // Input callbacks that the including shader must define; FsrRcasH() below calls both of them.
+ AH4 FsrRcasLoadH(ASW2 p); // Fetches the input texel at integer position 'p' (rgb is filtered; a is only read for alpha passthrough).
+ void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b); // Optional in-place transform that FsrRcasH() applies to every loaded tap.
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrRcasH( // Non-packed 16-bit RCAS: computes one sharpened output pixel from a 5-tap cross of the input.
+ out AH1 pixR, // Output values, non-vector so port between RcasFilter() and RcasFilterH() is easy.
+ out AH1 pixG,
+ out AH1 pixB,
+ #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
+  out AH1 pixA, // Receives the alpha of the center tap unchanged (parameter only exists with FSR_RCAS_PASSTHROUGH_ALPHA).
+ #endif
+ AU2 ip, // Integer pixel position in output.
+ AU4 con){ // Constants from the RCAS setup function (FsrRcasCon() in this file); only con.y is read here.
+  // Sharpening algorithm uses a minimal cross-shaped subset of the 3x3 pixel neighborhood around 'ip':
+  //    b
+  //  d e f
+  //    h
+  ASW2 sp=ASW2(ip);
+  AH3 b=FsrRcasLoadH(sp+ASW2( 0,-1)).rgb;
+  AH3 d=FsrRcasLoadH(sp+ASW2(-1, 0)).rgb;
+  #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
+   AH4 ee=FsrRcasLoadH(sp);
+   AH3 e=ee.rgb;pixA=ee.a;
+  #else
+   AH3 e=FsrRcasLoadH(sp).rgb;
+  #endif
+  AH3 f=FsrRcasLoadH(sp+ASW2( 1, 0)).rgb;
+  AH3 h=FsrRcasLoadH(sp+ASW2( 0, 1)).rgb;
+  // Rename (32-bit) or regroup (16-bit): split every tap into scalar channels used by the math below.
+  AH1 bR=b.r;
+  AH1 bG=b.g;
+  AH1 bB=b.b;
+  AH1 dR=d.r;
+  AH1 dG=d.g;
+  AH1 dB=d.b;
+  AH1 eR=e.r;
+  AH1 eG=e.g;
+  AH1 eB=e.b;
+  AH1 fR=f.r;
+  AH1 fG=f.g;
+  AH1 fB=f.b;
+  AH1 hR=h.r;
+  AH1 hG=h.g;
+  AH1 hB=h.b;
+  // Run optional input transform (callback supplied by the including shader) on all five taps.
+  FsrRcasInputH(bR,bG,bB);
+  FsrRcasInputH(dR,dG,dB);
+  FsrRcasInputH(eR,eG,eB);
+  FsrRcasInputH(fR,fG,fB);
+  FsrRcasInputH(hR,hG,hB);
+  // Luma times 2, computed as 0.5*B + (0.5*R + G).
+  AH1 bL=bB*AH1_(0.5)+(bR*AH1_(0.5)+bG);
+  AH1 dL=dB*AH1_(0.5)+(dR*AH1_(0.5)+dG);
+  AH1 eL=eB*AH1_(0.5)+(eR*AH1_(0.5)+eG);
+  AH1 fL=fB*AH1_(0.5)+(fR*AH1_(0.5)+fG);
+  AH1 hL=hB*AH1_(0.5)+(hR*AH1_(0.5)+hG);
+  // Noise detection: (average luma of the 4 ring taps - center luma), relative to the luma range of all 5 taps.
+  AH1 nz=AH1_(0.25)*bL+AH1_(0.25)*dL+AH1_(0.25)*fL+AH1_(0.25)*hL-eL;
+  nz=ASatH1(abs(nz)*APrxMedRcpH1(AMax3H1(AMax3H1(bL,dL,eL),fL,hL)-AMin3H1(AMin3H1(bL,dL,eL),fL,hL)));
+  nz=AH1_(-0.5)*nz+AH1_(1.0); // Remap to 1.0-0.5*nz; only consumed when FSR_RCAS_DENOISE is defined.
+  // Min and max of ring (the 4 outer taps b,d,f,h per channel; the center 'e' is folded in by the limiters).
+  AH1 mn4R=min(AMin3H1(bR,dR,fR),hR);
+  AH1 mn4G=min(AMin3H1(bG,dG,fG),hG);
+  AH1 mn4B=min(AMin3H1(bB,dB,fB),hB);
+  AH1 mx4R=max(AMax3H1(bR,dR,fR),hR);
+  AH1 mx4G=max(AMax3H1(bG,dG,fG),hG);
+  AH1 mx4B=max(AMax3H1(bB,dB,fB),hB);
+  // Immediate constants for peak range: x=1.0 is subtracted from below, y=-4.0 is added to 4*min.
+  AH2 peakC=AH2(1.0,-1.0*4.0);
+  // Limiters, these need to be high precision RCPs. hitMin=min/(4*max), hitMax=(1-max)/(4*min-4), per channel.
+  AH1 hitMinR=min(mn4R,eR)*ARcpH1(AH1_(4.0)*mx4R);
+  AH1 hitMinG=min(mn4G,eG)*ARcpH1(AH1_(4.0)*mx4G);
+  AH1 hitMinB=min(mn4B,eB)*ARcpH1(AH1_(4.0)*mx4B);
+  AH1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH1(AH1_(4.0)*mn4R+peakC.y);
+  AH1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH1(AH1_(4.0)*mn4G+peakC.y);
+  AH1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH1(AH1_(4.0)*mn4B+peakC.y);
+  AH1 lobeR=max(-hitMinR,hitMaxR); // Per-channel lobe candidate: the larger of the two limiter values.
+  AH1 lobeG=max(-hitMinG,hitMaxG);
+  AH1 lobeB=max(-hitMinB,hitMaxB);
+  AH1 lobe=max(AH1_(-FSR_RCAS_LIMIT),min(AMax3H1(lobeR,lobeG,lobeB),AH1_(0.0)))*AH2_AU1(con.y).x; // Clamp to {-FSR_RCAS_LIMIT to 0}, then scale by the value taken from con.y.
+  // Apply noise removal: scale the lobe by the remapped noise estimate.
+  #ifdef FSR_RCAS_DENOISE
+   lobe*=nz;
+  #endif
+  // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
+  AH1 rcpL=APrxMedRcpH1(AH1_(4.0)*lobe+AH1_(1.0));
+  pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; // Ring taps weighted by 'lobe', center by 1, normalized by 4*lobe+1.
+  pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL;
+  pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     PACKED 16-BIT VERSION
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_HX2)
+ // Input callbacks that the including shader must define; FsrRcasHx2() below calls both of them.
+ AH4 FsrRcasLoadHx2(ASW2 p); // Fetches the input texel at integer position 'p'.
+ void FsrRcasInputHx2(inout AH2 r,inout AH2 g,inout AH2 b); // Optional in-place transform; each AH2 carries one channel of the two pixels being processed.
+//------------------------------------------------------------------------------------------------------------------------------
+ // Converts the packed Structures of Arrays result (one AH2 per channel) into two per-pixel colors for store: pix0 takes the .x lanes, pix1 the .y lanes.
+ void FsrRcasDepackHx2(out AH4 pix0,out AH4 pix1,AH2 pixR,AH2 pixG,AH2 pixB){
+  #ifdef A_HLSL
+   // Slower path for DX only, since it won't allow uninitialized output values: alpha is zeroed there.
+   pix1.a=0.0;pix0.a=pix1.a;
+  #endif
+  pix0.r=pixR.x;pix0.g=pixG.x;pix0.b=pixB.x;
+  pix1.r=pixR.y;pix1.g=pixG.y;pix1.b=pixB.y;}
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrRcasHx2( // Packed 16-bit RCAS: sharpens the pixel at 'ip' and the pixel at 'ip+{8,0}' together.
+ // Output values are for 2 8x8 tiles in a 16x8 region.
+ //  pix<R,G,B>.x =  left 8x8 tile
+ //  pix<R,G,B>.y = right 8x8 tile
+ // This enables later processing to easily be packed as well.
+ out AH2 pixR,
+ out AH2 pixG,
+ out AH2 pixB,
+ #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
+  out AH2 pixA, // Receives the alpha of the two center taps unchanged (parameter only exists with FSR_RCAS_PASSTHROUGH_ALPHA).
+ #endif
+ AU2 ip, // Integer pixel position in output.
+ AU4 con){ // Constants from the RCAS setup function (FsrRcasCon() in this file); only con.y is read here.
+  // Sharpening uses the same b/d/e/f/h cross of taps as FsrRcasH(), fetched once around 'ip' (suffix 0) and once around 'ip+{8,0}' (suffix 1).
+  ASW2 sp0=ASW2(ip);
+  AH3 b0=FsrRcasLoadHx2(sp0+ASW2( 0,-1)).rgb;
+  AH3 d0=FsrRcasLoadHx2(sp0+ASW2(-1, 0)).rgb;
+  #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
+   AH4 ee0=FsrRcasLoadHx2(sp0);
+   AH3 e0=ee0.rgb;pixA.r=ee0.a;
+  #else
+   AH3 e0=FsrRcasLoadHx2(sp0).rgb;
+  #endif
+  AH3 f0=FsrRcasLoadHx2(sp0+ASW2( 1, 0)).rgb;
+  AH3 h0=FsrRcasLoadHx2(sp0+ASW2( 0, 1)).rgb;
+  ASW2 sp1=sp0+ASW2(8,0); // Second pixel: 8 texels further along x.
+  AH3 b1=FsrRcasLoadHx2(sp1+ASW2( 0,-1)).rgb;
+  AH3 d1=FsrRcasLoadHx2(sp1+ASW2(-1, 0)).rgb;
+  #ifdef FSR_RCAS_PASSTHROUGH_ALPHA
+   AH4 ee1=FsrRcasLoadHx2(sp1);
+   AH3 e1=ee1.rgb;pixA.g=ee1.a;
+  #else
+   AH3 e1=FsrRcasLoadHx2(sp1).rgb;
+  #endif
+  AH3 f1=FsrRcasLoadHx2(sp1+ASW2( 1, 0)).rgb;
+  AH3 h1=FsrRcasLoadHx2(sp1+ASW2( 0, 1)).rgb;
+  // Arrays of Structures to Structures of Arrays conversion: .x holds the first pixel, .y the second.
+  AH2 bR=AH2(b0.r,b1.r);
+  AH2 bG=AH2(b0.g,b1.g);
+  AH2 bB=AH2(b0.b,b1.b);
+  AH2 dR=AH2(d0.r,d1.r);
+  AH2 dG=AH2(d0.g,d1.g);
+  AH2 dB=AH2(d0.b,d1.b);
+  AH2 eR=AH2(e0.r,e1.r);
+  AH2 eG=AH2(e0.g,e1.g);
+  AH2 eB=AH2(e0.b,e1.b);
+  AH2 fR=AH2(f0.r,f1.r);
+  AH2 fG=AH2(f0.g,f1.g);
+  AH2 fB=AH2(f0.b,f1.b);
+  AH2 hR=AH2(h0.r,h1.r);
+  AH2 hG=AH2(h0.g,h1.g);
+  AH2 hB=AH2(h0.b,h1.b);
+  // Run optional input transform (callback supplied by the including shader) on all five taps.
+  FsrRcasInputHx2(bR,bG,bB);
+  FsrRcasInputHx2(dR,dG,dB);
+  FsrRcasInputHx2(eR,eG,eB);
+  FsrRcasInputHx2(fR,fG,fB);
+  FsrRcasInputHx2(hR,hG,hB);
+  // Luma times 2, computed as 0.5*B + (0.5*R + G).
+  AH2 bL=bB*AH2_(0.5)+(bR*AH2_(0.5)+bG);
+  AH2 dL=dB*AH2_(0.5)+(dR*AH2_(0.5)+dG);
+  AH2 eL=eB*AH2_(0.5)+(eR*AH2_(0.5)+eG);
+  AH2 fL=fB*AH2_(0.5)+(fR*AH2_(0.5)+fG);
+  AH2 hL=hB*AH2_(0.5)+(hR*AH2_(0.5)+hG);
+  // Noise detection: (average luma of the 4 ring taps - center luma), relative to the luma range of all 5 taps.
+  AH2 nz=AH2_(0.25)*bL+AH2_(0.25)*dL+AH2_(0.25)*fL+AH2_(0.25)*hL-eL;
+  nz=ASatH2(abs(nz)*APrxMedRcpH2(AMax3H2(AMax3H2(bL,dL,eL),fL,hL)-AMin3H2(AMin3H2(bL,dL,eL),fL,hL)));
+  nz=AH2_(-0.5)*nz+AH2_(1.0); // Remap to 1.0-0.5*nz; only consumed when FSR_RCAS_DENOISE is defined.
+  // Min and max of ring (the 4 outer taps b,d,f,h per channel; the center 'e' is folded in by the limiters).
+  AH2 mn4R=min(AMin3H2(bR,dR,fR),hR);
+  AH2 mn4G=min(AMin3H2(bG,dG,fG),hG);
+  AH2 mn4B=min(AMin3H2(bB,dB,fB),hB);
+  AH2 mx4R=max(AMax3H2(bR,dR,fR),hR);
+  AH2 mx4G=max(AMax3H2(bG,dG,fG),hG);
+  AH2 mx4B=max(AMax3H2(bB,dB,fB),hB);
+  // Immediate constants for peak range: x=1.0 is subtracted from below, y=-4.0 is added to 4*min.
+  AH2 peakC=AH2(1.0,-1.0*4.0);
+  // Limiters, these need to be high precision RCPs. hitMin=min/(4*max), hitMax=(1-max)/(4*min-4), per channel.
+  AH2 hitMinR=min(mn4R,eR)*ARcpH2(AH2_(4.0)*mx4R);
+  AH2 hitMinG=min(mn4G,eG)*ARcpH2(AH2_(4.0)*mx4G);
+  AH2 hitMinB=min(mn4B,eB)*ARcpH2(AH2_(4.0)*mx4B);
+  AH2 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH2(AH2_(4.0)*mn4R+peakC.y);
+  AH2 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH2(AH2_(4.0)*mn4G+peakC.y);
+  AH2 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH2(AH2_(4.0)*mn4B+peakC.y);
+  AH2 lobeR=max(-hitMinR,hitMaxR); // Per-channel lobe candidate: the larger of the two limiter values.
+  AH2 lobeG=max(-hitMinG,hitMaxG);
+  AH2 lobeB=max(-hitMinB,hitMaxB);
+  AH2 lobe=max(AH2_(-FSR_RCAS_LIMIT),min(AMax3H2(lobeR,lobeG,lobeB),AH2_(0.0)))*AH2_(AH2_AU1(con.y).x); // Clamp to {-FSR_RCAS_LIMIT to 0}, then scale both lanes by the value taken from con.y.
+  // Apply noise removal: scale the lobe by the remapped noise estimate.
+  #ifdef FSR_RCAS_DENOISE
+   lobe*=nz;
+  #endif
+  // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes.
+  AH2 rcpL=APrxMedRcpH2(AH2_(4.0)*lobe+AH2_(1.0));
+  pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; // Ring taps weighted by 'lobe', center by 1, normalized by 4*lobe+1.
+  pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL;
+  pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                          FSR - [LFGA] LINEAR FILM GRAIN APPLICATOR
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// Adding output-resolution film grain after scaling is a good way to mask both rendering and scaling artifacts.
+// Suggest using tiled blue noise as film grain input, with peak noise frequency set for a specific look and feel.
+// The 'Lfga*()' functions provide a convenient way to introduce grain.
+// These functions limit grain based on distance to signal limits.
+// This is done so that the grain is temporally energy preserving, and thus won't modify image tonality.
+// Grain application should be done in a linear colorspace.
+// The grain should be temporally changing, but have a temporal sum per pixel that adds to zero (non-biased).
+//------------------------------------------------------------------------------------------------------------------------------
+// Usage,
+//   FsrLfga*(
+//    color, // In/out linear colorspace color {0 to 1} ranged.
+//    grain, // Per pixel grain texture value {-0.5 to 0.5} ranged, input is 3-channel to support colored grain.
+//    amount); // Amount of grain {0 to 1} ranged.
+//------------------------------------------------------------------------------------------------------------------------------
+// Example if grain texture is monochrome: 'FsrLfgaF(color,AF3_(grain),amount)'
+//==============================================================================================================================
+#if defined(A_GPU)
+ // Adds grain 't' scaled by amount 'a' and by min(1-c,c), i.e. the distance from 'c' to the nearer signal limit.
+ void FsrLfgaF(inout AF3 c,AF3 t,AF1 a){AF3 room=min(AF3_(1.0)-c,c);AF3 grain=t*AF3_(a);c+=grain*room;}
+#endif
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(A_HALF)
+ // Half precision version (slower); same formula as FsrLfgaF() on AH3 operands.
+ void FsrLfgaH(inout AH3 c,AH3 t,AH1 a){AH3 room=min(AH3_(1.0)-c,c);AH3 grain=t*AH3_(a);c+=grain*room;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Packed half precision version (faster); the same formula is applied to each channel's AH2 with a shared amount 'a'.
+ void FsrLfgaHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 tR,AH2 tG,AH2 tB,AH1 a){
+  AH2 amt=AH2_(a);AH2 one=AH2_(1.0);cR+=(tR*amt)*min(one-cR,cR);cG+=(tG*amt)*min(one-cG,cG);cB+=(tB*amt)*min(one-cB,cB);}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                          FSR - [SRTM] SIMPLE REVERSIBLE TONE-MAPPER
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// This provides a way to take linear HDR color {0 to FP16_MAX} and convert it into a temporary {0 to 1} ranged post-tonemapped linear.
+// The tonemapper preserves RGB ratio, which helps maintain HDR color bleed during filtering.
+//------------------------------------------------------------------------------------------------------------------------------
+// Reversible tonemapper usage,
+//  FsrSrtm*(color); // {0 to FP16_MAX} converted to {0 to 1}.
+//  FsrSrtmInv*(color); // {0 to 1} converted into {0 to 32768, output peak safe for FP16}.
+//==============================================================================================================================
+#if defined(A_GPU)
+ void FsrSrtmF(inout AF3 c){AF1 peak=AMax3F1(c.r,c.g,c.b);AF1 scale=ARcpF1(peak+AF1_(1.0));c*=AF3_(scale);} // Forward tonemap: c/(max(r,g,b)+1).
+ // Inverse of FsrSrtmF(): c/(1-max(r,g,b)). The extra max floors the divisor at 1/32768, which solves the c=1.0 case (which is a /0).
+ void FsrSrtmInvF(inout AF3 c){AF1 peak=AMax3F1(c.r,c.g,c.b);AF1 denom=max(AF1_(1.0/32768.0),AF1_(1.0)-peak);c*=AF3_(ARcpF1(denom));}
+#endif
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(A_HALF)
+ void FsrSrtmH(inout AH3 c){AH1 peak=AMax3H1(c.r,c.g,c.b);AH1 scale=ARcpH1(peak+AH1_(1.0));c*=AH3_(scale);} // Half precision forward tonemap: c/(max(r,g,b)+1).
+ void FsrSrtmInvH(inout AH3 c){AH1 peak=AMax3H1(c.r,c.g,c.b);AH1 denom=max(AH1_(1.0/32768.0),AH1_(1.0)-peak);c*=AH3_(ARcpH1(denom));} // Half precision inverse tonemap; divisor floored at 1/32768.
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrSrtmHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){ // Packed forward tonemap: every channel is multiplied by 1/(max(R,G,B)+1).
+  AH2 peak=AMax3H2(cR,cG,cB);AH2 scale=ARcpH2(peak+AH2_(1.0));cR*=scale;cG*=scale;cB*=scale;}
+ void FsrSrtmInvHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){ // Packed inverse tonemap; the divisor 1-max(R,G,B) is floored at 1/32768 to avoid a /0.
+  AH2 peak=AMax3H2(cR,cG,cB);AH2 denom=max(AH2_(1.0/32768.0),AH2_(1.0)-peak);AH2 scale=ARcpH2(denom);cR*=scale;cG*=scale;cB*=scale;}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                       FSR - [TEPD] TEMPORAL ENERGY PRESERVING DITHER
+//
+//------------------------------------------------------------------------------------------------------------------------------
+// Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion.
+// Gamma 2.0 is used so that the conversion back to linear is just to square the color.
+// The conversion comes in 8-bit and 10-bit modes, designed for output to 8-bit UNORM or 10:10:10:2 respectively.
+// Given good non-biased temporal blue noise as dither input,
+// the output dither will temporally conserve energy.
+// This is done by choosing the linear nearest step point instead of perceptual nearest.
+// See code below for details.
+//------------------------------------------------------------------------------------------------------------------------------
+// DX SPEC RULES FOR FLOAT->UNORM 8-BIT CONVERSION
+// ===============================================
+// - Output is 'uint(floor(saturate(n)*255.0+0.5))'.
+// - Thus rounding is to nearest.
+// - NaN gets converted to zero.
+// - INF is clamped to {0.0 to 1.0}.
+//==============================================================================================================================
+#if defined(A_GPU)
+ // Hand tuned integer position to dither value, with more values than simple checkerboard ('f' is added to p.x, presumably a per-frame offset).
+ // Only 32-bit has enough precision for this computation.
+ // Output is {0 to <1}.
+ AF1 FsrTepdDitF(AU2 p,AU1 f){
+  // The 1.61803 golden ratio.
+  AF1 golden=AF1_((1.0+sqrt(5.0))/2.0);
+  // Number designed to provide a good visual pattern.
+  AF1 rowStep=AF1_(1.0/3.69);
+  AF1 col=AF1_(p.x+f);
+  AF1 row=AF1_(p.y);
+  AF1 phase=col*golden+(row*rowStep);
+  return AFractF1(phase);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // 8-bit gamma 2.0 version: snaps sqrt(c) to one of the two neighboring 1/255 steps, selected per channel by 'dit'.
+ // The 'c' input is {0 to 1}.
+ // Output is {0 to 1} ready for image store.
+ void FsrTepdC8F(inout AF3 c,AF1 dit){
+  AF3 stepSz=AF3_(1.0/255.0);
+  // Lower quantization step in gamma 2.0 space, then the linear values of that step and of the next step up.
+  AF3 gLo=floor(sqrt(c)*AF3_(255.0))*stepSz;
+  AF3 linLo=gLo*gLo;
+  AF3 gHi=gLo+stepSz;AF3 linHi=gHi*gHi;
+  // Ratio of 'linLo' to 'linHi' required to produce 'c'.
+  // APrxLoRcpF1() won't work here (at least for very high dynamic ranges).
+  // APrxMedRcpF1() is an IADD,FMA,MUL.
+  AF3 ratio=(c-linHi)*APrxMedRcpF3(linLo-linHi);
+  // Use the ratio as a cutoff to choose the lower or the upper step. AGtZeroF1() is a MUL.
+  c=ASatF3(gLo+AGtZeroF3(AF3_(dit)-ratio)*stepSz);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // 10-bit gamma 2.0 version: same selection between neighboring steps as FsrTepdC8F(), with a 1/1023 step size.
+ // The 'c' input is {0 to 1}.
+ // Output is {0 to 1} ready for image store.
+ void FsrTepdC10F(inout AF3 c,AF1 dit){
+  AF3 stepSz=AF3_(1.0/1023.0);
+  AF3 gLo=floor(sqrt(c)*AF3_(1023.0))*stepSz;
+  AF3 linLo=gLo*gLo;
+  AF3 gHi=gLo+stepSz;AF3 linHi=gHi*gHi;
+  AF3 ratio=(c-linHi)*APrxMedRcpF3(linLo-linHi);
+  c=ASatF3(gLo+AGtZeroF3(AF3_(dit)-ratio)*stepSz);}
+#endif
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(A_HALF)
+ AH1 FsrTepdDitH(AU2 p,AU1 f){ // Same dither value as FsrTepdDitF(): computed in 32-bit, converted to half only on return.
+  AF1 golden=AF1_((1.0+sqrt(5.0))/2.0); // The 1.61803 golden ratio.
+  AF1 rowStep=AF1_(1.0/3.69);
+  AF1 col=AF1_(p.x+f);
+  AF1 row=AF1_(p.y);
+  AF1 phase=col*golden+(row*rowStep);
+  return AH1(AFractF1(phase));}
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrTepdC8H(inout AH3 c,AH1 dit){ // Half precision 8-bit gamma 2.0 dither; same steps as FsrTepdC8F().
+  AH3 stepSz=AH3_(1.0/255.0);
+  AH3 gLo=floor(sqrt(c)*AH3_(255.0))*stepSz; // Lower 1/255 step in gamma 2.0 space.
+  AH3 linLo=gLo*gLo;
+  AH3 gHi=gLo+stepSz;AH3 linHi=gHi*gHi; // Next step up, squared back to linear.
+  AH3 ratio=(c-linHi)*APrxMedRcpH3(linLo-linHi);
+  c=ASatH3(gLo+AGtZeroH3(AH3_(dit)-ratio)*stepSz);}
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrTepdC10H(inout AH3 c,AH1 dit){ // Half precision 10-bit gamma 2.0 dither; same steps as FsrTepdC10F().
+  AH3 stepSz=AH3_(1.0/1023.0);
+  AH3 gLo=floor(sqrt(c)*AH3_(1023.0))*stepSz; // Lower 1/1023 step in gamma 2.0 space.
+  AH3 linLo=gLo*gLo;
+  AH3 gHi=gLo+stepSz;AH3 linHi=gHi*gHi; // Next step up, squared back to linear.
+  AH3 ratio=(c-linHi)*APrxMedRcpH3(linLo-linHi);
+  c=ASatH3(gLo+AGtZeroH3(AH3_(dit)-ratio)*stepSz);}
+//==============================================================================================================================
+ // This computes dither for positions 'p' and 'p+{8,0}': .x of the result belongs to 'p', .y to 'p+{8,0}'.
+ AH2 FsrTepdDitHx2(AU2 p,AU1 f){
+  AF1 golden=AF1_((1.0+sqrt(5.0))/2.0); // The 1.61803 golden ratio.
+  AF1 rowStep=AF1_(1.0/3.69);
+  AF1 col0=AF1_(p.x+f);
+  AF1 col1=col0+AF1_(8.0);
+  AF1 row=AF1_(p.y);
+  AF2 cols=AF2(col0,col1);
+  AF2 phase=cols*AF2_(golden)+AF2_(row*rowStep);
+  return AH2(AFractF2(phase));}
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrTepdC8Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){ // Packed 8-bit gamma 2.0 dither, handled one channel at a time.
+  AH2 levels=AH2_(255.0);
+  AH2 stepSz=AH2_(1.0/255.0);
+  // Per channel: lower gamma 2.0 step, its linear value, linear value of the next step, cutoff ratio, then the choice by 'dit'.
+  AH2 gLoR=floor(sqrt(cR)*levels)*stepSz;
+  AH2 linLoR=gLoR*gLoR;
+  AH2 gHiR=gLoR+stepSz;AH2 linHiR=gHiR*gHiR;
+  AH2 ratioR=(cR-linHiR)*APrxMedRcpH2(linLoR-linHiR);
+  cR=ASatH2(gLoR+AGtZeroH2(dit-ratioR)*stepSz);
+  AH2 gLoG=floor(sqrt(cG)*levels)*stepSz;
+  AH2 linLoG=gLoG*gLoG;
+  AH2 gHiG=gLoG+stepSz;AH2 linHiG=gHiG*gHiG;
+  AH2 ratioG=(cG-linHiG)*APrxMedRcpH2(linLoG-linHiG);
+  cG=ASatH2(gLoG+AGtZeroH2(dit-ratioG)*stepSz);
+  AH2 gLoB=floor(sqrt(cB)*levels)*stepSz;
+  AH2 linLoB=gLoB*gLoB;
+  AH2 gHiB=gLoB+stepSz;AH2 linHiB=gHiB*gHiB;
+  AH2 ratioB=(cB-linHiB)*APrxMedRcpH2(linLoB-linHiB);
+  cB=ASatH2(gLoB+AGtZeroH2(dit-ratioB)*stepSz);}
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrTepdC10Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){ // Packed 10-bit gamma 2.0 dither, handled one channel at a time.
+  AH2 levels=AH2_(1023.0);
+  AH2 stepSz=AH2_(1.0/1023.0);
+  // Per channel: lower gamma 2.0 step, its linear value, linear value of the next step, cutoff ratio, then the choice by 'dit'.
+  AH2 gLoR=floor(sqrt(cR)*levels)*stepSz;
+  AH2 linLoR=gLoR*gLoR;
+  AH2 gHiR=gLoR+stepSz;AH2 linHiR=gHiR*gHiR;
+  AH2 ratioR=(cR-linHiR)*APrxMedRcpH2(linLoR-linHiR);
+  cR=ASatH2(gLoR+AGtZeroH2(dit-ratioR)*stepSz);
+  AH2 gLoG=floor(sqrt(cG)*levels)*stepSz;
+  AH2 linLoG=gLoG*gLoG;
+  AH2 gHiG=gLoG+stepSz;AH2 linHiG=gHiG*gHiG;
+  AH2 ratioG=(cG-linHiG)*APrxMedRcpH2(linLoG-linHiG);
+  cG=ASatH2(gLoG+AGtZeroH2(dit-ratioG)*stepSz);
+  AH2 gLoB=floor(sqrt(cB)*levels)*stepSz;
+  AH2 linLoB=gLoB*gLoB;
+  AH2 gHiB=gLoB+stepSz;AH2 linHiB=gHiB*gHiB;
+  AH2 ratioB=(cB-linHiB)*APrxMedRcpH2(linLoB-linHiB);
+  cB=ASatH2(gLoB+AGtZeroH2(dit-ratioB)*stepSz);}
+#endif
+
+
+void CurrFilter(AU2 pos) // Runs RCAS for the pixel at 'pos' and stores the result to imgOutput with alpha 1.
+{
+    AF1 red, green, blue;
+    FsrRcasF(red, green, blue, pos, con0);
+    imageStore(imgOutput, ASU2(pos), AF4(red, green, blue, 1));
+}
+
+void main() {
+    // Fill the RCAS constants in con0 from sharpening_data.
+    FsrRcasCon(con0, sharpening_data);
+    // The workgroup ID is scaled by 16 per axis; ARmp8x8() supplies this invocation's position (assumed to lie in an 8x8 block).
+    AU2 tileOrigin = AU2(gl_WorkGroupID.x << 4u, gl_WorkGroupID.y << 4u);
+    AU2 base = ARmp8x8(gl_LocalInvocationID.x) + tileOrigin;
+    // Each invocation filters four pixels, visited in the order: base, +8 in x, +8 in x and y, +8 in y.
+    CurrFilter(base);
+    CurrFilter(base + AU2(8u, 0u));
+    CurrFilter(base + AU2(8u, 8u));
+    CurrFilter(base + AU2(0u, 8u));
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrSharpening.spv b/Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrSharpening.spv
new file mode 100644
index 0000000000000000000000000000000000000000..b2e30e1fe82d1a17c9b837a184d99a88f1c112bb
GIT binary patch
literal 20472
zcmZvj37nNx`Nv<F83qv%MNt%i0YOwmQB-hcQ4j?|aH%YD*hgW8hJm0^QE@9ZHM3l*
z(KORsS~N>j$;`DZGxsIRT++%V%U%E9@4fFcoH?KWjpO<Kp5OC4=Q-!R@4fH6XR7Hs
zq`oM+6dM=ai(PAq%4@TtE0zLVw@N>G>a3|Fm$l6sIeO3Cbm&!79rf9?s4wd1YiUaw
zo0{p0PrAZ!=}o-$&1mCj`rB&wA1=jBkUBil*g7md(8f<1HGAdo1*7%_*WterePadJ
zkr(|a_2nJ<rX6~>qObad1@o1w|Ef9qU|-fcw~}uQaLa-PmGQR(&uMB~MhEX!Y)emk
z)3hXkHEfMN;`@P{nwM4Lw*ha6uP+*mpVqi?k4dAPLG>Rta}1zgS>l28t4chG{>v3y
zUksxkKdq_x|BD}j7V+DIBYr5j8edoJX#C8%ORK|I#~t1gS67TspViow>N|Jn^~J90
zGg?<pn?EnfQq40GE%srwelwRW9kX&wWhb299vympu^)KSveub%8<$OOX`DA{_==|8
zV?XVe((1VGCT#GV(!6we+oWXG%y}SsYsI&wavd!!@p$?<6<l9Tp)bdp*wQ?F#q!av
zd$s1tXmeYd8LO^11iWlfW9!o9`ODn!I<Bml8)B28S1jLsU(@JQTTH`eY12wwEBx>(
zH+6+iuX3jXuPJ8GH!(r+5#Z|h-HM~Z6PLHPPHI}*HouiRSrL7X!oIYndOe*aZftE_
zLhsmTWc;j7yrwu8UR{UU;`~lpZE;z~ugdt<8NVju*Jk{>j9;Je8!~=l#&6E}ZSZ9+
z%UkEpug>)W_@Y^EP<8E|P%rVb;F#xn@P;+2DPDxnqq$zMWZG+R(pR+CGwludf>L`Y
z)82(IDzy(Y?GyMMt_Jt6OBXO#g1A=9RUobh&nta;h)bVM#HCMf@ch!JkGS;dD=vNd
zfftlM{l%rvKym3a7`&+T87eM)hKnmcwZ+ch4Qp0g?2_?aGrniW_s;m(j31cs@fn|(
z@hKTUEaTHMJ}cu#W&D^<yr%d9e9qMBz8~L7t1aer;x)wy@Oh>0;!av^v9uGfDOSMe
zm%b}IX|=`bPQ0c#4ZfiCUE4{kEza)5Yl`#Xi%Q=MI%&1V#hrLfaT&asH+--6D>`Yl
zMSI4t$@ujd|5?Uw&G>B@zdhr3Wc<#Iugmy78NWB<_h<Zpj6am|M>76s#{Zo0r!xL5
zd`a_Y?w;y;K9}h)WPE+bU#iA)$5rEB&h%F^{#wT0sK#@bR^#8y^tUtqPR8Fie&vSx
zKgjftGX6=%Yr0nAJL<26r|%QpGTuGon^fZ`ME$)oy?4ep&-hmI`Hd$|Ea$%sd{I*y
zPe3y(*Z088e@MoMb@)%N`0teIBQrj#!+%1>f6q)GoALch|3*Ikz27Eg`k@)0*5O~R
z=g3SyHsi;2_*d(hpXrM;-U46TGRKdNN<D2QcLS|&Rmmrm{N$2DEBYFEy1%Dod~L?h
z%=kGOzYyL!cPZ~&)%+J_`Xw2^v=gr>9*55@W1keaH18I<p6cXVTRffdXEOe5#-H!R
z>xx(5O-mL|U*5)pW@XH};w@~IXTtf-P0b4nz5&!%Y}T;BK6_qcTO;dNQ~U>iU#|5#
zo$}Wf|Lw$UiVxunSFXahd*yr4!o{<vw#;o@Jf(RBU0YL2^OS=k*QcFg`R-Ih?BdzS
zw6vZ$xwU2aQav5B7xr}j`euB)jBnqGcPn;;H#M)AIe%Hp;^me=0o|xiyRS}Kb=sAA
z?_BDBM;ZZcS>Dzui8-|U>a1;8yOE_nnR)o^H<-q>i{K4l-kvJly;3(sU9Rry5Gr!t
zd=05~_|#Nvn)P>|26t#xZhUR2`#p7I@U|s)z2)1Ke0a(IuIG5MKIwScVm#L;9Z&9f
z>c+>~r19D!-nB{N<;K$$eelw+0VWss1$CnzRB5$bwMC_AYhae$>BZW&g!iN$MDsNW
zz6rgWcKv&mSbGDJn<rdb*tY{m-WYQ$;zyPK)jDg7-Qn7dbNs#N2hqCF^xvCaoBq}_
zhTeSI)yL91j{9YO`_OCC&-xCaH#TapH_jW%IT>f(gXz_@H?R+r=#2}zK8L~evEBw&
z&Ns6>w?4*L%aNt7ZD+Xi@VkO9xpl9uur&t;Ut8f--TKa>efqt(P7eLjsy<Gx-cEC^
zf?o-C@B8tc^%i=^P)}dMx5e{T1pS=f&r2t5`<C4M&zEa<AHDfvFYX7s9@=C4N5H;<
zKL*y{_^0WOw?4;vhTeGX`nx{Q(>~=jf<17LU!b{W+Qarz$+bt^TVUrEK5v7y=@Yj1
zORn8GYnfeLjE?ocyu#NU5c*?P{qEpfJDFn5I&(VjesK3td-x2f`iu*HN`;TR%Q-&*
zc3iJB*I4f7nKt+A<q}ti#aO;D<6T|1S0udjHHX~yJzoP@UElM}=Xh^}m9*(A_dQRW
z`u|F-zkBo%-0Ld#=wrCGXg6N&d!jbubrc&Dy6=(NoWJ&ZxU~o04BkEAJHW@mM^Lrn
z_@1h*9h=QL`5t;7*fDnk<5sabrrh^YZTk9t8NR!ec5V8~eb3Z)FZR&h+I(Ry-osKy
zukXS1es+6}%I*E^4(@&JXLoSl!~A>>uHK8JDfe?Y<$eyQ+|Ob8c&c!3-le%;UdQ`W
zgZuPeX+MC9tw%l<g|2v?|M)wXb*mY35c~tMHe<Zr<c{H_yU^TI?_K@fv#zCGf9E70
zi@)!EHQ>?o+9O9DHupWv(G9NsUgFHz9jwjkA>uXy8`q8=WA}h-kGPG&+Kih~*3c7d
zKD9NaZ4<Dysadysqs_e5y(w6mn)@PmPMC_`;G^laTbp~iIavR6PJOUxcTDHB1z4Nc
zbi{25jyT7X>lbxwRqAS95AvvEYp``_k9_^W=4;0uYqSkE?NP_JU~Op~{oxVkSaSWM
zjsc~v=5;8KItGHRLwnRQ2psc@IvTKPKdoGk!C-A^9Yf#|=U8(6qK=`ZuIBYGk2;2d
ztwa0AysM0%dEa^;cn09h_maJd^}gAN<~_VIz4`U~OlkiFyEt;Vf4ynGdeO|`zHdr1
z$7b{~&tprwImBs>zHnbY3q_7T3Em>Xo+FrhEBeTN99aM0v%$u!rvAO?kEdx1y%B7l
z(C2`)J<M#K(_ET4kEf|wv$nKmpJ#ogHE#`%`uio=XI*RVPaicb0P7!oA=sGUO<?1q
z#uLEW9%PrS@kE+AjPt!s-8!{Lo$g%&&6m%UvBv`vJTSq7z}7jK-a3qLrs*HN1#C=k
zem3Sa#m}$$ivJhEU1-B;`l^T43J%S@`dYL0wB})OUun%l;8Fkf2_6cz<{jvxrsZJ$
zgRcM^6Z}hH<D#yWVC&MRuX<>!z@eE}Uu)K$*1RL!S9&kG2G+I{eSH3%1ol4jUKRW7
z<Gt!ypG@<f)E>2-0yfXtrOz6$e%h@=p4Kq}?klZhXSj9jLLYUU0k)3dXO%q0JsobG
zb?Up87ImHpwodKp#-{T)yVSM!q!Q<L4$XBmr@76cpL0-4=ddf>S2~B0aOdEU7E$xL
z;FyE&JLe}{`(-7!Mz!;3&cXZV>oniHw8xw;06UKNoIV%Qw8eY#MPTE!>*Kt>LDTLS
z)+3L67lXA0|7OWgDPu2zYYY8bV8=hB-0N?H_0evA=W{77_Th4{`=I^QGUf`facYt0
zJ78;3bG_xU-lM>yX})%+#rxMDVAp$3de__ay^5Bu_thD{Cgay;{JM-^pYa<qeq+XO
zF1hRF{(PTiovw|w$;H;yUSeyyk~WsgjrjqsFHL{_)Fa;y!I96}<>s?)bBL`u&G#d?
z`SjOMT|eid7P)T%M{egO*Uve%)1tl~gZ*qVUO)ACX8Q@)`t@-i<*|=r!TZsC?L%`P
z$I$Ofb07Dox0YM!e?hyg<a^WqlBVB0^jqj}r|C1BH8bxWrA@!Cv_-Td%Iob{V0B+%
zyA!O<`#o%Tl{VwUc6Vv3p`THXwGM1fa~PwKb6rOtb2Y~Vny&+B@wsqdg2yFzJlHju
zNS{6fnP&>k*Cd*G4yK=6;(Gc+XvQ8&?|pDT{R1>L*Rvb_Z)p0(eSL3f>xruVTbgUJ
z9Zk)B*I#?&co3{D_(LUk@9zUYOw&iZ@z$U%{C@{FKHj^257rjXo__#4uSaOc%k?qd
z{C}jG)0%tIKT5M^uO+p}^Cz%*+G*yI>l^x?!Rn9E)a8yD`r}~rD{1O-^?YAuV)HeX
z7W+Fb!G|Y!dWjFCpHX7>@d%oGH;dlA^L}`OR$uaG=$}ov`uc>cznF0KR}!xNTEf*|
zPq_LU30Hq7;p*=uT>Zm@TjwVk@4}`;{r0sP_dGn+dt}^m@l^M0JmsE`r`$90lzUE|
z^8Ok3ygbzhXMAYJJv&eRJwH#m=jSQ+{5<8JpQqgO^OSpjo^sF6Q$9Z9o}Z_>=jSQ+
z{5<8JpQqgO^OSpjo^sF6Q||eB%8$?Zyo{faanI4yc+b&O?m2qOJxfpd>WrV3@wFNE
zJUxx~JU!)}si)jC^^|+2p7JX)-k$MmGJbu=Jy%cjd9I#v&(%}zxq8YyS5LX;>M38B
z@q03UZ^k`WPvbpTPx(U`_gp>IJy%b;=jti<Ts`HUtEc?AjC-!0>gzN9QpP=3PyIbt
zPx)&Z_gp>IJy%b;=jti<Ts`HUtEb#^^^|{<anIFL-E;Mnd#;}HZW-^ManIFLf6vuZ
z?zwu(Jy#EYKkvU=C0x7b>Y>{Y%(&<3sXi>@o~x(2=jtgRmGM0@J~rc?tEcgvtEb#^
z^^|+Ap7J9zer(2%%ed$2X}ss^Dfe7G<!u>XmGP4^z9!?ItEc%qS5Ntw8TVX0)i2Dr
z=jy3`NyaZNxzA0b*xx5<K6Cio<+JotrOjs>pEczAsy#>3=5LPjzm&EY==EP;+Wda!
zbN63qKGUzKP3PP2)AWC%X}9^}cHjx88t=1=dW`pXa5~=0aBVNq9Pbr+Z86@fV8_!Q
z<NX7y-R3JD&u1g`81J88$1}IjLGsA`FR;0_NA7=vwcC89xqYrukKAv9)4BWyuI+7_
z<M~{rEynvV*zvT-c>e=yxA{uP^EpjD#(NK(j`smv+xs-f^EpjhjQ0`P@wCTyAA_~q
ze5K>P3s?8KxSsw~n)~Oour+j}H$c1N`5Y(@|E^&D+iBrngH5~sJ}1h<zYeUw&*kCY
z4V!lTeU6lee|NC^o7S@tHtqV?)62tuW3c}1N&cSLwCmr4ULO9v!1|~4Y>G|0{ywM5
z!+$fd*F;*+=Ge6B?{lm?{I>w>-%gA9Zi!91{yyi*!+$HV{%Jj1W7Dp`&%yHW-v;b8
zlh(5>HtqWRoGcIj?ZEoC)1saM*tF~4pI#pRgTVTy^)z79uD{RO^6(!5_L@rT*&dsA
z{Rh*_!+#i9|8`o`vjaBm`VXa-hyRXX{nL7O!lqsS;q>yzH3IB4tL8JkJhYKuuTM3f
z@8zNG2KHK1^I2c+ZzI0PjRH@gH=l9p5w|;7yK#;y4{Z;y{%VdZ4{bEqJZg?B4{a~7
z*SXrB^zzWgfW4;G_NJGIwh!3rSIzGq@|eTEVCP^y<J2Q=Kd^S=99JIN{$Typ99JIN
z0bui}RmM%UabWKaHNX4FBW?oN`#^0xy*#voz+Ure6Y1qKhl9b+!F<N4N8BW^cH<mZ
z9@=EE{%VdZ5A6`JdDI+N9@?Q`?<ch>^zzWAg1vXt4x^Wcb~xDkL~R<qJmxSR>>SKz
zoO;B~0BbkSapj>M0oGs5apj@S1e-_Aapj>M2_9T(v*_ia9S!z=GwvvQd1#*jdvB>7
zLobgxd=~5+%x9c>#C;B|-8jdUhjuJje>KOIhxU1}dDI+N9@=r>xHpdn&!+kL=Vw(T
zxCg!V_}zI9*ci3QF&7-~3-iEfj`?tNXpbBVz{aRWj)mYZv{TEm7lHND9=0a1<EuHB
zFVJg?94CS+_X^r#a9YO_xOHfcI-0@8s6~zza6QfZOX<@bUxb@Od*o;Z8>1FEmVv#W
zqwY4ae%iyf9PIdNj&%aPw#e}%aNMga!D$_<;MSo%>i9C)7`4dp6>x8w`B&4YIZlF`
zLwn>n8ElMN<TwS~hZc3O0qdtdY^Q=9U(K;rsL>+F>EMB-ww6Av;|#cUXpcJ11RJ9k
zInDxm-<tny`ZUKmaC2yn9A5<+qZT>72JTOby3Ymcr#)=vfgNAXu}-7c7CF8S9$sn}
z(5H1=2)7RHQO8AKW7HzYH^AP%=D(Oe&GAjRIkZQPOTfmcMUHQQhti_%Z-e#I9=1!t
zj<4oe=hJHwUqOF4E$;R2fE`zRjPYHt_gK{NJ+OY-&CyPu=C~4W4(*ZSDzMjf<oG^V
zKkZTH)nIe0nfDs{H17}K=G7i~uLZj|k@tsS{j^8k>%it#Gw=2EY2F{f&8t1~-T-!;
zBJYi0{j^8ko51E)i@2M?UiYV$?_WO#>!;nZFQeBM{yzb0i}Qq^g89d{grZOjzgxiW
zfjNFgFAweKVAoddR(g48zW`g0+HLf5KVy7f|0TGA-h9TX>vuc7TIBc@*fmzWgI*rm
zU0~Nq?M`}m<i8u-Kz{QXryg;?2FHBXf#sq71|0Lb2P_Zmx8Ru1y<mBacOQ5d<C)Jm
zb?dsHUM+Gw2#)zY0G5aLFgWJ(5Lh1he+M?d`HWMKef~Yz`Re1CkI-w2`|1ziU5V8m
z*VG@u?zfsTkJ4+4m_LC>5Tjk)>qMLQ&-9Pc!uN5o*Rb}O%M)N@V!S88`e`@EQ}nBe
zi5ySE&7nQ6@n^ussYMOiVvOg(@tOY|SRUG6!1XlaUZ9tU_E&JfQd>_ik9GbV*mX9a
zaq1EGB6u(@{)Xf4VE$G9zQZ_m$9tAuE#h7QcPX`(>E)sQ1Kgw3UZt0Z_ByzKsl7%o
zk2?PewodaIryg<t0uQA{o&N^&uTrOR>el%Zy;{V*1+Fc%H|gb}{Rg~Bsl81v5ADC;
zfu;5iy*%ptAJ{t0XPkP(y$c>ri#p!}^RH5;aq8Cj2EAIueF%<y{{Sox?PGB4`$u4T
zXrF>(-#-D%qfWmFHZZpNj8l)eE@<h#cZDNY{(i<db?bZ|qguq(sbN#|_hWfz_2Ag|
z%5TYuwh=h?-QShvQD+Zu17n-dIQ7W6F*x1#p75yCICX3Bw{5kE+Y}u8?(fv{&^80d
zzWbZCJhVRG*mr-wmPegifUVPf#;Hf#mf&>X`@*A6<J7Iw-_X^>{vO_s7CE*Bd!GgG
z5AO|+YiT>UIfv5pRX2~nn-8GH`_3S+?>pKfR|D9+jQ6g=aQ(EK!{6GqMUL&kJCZ|t
z#0&*ngPJk^ey=UY*a7VQ7koIpPg4Jm@Tgy3b@TW;ep>&|aO>9|xkiB9<EUj9uzuRj
z;cx%iqW)dM)~`KccGE^vGsg1-Z865~VDHP|d%*i9^^b-}{rak#$8&|W{=MMVuRU_@
z4R-INmN8)cw41}T2yIdSK49zD9x?lZtwGHg&o{J<=h?{LMfV4LO{KqU9RSxJbsPxx
zItV@vo?c_);m&b5O<(o+Or8KXpFWO1kv^UGL2&1-J@Ow6_L`14OoHpDJ?fqew(fRf
zBkmBm_W0bK0yb8=V;@SdE!OKWu<NBg#+nMYW;J6x-_aK19uCgeYdSXVQO69h*In=<
z!09zT6CUfOuX?Q4EU@|Xar`6c)Ac$E?s{pD{6~Ymhhh%L!1dD}b$<q&uh(a>X^-{#
z9N1Xxj(se>wpg#vgIzD}G1eEr)~sgCG<t0@?rd<rUdLn89`!eZz0QNr0jKxSTzIUP
zzUr}F^T6iQ$MNUWr|Y!<?s{pD{0qU}voVK7aQ(DL-A&+py-vWUJ=W_)u(8@5dojJX
zSg$2u*Gqeh)eN>~HDivW*A}%c1#d%(zjgW|SX=0=V6Vw|e_sZUwb57Iye;%<5w{#X
zj23xUfVGAGC9r!Jc~^oXufFQ$ZKGF<xG#f8(IW3xz}iAz4R(%^_at!S)mPoTtLW9l
zYv@m*#X6n}b`P~j-qXR(QUBBE<)NJcwobLR^zzWo0y~!4ne_7btUepuz&PeJPF=rq
z<k+L0ufnZId(7)=VCUuMta<ba?c7rH{n5GT>vgsX{5-IKM-}hCJ;CQ=(;j~V`*pCk
zxP~qOn^U{>olLJyd=Y(m{eJ^)t=eM^E(RMD`MwF(PrErTp-;b)d<$+4?Q!jY8*H4K
zF_+S7i#56o>~}5g>KD>$iyAHmTSM?Gz&^`HP2Yj*qdk1S3r=Ic2lp8_V%p*QXpcIt
z1e;4O@>~T@^L!sZILUK0Tp#U`=NhoN)FRIhz-gXq;XYSKJwJr&qdoFm2R4^l<hdT~
z=V|!-2(F*@@VNo(8b<CL!TM=;eW$T@+QR=Pa9Z2VaG${={>N~Av`5TOz-i1+;Xa#3
z%+KKZXpdTN0h>!L^4tne^ZXp{^LXUB4X%&&$ny)Zxzr-hFTrV^+u?p6h&*?|_0b-A
zeg!s{TI9JC?B{Rz+y&Q9d-&W9PS<xGTtDru@8PVkw($QoIIZm-cx_VKZ{Yf9kC=PG
zY0Pioey51}+y~c3d(?VA*j#Fn=K*k<=RvsNBO=d3aDB8#o`=EaQj0vl1E+a@5BGaa
z<aq?HkM_v(2e7%+BF`VeeqM*qqj3GShtHqD>H0nf*H63aJDv5_mj9jV(b#<*N%QYi
z{TtMyX#U;lG4%f3sekABII%v@%I%-Z_|qAGCgJ9MHsJ@upD(%f`*$eM!5!CH^_8m~
zgzW|JlQcEs<YW1NdeIes;XnSZt@T)g;~KADtj&6``uOsh{I6i=<7a|7<@$vG-@xVy
z{};jfx0n8MeZv3mV8;#rm%#da)@**cKKj2*{}0-$C3o(xft`E&4bJOuZTkB+Gyepe
tFaGZQUtn#%!uIdd#%0oBd!w}J8h@YuCd7FC)%_fr%Q)8Lxax6@{2w`Cd9DBe

literal 0
HcmV?d00001

diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/Fxaa.glsl b/Ryujinx.Graphics.Vulkan/Effects/Shaders/Fxaa.glsl
new file mode 100644
index 0000000000..f197c64ca1
--- /dev/null
+++ b/Ryujinx.Graphics.Vulkan/Effects/Shaders/Fxaa.glsl
@@ -0,0 +1,1177 @@
+/*============================================================================
+
+
+                    NVIDIA FXAA 3.11 by TIMOTHY LOTTES
+
+
+------------------------------------------------------------------------------
+COPYRIGHT (C) 2010, 2011 NVIDIA CORPORATION. ALL RIGHTS RESERVED.
+------------------------------------------------------------------------------
+TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, THIS SOFTWARE IS PROVIDED
+*AS IS* AND NVIDIA AND ITS SUPPLIERS DISCLAIM ALL WARRANTIES, EITHER EXPRESS
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT SHALL NVIDIA
+OR ITS SUPPLIERS BE LIABLE FOR ANY SPECIAL, INCIDENTAL, INDIRECT, OR
+CONSEQUENTIAL DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR
+LOSS OF BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION,
+OR ANY OTHER PECUNIARY LOSS) ARISING OUT OF THE USE OF OR INABILITY TO USE
+THIS SOFTWARE, EVEN IF NVIDIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+------------------------------------------------------------------------------
+                           INTEGRATION CHECKLIST
+------------------------------------------------------------------------------
+(1.)
+In the shader source, setup defines for the desired configuration.
+When providing multiple shaders (for different presets),
+simply setup the defines differently in multiple files.
+Example,
+
+  #define FXAA_PC 1
+  #define FXAA_HLSL_5 1
+  #define FXAA_QUALITY_PRESET 12
+
+Or,
+
+  #define FXAA_360 1
+  
+Or,
+
+  #define FXAA_PS3 1
+  
+Etc.
+
+(2.)
+Then include this file,
+
+  #include "Fxaa3_11.h"
+
+(3.)
+Then call the FXAA pixel shader from within your desired shader.
+Look at the FXAA Quality FxaaPixelShader() for docs on inputs.
+As of FXAA 3.11, all inputs for all shaders are the same
+to enable easy porting between platforms.
+
+  return FxaaPixelShader(...);
+
+(4.)
+Ensure the pass prior to FXAA outputs RGBL (see next section).
+Or use,
+
+  #define FXAA_GREEN_AS_LUMA 1
+
+(5.)
+Setup engine to provide the following constants
+which are used in the FxaaPixelShader() inputs,
+
+  FxaaFloat2 fxaaQualityRcpFrame,
+  FxaaFloat4 fxaaConsoleRcpFrameOpt,
+  FxaaFloat4 fxaaConsoleRcpFrameOpt2,
+  FxaaFloat4 fxaaConsole360RcpFrameOpt2,
+  FxaaFloat fxaaQualitySubpix,
+  FxaaFloat fxaaQualityEdgeThreshold,
+  FxaaFloat fxaaQualityEdgeThresholdMin,
+  FxaaFloat fxaaConsoleEdgeSharpness,
+  FxaaFloat fxaaConsoleEdgeThreshold,
+  FxaaFloat fxaaConsoleEdgeThresholdMin,
+  FxaaFloat4 fxaaConsole360ConstDir
+
+Look at the FXAA Quality FxaaPixelShader() for docs on inputs.
+
+(6.)
+Have FXAA vertex shader run as a full screen triangle,
+and output "pos" and "fxaaConsolePosPos" 
+such that inputs in the pixel shader provide,
+
+  // {xy} = center of pixel
+  FxaaFloat2 pos,
+
+  // {xy_} = upper left of pixel
+  // {_zw} = lower right of pixel
+  FxaaFloat4 fxaaConsolePosPos,
+
+(7.)
+Ensure the texture sampler(s) used by FXAA are set to bilinear filtering.
+
+
+------------------------------------------------------------------------------
+                    INTEGRATION - RGBL AND COLORSPACE
+------------------------------------------------------------------------------
+FXAA3 requires RGBL as input unless the following is set, 
+
+  #define FXAA_GREEN_AS_LUMA 1
+
+In which case the engine uses green in place of luma,
+and requires that RGB input be in a non-linear colorspace.
+
+RGB should be LDR (low dynamic range).
+Specifically do FXAA after tonemapping.
+
+RGB data as returned by a texture fetch can be non-linear,
+or linear when FXAA_GREEN_AS_LUMA is not set.
+Note an "sRGB format" texture counts as linear,
+because the result of a texture fetch is linear data.
+Regular "RGBA8" textures in the sRGB colorspace are non-linear.
+
+If FXAA_GREEN_AS_LUMA is not set,
+luma must be stored in the alpha channel prior to running FXAA.
+This luma should be in a perceptual space (could be gamma 2.0).
+Example pass before FXAA where output is gamma 2.0 encoded,
+
+  color.rgb = ToneMap(color.rgb); // linear color output
+  color.rgb = sqrt(color.rgb);    // gamma 2.0 color output
+  return color;
+
+To use FXAA,
+
+  color.rgb = ToneMap(color.rgb);  // linear color output
+  color.rgb = sqrt(color.rgb);     // gamma 2.0 color output
+  color.a = dot(color.rgb, FxaaFloat3(0.299, 0.587, 0.114)); // compute luma
+  return color;
+
+Another example where output is linear encoded,
+say for instance writing to an sRGB formatted render target,
+where the render target does the conversion back to sRGB after blending,
+
+  color.rgb = ToneMap(color.rgb); // linear color output
+  return color;
+
+To use FXAA,
+
+  color.rgb = ToneMap(color.rgb); // linear color output
+  color.a = sqrt(dot(color.rgb, FxaaFloat3(0.299, 0.587, 0.114))); // compute luma
+  return color;
+
+Getting luma correct is required for the algorithm to work correctly.
+
+
+------------------------------------------------------------------------------
+                          BEING LINEARLY CORRECT?
+------------------------------------------------------------------------------
+Applying FXAA to a framebuffer with linear RGB color will look worse.
+This is very counterintuitive, but happens to be true in this case.
+The reason is that dithering artifacts will be more visible
+in a linear colorspace.
+
+
+------------------------------------------------------------------------------
+                             COMPLEX INTEGRATION
+------------------------------------------------------------------------------
+Q. What if the engine is blending into RGB before wanting to run FXAA?
+
+A. In the last opaque pass prior to FXAA,
+   have the pass write out luma into alpha.
+   Then blend into RGB only.
+   FXAA should be able to run ok
+   assuming the blending pass did not add any aliasing.
+   This should be the common case for particles and common blending passes.
+
+A. Or use FXAA_GREEN_AS_LUMA.
+
+============================================================================*/
+
+#version 430 core
+// Compute shader: the local_size qualifiers below give each workgroup 16x16 invocations.
+layout(local_size_x = 16, local_size_y = 16) in;
+layout(rgba8, binding = 0, set = 3) uniform image2D imgOutput;
+// NOTE(review): inputImage is presumably the frame being anti-aliased and invResolution_data presumably 1.0 / image size in pixels; the entry point that uses them is outside this excerpt - confirm there.
+layout(binding = 1, set = 2) uniform sampler2D inputImage;
+layout(binding = 2) uniform invResolution
+{
+    vec2 invResolution_data;
+};
+// FXAA configuration. These must precede the #ifndef defaults below, which only apply to macros that are still undefined.
+#define FXAA_QUALITY_PRESET 12
+#define FXAA_GREEN_AS_LUMA 1
+#define FXAA_PC 1
+#define FXAA_GLSL_130 1
+
+
+/*============================================================================
+
+                             INTEGRATION KNOBS
+
+============================================================================*/
+#ifndef FXAA_PC
+    //
+    // FXAA Quality
+    // The high quality PC algorithm.
+    //
+    #define FXAA_PC 0
+#endif
+/*--------------------------------------------------------------------------*/
+#ifndef FXAA_GLSL_120
+    #define FXAA_GLSL_120 0
+#endif
+/*--------------------------------------------------------------------------*/
+#ifndef FXAA_GLSL_130
+    #define FXAA_GLSL_130 0
+#endif
+/*==========================================================================*/
+#ifndef FXAA_GREEN_AS_LUMA
+    //
+    // For those using non-linear color,
+    // and either not able to get luma in alpha, or not wanting to,
+    // this enables FXAA to run using green as a proxy for luma.
+    // So with this enabled, no need to pack luma in alpha.
+    //
+    // This will turn off AA on anything which lacks some amount of green.
+    // Pure red and blue or combination of only R and B, will get no AA.
+    //
+    // Might want to lower the settings for both,
+    //    fxaaConsoleEdgeThresholdMin
+    //    fxaaQualityEdgeThresholdMin
+    // In order to ensure AA does not get turned off on colors
+    // which contain a minor amount of green.
+    //
+    // 1 = On.
+    // 0 = Off.
+    //
+    #define FXAA_GREEN_AS_LUMA 0
+#endif
+/*--------------------------------------------------------------------------*/
+#ifndef FXAA_EARLY_EXIT
+    //
+    // Controls algorithm's early exit path.
+    // On PS3 turning this ON adds 2 cycles to the shader.
+    // On 360 turning this OFF adds tenths of a millisecond to the shader.
+    // Turning this off on console will result in a more blurry image.
+    // So this defaults to on.
+    //
+    // 1 = On.
+    // 0 = Off.
+    //
+    #define FXAA_EARLY_EXIT 1
+#endif
+/*--------------------------------------------------------------------------*/
+#ifndef FXAA_DISCARD
+    //
+    // Only valid for PC OpenGL currently.
+    // Probably will not work when FXAA_GREEN_AS_LUMA = 1.
+    //
+    // 1 = Use discard on pixels which don't need AA.
+    //     For APIs which enable concurrent TEX+ROP from same surface.
+    // 0 = Return unchanged color on pixels which don't need AA.
+    //
+    #define FXAA_DISCARD 0
+#endif
+/*--------------------------------------------------------------------------*/
+#ifndef FXAA_FAST_PIXEL_OFFSET
+    //
+    // Used for GLSL 120 only.
+    //
+    // 1 = GL API supports fast pixel offsets
+    // 0 = do not use fast pixel offsets
+    //
+    #ifdef GL_EXT_gpu_shader4
+        #define FXAA_FAST_PIXEL_OFFSET 1
+    #endif
+    #ifdef GL_NV_gpu_shader5
+        #define FXAA_FAST_PIXEL_OFFSET 1
+    #endif
+    #ifdef GL_ARB_gpu_shader5
+        #define FXAA_FAST_PIXEL_OFFSET 1
+    #endif
+    #ifndef FXAA_FAST_PIXEL_OFFSET
+        #define FXAA_FAST_PIXEL_OFFSET 0
+    #endif
+#endif
+/*--------------------------------------------------------------------------*/
+#ifndef FXAA_GATHER4_ALPHA
+    // Selects whether neighbouring luma is fetched with the textureGather-based macros.
+    // 1 = API supports gather4 on alpha channel.
+    // 0 = API does not support gather4 on alpha channel.
+    // NOTE(review): FXAA_HLSL_5 is not defined anywhere above in this shader; GLSL does not default undefined identifiers in #if to 0 - confirm the target compiler accepts the next test.
+    #if (FXAA_HLSL_5 == 1)
+        #define FXAA_GATHER4_ALPHA 1
+    #endif
+    #ifdef GL_ARB_gpu_shader5
+        #define FXAA_GATHER4_ALPHA 1
+    #endif
+    #ifdef GL_NV_gpu_shader5
+        #define FXAA_GATHER4_ALPHA 1
+    #endif
+    #ifndef FXAA_GATHER4_ALPHA
+        #define FXAA_GATHER4_ALPHA 0
+    #endif
+#endif
+
+/*============================================================================
+                        FXAA QUALITY - TUNING KNOBS
+------------------------------------------------------------------------------
+NOTE the other tuning knobs are now in the shader function inputs!
+============================================================================*/
+#ifndef FXAA_QUALITY_PRESET
+    //
+    // Choose the quality preset.
+    // This needs to be compiled into the shader as it affects the generated code.
+    // The best way to provide multiple presets is to
+    // define the preset in each shader, then include this file.
+    // 
+    // OPTIONS
+    // -----------------------------------------------------------------------
+    // 10 to 15 - default medium dither (10=fastest, 15=highest quality)
+    // 20 to 29 - less dither, more expensive (20=fastest, 29=highest quality)
+    // 39       - no dither, very expensive 
+    //
+    // NOTES
+    // -----------------------------------------------------------------------
+    // 12 = slightly faster than FXAA 3.9 and higher edge quality (default)
+    // 13 = about same speed as FXAA 3.9 and better than 12
+    // 23 = closest to FXAA 3.9 visually and performance wise
+    //  _ = the lowest digit is directly related to performance
+    // _  = the highest digit is directly related to style
+    // 
+    #define FXAA_QUALITY_PRESET 12
+#endif
+
+
+/*============================================================================
+
+                           FXAA QUALITY - PRESETS
+
+============================================================================*/
+
+/*============================================================================
+                     FXAA QUALITY - MEDIUM DITHER PRESETS
+============================================================================*/
+#if (FXAA_QUALITY_PRESET == 10)
+    #define FXAA_QUALITY_PS 3
+    #define FXAA_QUALITY_P0 1.5
+    #define FXAA_QUALITY_P1 3.0
+    #define FXAA_QUALITY_P2 12.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY_PRESET == 11)
+    #define FXAA_QUALITY_PS 4
+    #define FXAA_QUALITY_P0 1.0
+    #define FXAA_QUALITY_P1 1.5
+    #define FXAA_QUALITY_P2 3.0
+    #define FXAA_QUALITY_P3 12.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY_PRESET == 12)
+    #define FXAA_QUALITY_PS 5
+    #define FXAA_QUALITY_P0 1.0
+    #define FXAA_QUALITY_P1 1.5
+    #define FXAA_QUALITY_P2 2.0
+    #define FXAA_QUALITY_P3 4.0
+    #define FXAA_QUALITY_P4 12.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY_PRESET == 13)
+    #define FXAA_QUALITY_PS 6
+    #define FXAA_QUALITY_P0 1.0
+    #define FXAA_QUALITY_P1 1.5
+    #define FXAA_QUALITY_P2 2.0
+    #define FXAA_QUALITY_P3 2.0
+    #define FXAA_QUALITY_P4 4.0
+    #define FXAA_QUALITY_P5 12.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY_PRESET == 14)
+    #define FXAA_QUALITY_PS 7
+    #define FXAA_QUALITY_P0 1.0
+    #define FXAA_QUALITY_P1 1.5
+    #define FXAA_QUALITY_P2 2.0
+    #define FXAA_QUALITY_P3 2.0
+    #define FXAA_QUALITY_P4 2.0
+    #define FXAA_QUALITY_P5 4.0
+    #define FXAA_QUALITY_P6 12.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY_PRESET == 15)
+    #define FXAA_QUALITY_PS 8
+    #define FXAA_QUALITY_P0 1.0
+    #define FXAA_QUALITY_P1 1.5
+    #define FXAA_QUALITY_P2 2.0
+    #define FXAA_QUALITY_P3 2.0
+    #define FXAA_QUALITY_P4 2.0
+    #define FXAA_QUALITY_P5 2.0
+    #define FXAA_QUALITY_P6 4.0
+    #define FXAA_QUALITY_P7 12.0
+#endif
+
+/*============================================================================
+                     FXAA QUALITY - LOW DITHER PRESETS
+============================================================================*/
+#if (FXAA_QUALITY_PRESET == 20)
+    #define FXAA_QUALITY_PS 3
+    #define FXAA_QUALITY_P0 1.5
+    #define FXAA_QUALITY_P1 2.0
+    #define FXAA_QUALITY_P2 8.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY_PRESET == 21)
+    #define FXAA_QUALITY_PS 4
+    #define FXAA_QUALITY_P0 1.0
+    #define FXAA_QUALITY_P1 1.5
+    #define FXAA_QUALITY_P2 2.0
+    #define FXAA_QUALITY_P3 8.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY_PRESET == 22)
+    #define FXAA_QUALITY_PS 5
+    #define FXAA_QUALITY_P0 1.0
+    #define FXAA_QUALITY_P1 1.5
+    #define FXAA_QUALITY_P2 2.0
+    #define FXAA_QUALITY_P3 2.0
+    #define FXAA_QUALITY_P4 8.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY_PRESET == 23)
+    #define FXAA_QUALITY_PS 6
+    #define FXAA_QUALITY_P0 1.0
+    #define FXAA_QUALITY_P1 1.5
+    #define FXAA_QUALITY_P2 2.0
+    #define FXAA_QUALITY_P3 2.0
+    #define FXAA_QUALITY_P4 2.0
+    #define FXAA_QUALITY_P5 8.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY_PRESET == 24)
+    #define FXAA_QUALITY_PS 7
+    #define FXAA_QUALITY_P0 1.0
+    #define FXAA_QUALITY_P1 1.5
+    #define FXAA_QUALITY_P2 2.0
+    #define FXAA_QUALITY_P3 2.0
+    #define FXAA_QUALITY_P4 2.0
+    #define FXAA_QUALITY_P5 3.0
+    #define FXAA_QUALITY_P6 8.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY_PRESET == 25)
+    #define FXAA_QUALITY_PS 8
+    #define FXAA_QUALITY_P0 1.0
+    #define FXAA_QUALITY_P1 1.5
+    #define FXAA_QUALITY_P2 2.0
+    #define FXAA_QUALITY_P3 2.0
+    #define FXAA_QUALITY_P4 2.0
+    #define FXAA_QUALITY_P5 2.0
+    #define FXAA_QUALITY_P6 4.0
+    #define FXAA_QUALITY_P7 8.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY_PRESET == 26)
+    #define FXAA_QUALITY_PS 9
+    #define FXAA_QUALITY_P0 1.0
+    #define FXAA_QUALITY_P1 1.5
+    #define FXAA_QUALITY_P2 2.0
+    #define FXAA_QUALITY_P3 2.0
+    #define FXAA_QUALITY_P4 2.0
+    #define FXAA_QUALITY_P5 2.0
+    #define FXAA_QUALITY_P6 2.0
+    #define FXAA_QUALITY_P7 4.0
+    #define FXAA_QUALITY_P8 8.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY_PRESET == 27)
+    #define FXAA_QUALITY_PS 10
+    #define FXAA_QUALITY_P0 1.0
+    #define FXAA_QUALITY_P1 1.5
+    #define FXAA_QUALITY_P2 2.0
+    #define FXAA_QUALITY_P3 2.0
+    #define FXAA_QUALITY_P4 2.0
+    #define FXAA_QUALITY_P5 2.0
+    #define FXAA_QUALITY_P6 2.0
+    #define FXAA_QUALITY_P7 2.0
+    #define FXAA_QUALITY_P8 4.0
+    #define FXAA_QUALITY_P9 8.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY_PRESET == 28)
+    #define FXAA_QUALITY_PS 11
+    #define FXAA_QUALITY_P0 1.0
+    #define FXAA_QUALITY_P1 1.5
+    #define FXAA_QUALITY_P2 2.0
+    #define FXAA_QUALITY_P3 2.0
+    #define FXAA_QUALITY_P4 2.0
+    #define FXAA_QUALITY_P5 2.0
+    #define FXAA_QUALITY_P6 2.0
+    #define FXAA_QUALITY_P7 2.0
+    #define FXAA_QUALITY_P8 2.0
+    #define FXAA_QUALITY_P9 4.0
+    #define FXAA_QUALITY_P10 8.0
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_QUALITY_PRESET == 29)
+    #define FXAA_QUALITY_PS 12
+    #define FXAA_QUALITY_P0 1.0
+    #define FXAA_QUALITY_P1 1.5
+    #define FXAA_QUALITY_P2 2.0
+    #define FXAA_QUALITY_P3 2.0
+    #define FXAA_QUALITY_P4 2.0
+    #define FXAA_QUALITY_P5 2.0
+    #define FXAA_QUALITY_P6 2.0
+    #define FXAA_QUALITY_P7 2.0
+    #define FXAA_QUALITY_P8 2.0
+    #define FXAA_QUALITY_P9 2.0
+    #define FXAA_QUALITY_P10 4.0
+    #define FXAA_QUALITY_P11 8.0
+#endif
+
+/*============================================================================
+                     FXAA QUALITY - EXTREME QUALITY
+============================================================================*/
+#if (FXAA_QUALITY_PRESET == 39)
+    #define FXAA_QUALITY_PS 12
+    #define FXAA_QUALITY_P0 1.0
+    #define FXAA_QUALITY_P1 1.0
+    #define FXAA_QUALITY_P2 1.0
+    #define FXAA_QUALITY_P3 1.0
+    #define FXAA_QUALITY_P4 1.0
+    #define FXAA_QUALITY_P5 1.5
+    #define FXAA_QUALITY_P6 2.0
+    #define FXAA_QUALITY_P7 2.0
+    #define FXAA_QUALITY_P8 2.0
+    #define FXAA_QUALITY_P9 2.0
+    #define FXAA_QUALITY_P10 4.0
+    #define FXAA_QUALITY_P11 8.0
+#endif
+
+
+
+/*============================================================================
+
+                                API PORTING
+
+============================================================================*/
+#if (FXAA_GLSL_120 == 1) || (FXAA_GLSL_130 == 1)
+    #define FxaaBool bool
+    #define FxaaDiscard discard
+    #define FxaaFloat float
+    #define FxaaFloat2 vec2
+    #define FxaaFloat3 vec3
+    #define FxaaFloat4 vec4
+    #define FxaaHalf float
+    #define FxaaHalf2 vec2
+    #define FxaaHalf3 vec3
+    #define FxaaHalf4 vec4
+    #define FxaaInt2 ivec2
+    #define FxaaSat(x) clamp(x, 0.0, 1.0)
+    #define FxaaTex sampler2D
+#else
+    #define FxaaBool bool
+    #define FxaaDiscard clip(-1)
+    #define FxaaFloat float
+    #define FxaaFloat2 float2
+    #define FxaaFloat3 float3
+    #define FxaaFloat4 float4
+    #define FxaaHalf half
+    #define FxaaHalf2 half2
+    #define FxaaHalf3 half3
+    #define FxaaHalf4 half4
+    #define FxaaSat(x) saturate(x)
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_GLSL_120 == 1)
+    // Requires,
+    //  #version 120
+    // And at least,
+    //  #extension GL_EXT_gpu_shader4 : enable
+    //  (or set FXAA_FAST_PIXEL_OFFSET 1 to work like DX9)
+    #define FxaaTexTop(t, p) texture2DLod(t, p, 0.0)
+    #if (FXAA_FAST_PIXEL_OFFSET == 1)
+        #define FxaaTexOff(t, p, o, r) texture2DLodOffset(t, p, 0.0, o)
+    #else
+        #define FxaaTexOff(t, p, o, r) texture2DLod(t, p + (o * r), 0.0)
+    #endif
+    #if (FXAA_GATHER4_ALPHA == 1)
+        // use #extension GL_ARB_gpu_shader5 : enable
+        #define FxaaTexAlpha4(t, p) textureGather(t, p, 3)
+        #define FxaaTexOffAlpha4(t, p, o) textureGatherOffset(t, p, o, 3)
+        #define FxaaTexGreen4(t, p) textureGather(t, p, 1)
+        #define FxaaTexOffGreen4(t, p, o) textureGatherOffset(t, p, o, 1)
+    #endif
+#endif
+/*--------------------------------------------------------------------------*/
+#if (FXAA_GLSL_130 == 1)
+    // Requires "#version 130" or better. Both fetch macros sample mip level 0; FxaaTexOff applies o as a texel offset and ignores r.
+    #define FxaaTexTop(t, p) textureLod(t, p, 0.0)
+    #define FxaaTexOff(t, p, o, r) textureLodOffset(t, p, 0.0, o)
+    #if (FXAA_GATHER4_ALPHA == 1)
+        // use #extension GL_ARB_gpu_shader5 : enable (gather component 3 = alpha, component 1 = green)
+        #define FxaaTexAlpha4(t, p) textureGather(t, p, 3)
+        #define FxaaTexOffAlpha4(t, p, o) textureGatherOffset(t, p, o, 3)
+        #define FxaaTexGreen4(t, p) textureGather(t, p, 1)
+        #define FxaaTexOffGreen4(t, p, o) textureGatherOffset(t, p, o, 1)
+    #endif
+#endif
+
+
+/*============================================================================
+                   GREEN AS LUMA OPTION SUPPORT FUNCTION
+============================================================================*/
+#if (FXAA_GREEN_AS_LUMA != 0)
+    FxaaFloat FxaaLuma(FxaaFloat4 rgba) { return rgba.g; } // green channel stands in for luma
+#else
+    FxaaFloat FxaaLuma(FxaaFloat4 rgba) { return rgba.a; } // luma was packed into the alpha channel
+#endif
+
+
+
+
+/*============================================================================
+
+                             FXAA3 QUALITY - PC
+
+============================================================================*/
+#if (FXAA_PC == 1)
+/*--------------------------------------------------------------------------*/
+FxaaFloat4 FxaaPixelShader(
+    //
+    // Use noperspective interpolation here (turn off perspective interpolation).
+    // {xy} = center of pixel
+    FxaaFloat2 pos,
+    //
+    // Used only for FXAA Console, and not used on the 360 version.
+    // Use noperspective interpolation here (turn off perspective interpolation).
+    // {xy_} = upper left of pixel
+    // {_zw} = lower right of pixel
+    FxaaFloat4 fxaaConsolePosPos,
+    //
+    // Input color texture.
+    // {rgb_} = color in linear or perceptual color space
+    // if (FXAA_GREEN_AS_LUMA == 0)
+    //     {__a} = luma in perceptual color space (not linear)
+    FxaaTex tex,
+    //
+    // Only used on the optimized 360 version of FXAA Console.
+    // For everything but 360, just use the same input here as for "tex".
+    // For 360, same texture, just alias with a 2nd sampler.
+    // This sampler needs to have an exponent bias of -1.
+    FxaaTex fxaaConsole360TexExpBiasNegOne,
+    //
+    // Only used on the optimized 360 version of FXAA Console.
+    // For everything but 360, just use the same input here as for "tex".
+    // For 360, same texture, just alias with a 3rd sampler.
+    // This sampler needs to have an exponent bias of -2.
+    FxaaTex fxaaConsole360TexExpBiasNegTwo,
+    //
+    // Only used on FXAA Quality.
+    // This must be from a constant/uniform.
+    // {x_} = 1.0/screenWidthInPixels
+    // {_y} = 1.0/screenHeightInPixels
+    FxaaFloat2 fxaaQualityRcpFrame,
+    //
+    // Only used on FXAA Console.
+    // This must be from a constant/uniform.
+    // This affects sub-pixel AA quality and inversely sharpness.
+    //   Where N ranges between,
+    //     N = 0.50 (default)
+    //     N = 0.33 (sharper)
+    // {x__} = -N/screenWidthInPixels  
+    // {_y_} = -N/screenHeightInPixels
+    // {_z_} =  N/screenWidthInPixels  
+    // {__w} =  N/screenHeightInPixels 
+    FxaaFloat4 fxaaConsoleRcpFrameOpt,
+    //
+    // Only used on FXAA Console.
+    // Not used on 360, but used on PS3 and PC.
+    // This must be from a constant/uniform.
+    // {x__} = -2.0/screenWidthInPixels  
+    // {_y_} = -2.0/screenHeightInPixels
+    // {_z_} =  2.0/screenWidthInPixels  
+    // {__w} =  2.0/screenHeightInPixels 
+    FxaaFloat4 fxaaConsoleRcpFrameOpt2,
+    //
+    // Only used on FXAA Console.
+    // Only used on 360 in place of fxaaConsoleRcpFrameOpt2.
+    // This must be from a constant/uniform.
+    // {x__} =  8.0/screenWidthInPixels  
+    // {_y_} =  8.0/screenHeightInPixels
+    // {_z_} = -4.0/screenWidthInPixels  
+    // {__w} = -4.0/screenHeightInPixels 
+    FxaaFloat4 fxaaConsole360RcpFrameOpt2,
+    //
+    // Only used on FXAA Quality.
+    // This used to be the FXAA_QUALITY_SUBPIX define.
+    // It is here now to allow easier tuning.
+    // Choose the amount of sub-pixel aliasing removal.
+    // This can affect sharpness.
+    //   1.00 - upper limit (softer)
+    //   0.75 - default amount of filtering
+    //   0.50 - lower limit (sharper, less sub-pixel aliasing removal)
+    //   0.25 - almost off
+    //   0.00 - completely off
+    FxaaFloat fxaaQualitySubpix,
+    //
+    // Only used on FXAA Quality.
+    // This used to be the FXAA_QUALITY_EDGE_THRESHOLD define.
+    // It is here now to allow easier tuning.
+    // The minimum amount of local contrast required to apply algorithm.
+    //   0.333 - too little (faster)
+    //   0.250 - low quality
+    //   0.166 - default
+    //   0.125 - high quality 
+    //   0.063 - overkill (slower)
+    FxaaFloat fxaaQualityEdgeThreshold,
+    //
+    // Only used on FXAA Quality.
+    // This used to be the FXAA_QUALITY_EDGE_THRESHOLD_MIN define.
+    // It is here now to allow easier tuning.
+    // Trims the algorithm from processing darks.
+    //   0.0833 - upper limit (default, the start of visible unfiltered edges)
+    //   0.0625 - high quality (faster)
+    //   0.0312 - visible limit (slower)
+    // Special notes when using FXAA_GREEN_AS_LUMA,
+    //   Likely want to set this to zero.
+    //   As colors that are mostly not-green
+    //   will appear very dark in the green channel!
+    //   Tune by looking at mostly non-green content,
+    //   then start at zero and increase until aliasing is a problem.
+    FxaaFloat fxaaQualityEdgeThresholdMin,
+    // 
+    // Only used on FXAA Console.
+    // This used to be the FXAA_CONSOLE_EDGE_SHARPNESS define.
+    // It is here now to allow easier tuning.
+    // This does not affect PS3, as this needs to be compiled in.
+    //   Use FXAA_CONSOLE_PS3_EDGE_SHARPNESS for PS3.
+    //   Due to the PS3 being ALU bound,
+    //   there are only three safe values here: 2 and 4 and 8.
+    //   These options use the shader's ability to do a free *|/ by 2|4|8.
+    // For all other platforms can be a non-power of two.
+    //   8.0 is sharper (default!!!)
+    //   4.0 is softer
+    //   2.0 is really soft (good only for vector graphics inputs)
+    FxaaFloat fxaaConsoleEdgeSharpness,
+    //
+    // Only used on FXAA Console.
+    // This used to be the FXAA_CONSOLE_EDGE_THRESHOLD define.
+    // It is here now to allow easier tuning.
+    // This does not affect PS3, as this needs to be compiled in.
+    //   Use FXAA_CONSOLE_PS3_EDGE_THRESHOLD for PS3.
+    //   Due to the PS3 being ALU bound,
+    //   there are only two safe values here: 1/4 and 1/8.
+    //   These options use the shader's ability to do a free *|/ by 2|4|8.
+    // The console setting has a different mapping than the quality setting.
+    // Other platforms can use other values.
+    //   0.125 leaves less aliasing, but is softer (default!!!)
+    //   0.25 leaves more aliasing, and is sharper
+    FxaaFloat fxaaConsoleEdgeThreshold,
+    //
+    // Only used on FXAA Console.
+    // This used to be the FXAA_CONSOLE_EDGE_THRESHOLD_MIN define.
+    // It is here now to allow easier tuning.
+    // Trims the algorithm from processing darks.
+    // The console setting has a different mapping than the quality setting.
+    // This only applies when FXAA_EARLY_EXIT is 1.
+    // This does not apply to PS3, 
+    // PS3 was simplified to avoid more shader instructions.
+    //   0.06 - faster but more aliasing in darks
+    //   0.05 - default
+    //   0.04 - slower and less aliasing in darks
+    // Special notes when using FXAA_GREEN_AS_LUMA,
+    //   Likely want to set this to zero.
+    //   As colors that are mostly not-green
+    //   will appear very dark in the green channel!
+    //   Tune by looking at mostly non-green content,
+    //   then start at zero and increase until aliasing is a problem.
+    FxaaFloat fxaaConsoleEdgeThresholdMin,
+    //    
+    // Extra constants for 360 FXAA Console only.
+    // Use zeros or anything else for other platforms.
+    // These must be in physical constant registers and NOT immediates.
+    // Immediates will result in compiler un-optimizing.
+    // {xyzw} = float4(1.0, -1.0, 0.25, -0.25)
+    FxaaFloat4 fxaaConsole360ConstDir
+) {
+/*--------------------------------------------------------------------------*/
+    FxaaFloat2 posM;
+    posM.x = pos.x;
+    posM.y = pos.y;
+    #if (FXAA_GATHER4_ALPHA == 1)
+        #if (FXAA_DISCARD == 0)
+            FxaaFloat4 rgbyM = FxaaTexTop(tex, posM);
+            #if (FXAA_GREEN_AS_LUMA == 0)
+                #define lumaM rgbyM.w
+            #else
+                #define lumaM rgbyM.y
+            #endif
+        #endif
+        #if (FXAA_GREEN_AS_LUMA == 0)
+            FxaaFloat4 luma4A = FxaaTexAlpha4(tex, posM);
+            FxaaFloat4 luma4B = FxaaTexOffAlpha4(tex, posM, FxaaInt2(-1, -1));
+        #else
+            FxaaFloat4 luma4A = FxaaTexGreen4(tex, posM);
+            FxaaFloat4 luma4B = FxaaTexOffGreen4(tex, posM, FxaaInt2(-1, -1));
+        #endif
+        #if (FXAA_DISCARD == 1)
+            #define lumaM luma4A.w
+        #endif
+        #define lumaE luma4A.z
+        #define lumaS luma4A.x
+        #define lumaSE luma4A.y
+        #define lumaNW luma4B.w
+        #define lumaN luma4B.z
+        #define lumaW luma4B.x
+    #else
+        FxaaFloat4 rgbyM = FxaaTexTop(tex, posM);
+        #if (FXAA_GREEN_AS_LUMA == 0)
+            #define lumaM rgbyM.w
+        #else
+            #define lumaM rgbyM.y
+        #endif
+        FxaaFloat lumaS = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 0, 1), fxaaQualityRcpFrame.xy));
+        FxaaFloat lumaE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 1, 0), fxaaQualityRcpFrame.xy));
+        FxaaFloat lumaN = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 0,-1), fxaaQualityRcpFrame.xy));
+        FxaaFloat lumaW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1, 0), fxaaQualityRcpFrame.xy));
+    #endif
+/*--------------------------------------------------------------------------*/
+    FxaaFloat maxSM = max(lumaS, lumaM);
+    FxaaFloat minSM = min(lumaS, lumaM);
+    FxaaFloat maxESM = max(lumaE, maxSM);
+    FxaaFloat minESM = min(lumaE, minSM);
+    FxaaFloat maxWN = max(lumaN, lumaW);
+    FxaaFloat minWN = min(lumaN, lumaW);
+    FxaaFloat rangeMax = max(maxWN, maxESM);
+    FxaaFloat rangeMin = min(minWN, minESM);
+    FxaaFloat rangeMaxScaled = rangeMax * fxaaQualityEdgeThreshold;
+    FxaaFloat range = rangeMax - rangeMin;
+    FxaaFloat rangeMaxClamped = max(fxaaQualityEdgeThresholdMin, rangeMaxScaled);
+    FxaaBool earlyExit = range < rangeMaxClamped;
+/*--------------------------------------------------------------------------*/
+    if(earlyExit)
+        #if (FXAA_DISCARD == 1)
+            FxaaDiscard;
+        #else
+            return rgbyM;
+        #endif
+/*--------------------------------------------------------------------------*/
+    #if (FXAA_GATHER4_ALPHA == 0)
+        FxaaFloat lumaNW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1,-1), fxaaQualityRcpFrame.xy));
+        FxaaFloat lumaSE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 1, 1), fxaaQualityRcpFrame.xy));
+        FxaaFloat lumaNE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 1,-1), fxaaQualityRcpFrame.xy));
+        FxaaFloat lumaSW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1, 1), fxaaQualityRcpFrame.xy));
+    #else
+        FxaaFloat lumaNE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(1, -1), fxaaQualityRcpFrame.xy));
+        FxaaFloat lumaSW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1, 1), fxaaQualityRcpFrame.xy));
+    #endif
+/*--------------------------------------------------------------------------*/
+    FxaaFloat lumaNS = lumaN + lumaS;
+    FxaaFloat lumaWE = lumaW + lumaE;
+    FxaaFloat subpixRcpRange = 1.0/range;
+    FxaaFloat subpixNSWE = lumaNS + lumaWE;
+    FxaaFloat edgeHorz1 = (-2.0 * lumaM) + lumaNS;
+    FxaaFloat edgeVert1 = (-2.0 * lumaM) + lumaWE;
+/*--------------------------------------------------------------------------*/
+    FxaaFloat lumaNESE = lumaNE + lumaSE;
+    FxaaFloat lumaNWNE = lumaNW + lumaNE;
+    FxaaFloat edgeHorz2 = (-2.0 * lumaE) + lumaNESE;
+    FxaaFloat edgeVert2 = (-2.0 * lumaN) + lumaNWNE;
+/*--------------------------------------------------------------------------*/
+    FxaaFloat lumaNWSW = lumaNW + lumaSW;
+    FxaaFloat lumaSWSE = lumaSW + lumaSE;
+    FxaaFloat edgeHorz4 = (abs(edgeHorz1) * 2.0) + abs(edgeHorz2);
+    FxaaFloat edgeVert4 = (abs(edgeVert1) * 2.0) + abs(edgeVert2);
+    FxaaFloat edgeHorz3 = (-2.0 * lumaW) + lumaNWSW;
+    FxaaFloat edgeVert3 = (-2.0 * lumaS) + lumaSWSE;
+    FxaaFloat edgeHorz = abs(edgeHorz3) + edgeHorz4;
+    FxaaFloat edgeVert = abs(edgeVert3) + edgeVert4;
+/*--------------------------------------------------------------------------*/
+    FxaaFloat subpixNWSWNESE = lumaNWSW + lumaNESE;
+    FxaaFloat lengthSign = fxaaQualityRcpFrame.x;
+    FxaaBool horzSpan = edgeHorz >= edgeVert;
+    FxaaFloat subpixA = subpixNSWE * 2.0 + subpixNWSWNESE;
+/*--------------------------------------------------------------------------*/
+    if(!horzSpan) lumaN = lumaW;
+    if(!horzSpan) lumaS = lumaE;
+    if(horzSpan) lengthSign = fxaaQualityRcpFrame.y;
+    FxaaFloat subpixB = (subpixA * (1.0/12.0)) - lumaM;
+/*--------------------------------------------------------------------------*/
+    FxaaFloat gradientN = lumaN - lumaM;
+    FxaaFloat gradientS = lumaS - lumaM;
+    FxaaFloat lumaNN = lumaN + lumaM;
+    FxaaFloat lumaSS = lumaS + lumaM;
+    FxaaBool pairN = abs(gradientN) >= abs(gradientS);
+    FxaaFloat gradient = max(abs(gradientN), abs(gradientS));
+    if(pairN) lengthSign = -lengthSign;
+    FxaaFloat subpixC = FxaaSat(abs(subpixB) * subpixRcpRange);
+/*--------------------------------------------------------------------------*/
+    FxaaFloat2 posB;
+    posB.x = posM.x;
+    posB.y = posM.y;
+    FxaaFloat2 offNP;
+    offNP.x = (!horzSpan) ? 0.0 : fxaaQualityRcpFrame.x;
+    offNP.y = ( horzSpan) ? 0.0 : fxaaQualityRcpFrame.y;
+    if(!horzSpan) posB.x += lengthSign * 0.5;
+    if( horzSpan) posB.y += lengthSign * 0.5;
+/*--------------------------------------------------------------------------*/
+    FxaaFloat2 posN;
+    posN.x = posB.x - offNP.x * FXAA_QUALITY_P0;
+    posN.y = posB.y - offNP.y * FXAA_QUALITY_P0;
+    FxaaFloat2 posP;
+    posP.x = posB.x + offNP.x * FXAA_QUALITY_P0;
+    posP.y = posB.y + offNP.y * FXAA_QUALITY_P0;
+    FxaaFloat subpixD = ((-2.0)*subpixC) + 3.0;
+    FxaaFloat lumaEndN = FxaaLuma(FxaaTexTop(tex, posN));
+    FxaaFloat subpixE = subpixC * subpixC;
+    FxaaFloat lumaEndP = FxaaLuma(FxaaTexTop(tex, posP));
+/*--------------------------------------------------------------------------*/
+    if(!pairN) lumaNN = lumaSS;
+    FxaaFloat gradientScaled = gradient * 1.0/4.0;
+    FxaaFloat lumaMM = lumaM - lumaNN * 0.5;
+    FxaaFloat subpixF = subpixD * subpixE;
+    FxaaBool lumaMLTZero = lumaMM < 0.0;
+/*--------------------------------------------------------------------------*/
+    lumaEndN -= lumaNN * 0.5;
+    lumaEndP -= lumaNN * 0.5;
+    FxaaBool doneN = abs(lumaEndN) >= gradientScaled;
+    FxaaBool doneP = abs(lumaEndP) >= gradientScaled;
+    if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P1;
+    if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P1;
+    FxaaBool doneNP = (!doneN) || (!doneP);
+    if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P1;
+    if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P1;
+/*--------------------------------------------------------------------------*/
+    if(doneNP) {
+        if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+        if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+        if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+        if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+        doneN = abs(lumaEndN) >= gradientScaled;
+        doneP = abs(lumaEndP) >= gradientScaled;
+        if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P2;
+        if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P2;
+        doneNP = (!doneN) || (!doneP);
+        if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P2;
+        if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P2;
+/*--------------------------------------------------------------------------*/
+        #if (FXAA_QUALITY_PS > 3)
+        if(doneNP) {
+            if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+            if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+            if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+            if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+            doneN = abs(lumaEndN) >= gradientScaled;
+            doneP = abs(lumaEndP) >= gradientScaled;
+            if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P3;
+            if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P3;
+            doneNP = (!doneN) || (!doneP);
+            if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P3;
+            if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P3;
+/*--------------------------------------------------------------------------*/
+            #if (FXAA_QUALITY_PS > 4)
+            if(doneNP) {
+                if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+                if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+                if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+                if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+                doneN = abs(lumaEndN) >= gradientScaled;
+                doneP = abs(lumaEndP) >= gradientScaled;
+                if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P4;
+                if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P4;
+                doneNP = (!doneN) || (!doneP);
+                if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P4;
+                if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P4;
+/*--------------------------------------------------------------------------*/
+                #if (FXAA_QUALITY_PS > 5)
+                if(doneNP) {
+                    if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+                    if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+                    if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+                    if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+                    doneN = abs(lumaEndN) >= gradientScaled;
+                    doneP = abs(lumaEndP) >= gradientScaled;
+                    if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P5;
+                    if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P5;
+                    doneNP = (!doneN) || (!doneP);
+                    if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P5;
+                    if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P5;
+/*--------------------------------------------------------------------------*/
+                    #if (FXAA_QUALITY_PS > 6)
+                    if(doneNP) {
+                        if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+                        if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+                        if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+                        if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+                        doneN = abs(lumaEndN) >= gradientScaled;
+                        doneP = abs(lumaEndP) >= gradientScaled;
+                        if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P6;
+                        if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P6;
+                        doneNP = (!doneN) || (!doneP);
+                        if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P6;
+                        if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P6;
+/*--------------------------------------------------------------------------*/
+                        #if (FXAA_QUALITY_PS > 7)
+                        if(doneNP) {
+                            if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+                            if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+                            if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+                            if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+                            doneN = abs(lumaEndN) >= gradientScaled;
+                            doneP = abs(lumaEndP) >= gradientScaled;
+                            if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P7;
+                            if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P7;
+                            doneNP = (!doneN) || (!doneP);
+                            if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P7;
+                            if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P7;
+/*--------------------------------------------------------------------------*/
+    #if (FXAA_QUALITY_PS > 8)
+    if(doneNP) {
+        if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+        if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+        if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+        if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+        doneN = abs(lumaEndN) >= gradientScaled;
+        doneP = abs(lumaEndP) >= gradientScaled;
+        if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P8;
+        if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P8;
+        doneNP = (!doneN) || (!doneP);
+        if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P8;
+        if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P8;
+/*--------------------------------------------------------------------------*/
+        #if (FXAA_QUALITY_PS > 9)
+        if(doneNP) {
+            if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+            if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+            if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+            if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+            doneN = abs(lumaEndN) >= gradientScaled;
+            doneP = abs(lumaEndP) >= gradientScaled;
+            if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P9;
+            if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P9;
+            doneNP = (!doneN) || (!doneP);
+            if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P9;
+            if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P9;
+/*--------------------------------------------------------------------------*/
+            #if (FXAA_QUALITY_PS > 10)
+            if(doneNP) {
+                if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+                if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+                if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+                if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+                doneN = abs(lumaEndN) >= gradientScaled;
+                doneP = abs(lumaEndP) >= gradientScaled;
+                if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P10;
+                if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P10;
+                doneNP = (!doneN) || (!doneP);
+                if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P10;
+                if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P10;
+/*--------------------------------------------------------------------------*/
+                #if (FXAA_QUALITY_PS > 11)
+                if(doneNP) {
+                    if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+                    if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+                    if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+                    if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+                    doneN = abs(lumaEndN) >= gradientScaled;
+                    doneP = abs(lumaEndP) >= gradientScaled;
+                    if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P11;
+                    if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P11;
+                    doneNP = (!doneN) || (!doneP);
+                    if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P11;
+                    if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P11;
+/*--------------------------------------------------------------------------*/
+                    #if (FXAA_QUALITY_PS > 12)
+                    if(doneNP) {
+                        if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy));
+                        if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy));
+                        if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5;
+                        if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5;
+                        doneN = abs(lumaEndN) >= gradientScaled;
+                        doneP = abs(lumaEndP) >= gradientScaled;
+                        if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P12;
+                        if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P12;
+                        doneNP = (!doneN) || (!doneP);
+                        if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P12;
+                        if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P12;
+/*--------------------------------------------------------------------------*/
+                    }
+                    #endif
+/*--------------------------------------------------------------------------*/
+                }
+                #endif
+/*--------------------------------------------------------------------------*/
+            }
+            #endif
+/*--------------------------------------------------------------------------*/
+        }
+        #endif
+/*--------------------------------------------------------------------------*/
+    }
+    #endif
+/*--------------------------------------------------------------------------*/
+                        }
+                        #endif
+/*--------------------------------------------------------------------------*/
+                    }
+                    #endif
+/*--------------------------------------------------------------------------*/
+                }
+                #endif
+/*--------------------------------------------------------------------------*/
+            }
+            #endif
+/*--------------------------------------------------------------------------*/
+        }
+        #endif
+/*--------------------------------------------------------------------------*/
+    }
+/*--------------------------------------------------------------------------*/
+    FxaaFloat dstN = posM.x - posN.x;
+    FxaaFloat dstP = posP.x - posM.x;
+    if(!horzSpan) dstN = posM.y - posN.y;
+    if(!horzSpan) dstP = posP.y - posM.y;
+/*--------------------------------------------------------------------------*/
+    FxaaBool goodSpanN = (lumaEndN < 0.0) != lumaMLTZero;
+    FxaaFloat spanLength = (dstP + dstN);
+    FxaaBool goodSpanP = (lumaEndP < 0.0) != lumaMLTZero;
+    FxaaFloat spanLengthRcp = 1.0/spanLength;
+/*--------------------------------------------------------------------------*/
+    FxaaBool directionN = dstN < dstP;
+    FxaaFloat dst = min(dstN, dstP);
+    FxaaBool goodSpan = directionN ? goodSpanN : goodSpanP;
+    FxaaFloat subpixG = subpixF * subpixF;
+    FxaaFloat pixelOffset = (dst * (-spanLengthRcp)) + 0.5;
+    FxaaFloat subpixH = subpixG * fxaaQualitySubpix;
+/*--------------------------------------------------------------------------*/
+    FxaaFloat pixelOffsetGood = goodSpan ? pixelOffset : 0.0;
+    FxaaFloat pixelOffsetSubpix = max(pixelOffsetGood, subpixH);
+    if(!horzSpan) posM.x += pixelOffsetSubpix * lengthSign;
+    if( horzSpan) posM.y += pixelOffsetSubpix * lengthSign;
+    #if (FXAA_DISCARD == 1)
+        return FxaaTexTop(tex, posM);
+    #else
+        return FxaaFloat4(FxaaTexTop(tex, posM).xyz, lumaM);
+    #endif
+}
+/*==========================================================================*/
+#endif
+
+vec4 mainImage(vec2 fragCoord)
+{
+    // FXAA quality tuning values; every other shader input below is a zeroed placeholder.
+    float subpixAmount = 0.75;        // [0..1], default 0.75
+    float edgeThreshold = 0.166;      // [0.125..0.33], default 0.166
+    float edgeThresholdFloor = 0.02;  // original note: 0.0625?
+    float zeroScalar = 0.0;
+    vec4 zeroVec = vec4(0.0);
+
+    // NOTE(review): invResolution_data is used as a divisor here, so it presumably holds the size in pixels despite its name - confirm.
+    vec2 texelStep = 1.0 / invResolution_data.xy;
+    vec2 normalizedPos = fragCoord / invResolution_data.xy;
+
+    vec4 filtered = FxaaPixelShader(
+        normalizedPos, zeroVec,
+        inputImage, inputImage, inputImage,
+        texelStep, zeroVec, zeroVec, zeroVec,
+        subpixAmount, edgeThreshold, edgeThresholdFloor,
+        zeroScalar, zeroScalar, zeroScalar, zeroVec);
+
+    return vec4(filtered.xyz, 1.0);  // keep only the RGB of the FXAA result; alpha is forced to 1
+}
+
+void main()
+{
+    // Each invocation filters its own 4x4 tile of output texels.
+    ivec2 tileOrigin = ivec2(gl_GlobalInvocationID.xy) * 4;
+    for (int texel = 0; texel < 16; texel++)
+    {
+        // x offset advances every four steps, y offset every step (same order as a nested x-then-y loop).
+        ivec2 texelCoord = tileOrigin + ivec2(texel / 4, texel % 4);
+        // Run FXAA at the texel centre (+0.5) and store the result at that texel.
+        vec4 outColor = mainImage(vec2(texelCoord) + vec2(0.5));
+        imageStore(imgOutput, texelCoord, outColor);
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/Fxaa.spv b/Ryujinx.Graphics.Vulkan/Effects/Shaders/Fxaa.spv
new file mode 100644
index 0000000000000000000000000000000000000000..b466bcb659d56d910d30aac5d781d27b8ff1313f
GIT binary patch
literal 25012
zcmaK!34m5r{r+Es0mKdWeL-<cGxsGo5K%Ei6cRH_M;LHyU~mQ$Op|d-%QCafEX%T7
z(A>&>gWNJTx6DjaE6d6@Tilo5=kvbz84tJrzuTP7^E~H!zRS7iz31gJIB=<rhBO8a
zXbfu%ZEQTK(Y#h{EY%nQTBb3$+3!1k()g`<=S<sb`yIB{VEM+trkyq`VADqL)S2y_
zvmE#{!^6?lR-_HDn)U<v-%|8fL)x$?=e03s8OHXR*WNyU?#%Yhj+?&yu5j&!(f7<a
zs+|tLYGWk5b`v}2b#zTSx_w$l59ZtM>XhDXx9M-u)NRwZssFTsw2{{;_?+F{i~k`+
zXf&qde@yqR-tMlBiQT>YG-D5JtVTblV;*+H8mlj9x806gPwtpEcHZp0I@^0Ebj&z#
zR)_fxYpn6V?Is`Jt=*8unw8x_bKAQ*=gdEN>g;`b+Gln!Hng$Ul0Gta;Osfh9oks?
z|2w`d<4ZTzS#q3Q3m4nA;;*}8{7G|<n%y}M&H3vsIW~6MjE>1i_jL3g-Q6{fp{C9H
z|HI}0wugQ;sBEgW)em>DXZEa)UYEFOGvdE({wvmo|HGygYiMJmC2gqLsB`x2>@m*Z
z#%7GWZ^zASpV86WLF#RdO`X@%K4VOGcaO!n2i^Saj|1qMd}n(0<a`Ckcc<@S-)=v;
z#m9v2)#CfZXSUCq6#I$t&RPBZkk<HE1^pe~8t><Be&$b^K=1gB)_BF+8Z+TN?XzZd
z9MJA9!=|lqY`@m}Y+Cc&evM?Nws&<<p#J7ZZ_XXuI4K-|W4hXB&c;xmr=oYX_jJu4
zJFk<i*PMGcy|qsm+v4ZKXScI;tMT)y@kvu!;}_J%FRaEVOlpl^T=6MQe}fyB!h4&W
zkv)5`MQYd9SlF*mnAEauYg~=i!KT`;yXS;$z}nqVX@_+5%!zgzS}W&CL}}SAiZ;bc
zTH1Z|@yExc>F=RR(_gg5(VG6ISY^xZsY;tP<^34XqQ&3#mEDV#roU(}7k1lKcCQq6
z(OxU;fX)2gF6>$w&+-9T!wF<KVXPUsHf`u#9kXW4IeJp(44&X>L(z_A{U*(BZ$8uA
zAItX7AC1MJ#>jrYS98u5@EJYr(>gn5&1vn?w#GJv#;tA6+X=0?P8jKD_li%d?)@0}
z?Do!{_!w9CXw@>PF&VA3AI3C&9Oj(vUOs6ppY8DO>C-3p88gONt#Ms8`2{WRaFbsK
zZ~5CB-qyGZZjNJTP3xa?t#gV!eSQC&vCTO*zCWkm@2yMB8QfS5&l;7Vz5A-U2UMT^
zhv2Pz_G$WlG~?qZAJ);+jW($94BE8rSskr)8q|29;;H%Vj3a2jAqQ`n{}wgApvKRt
z@#kv%r5fLkbAMWI^VT=x?b71aTC_C|g3svgp5~s)y*&i27e2nZ|ID+kaadv3s%3EF
zh$ZZJ1~l(mTjMD7X`MYCQ|EMc>(q06x?1m?rmtfPU%A)a`f5H)_HEkq!a3zSx(=K^
zy|=@8JPZ46_Ji{*T+(J=%0M5|=vz{s^!|C-*0_wZo*dxKd<Qo!N9&w*+`%1utC(xf
z=z}@C)xDM&J7QY<oc2~t3#)PKyAC|}IM4JUjT^y!PpQ5y6!DjC+`QzN=j?x-p|}1I
zo7NdRsBt?s)8@{cIiClUYjbBf4^s2F3qG~G3*6RN1n!(Qd+wa(f!eZPj8^WuyKDU3
z8o$5BAE@yMYy9CFf277AtMMml{K*=Bs>YvQg10rEg}1&NbanGJqPYeyp>=gn1rKe0
z|DDlw#J*kKN40m2n{{0G)OPRmxV<ae*RbuZFw5CEriqye=6hvFSM{aC^;`+9d+wYu
z-K|h4gBq)&y}zDqjkV#OGiMw)cMj{0FL!MA(?)QweYsC+d}P6c8t$hJn<MY@5%7$A
zA002b<2+KD^ZkzRcXhv0>dzdsNABm^dpSOmaqaznyE0z&I|6I%#jAk^({Dgq9s42l
za;BjTVpSVWO??D>X?n-iwO_8{`qF-RdTrI$#CipK?FZ1xD|~I@4`)C-=dV~9smWKO
zPfjDKVC9M%#~idzi>7XK?*FSfW-Xi0>)Uy&!aqR20nKZRif;in*Kj_1%}Z`w{yE9Z
z`J?DJpv7K3y5g?YK43N1%K74bY434u$=-2%ze-b2Ouy7PXBGaxkKT1Th2GrdnR^b{
zc+QP>e#O<3^JU;hW5J%`*ED%!QQ|FzyFW7id~1AE_$C7yjVl)}+&%mRI9I>Fg-t%H
z-~OH^U(nBAZt_L__CpNT7&Y28x(MvLDy_%=``K|XS@RWO*FkO9iu-v|GnY~HSHiut
zzp5H@esaC3IX<dC_AT(2-r9VRjNb+y_S7Z2ZOQ7ozIVXA$4j=>K3&xG)r$28u=d_V
z>-Zzw{iKPx{0U&$>gF!@v#jR$%N2Hxx!C^=EuX_z!TKLnjoZIljeD=HaaD}H7dAh8
zYr<ViW0aTc6~Ai<xU@Adxz7}@<gpRl&*PSOGk>2cYBoO~I}$%Worg)|b(A)GeY!S2
zr_En(Ek27&ZbISO`%DgRewGXF=hbJi<9;J{?fyb@J^g&{jx*Qqua$lpHuAs2)m+yT
zu^EN-53qXtyaLvbb;Zv=;p*}8FR)ta=ihMkiNtab{0FRW^*vY->sJ$7yZiJF*64M5
z*UJ7?a(a{gEt;3(Z&%~?<}(WY9k4#GBUT^$J+PYl;Z1DhZ-{C778{qL{<s6cj@cV`
zApIbkmvP&war?v_>^LoPhk(@*SH2WYU-}liuHNsZ!LC8>&nR3CLsO6cWx&?8fH{sY
z3)e<HewG94C-+CY<<ZpRX9ci%#?NrDHtM-QD}wd!m}}#HQ`3j{Z)LDrH{8$GDqyuf
za+9wHR`WCGeD|Z;E{wY`*8p!uukN_tC)D)sXKfvtpR>eU7i>Ja^Vb7AXDVD<$JM+y
z-Y3`BUi(qluMY+`Qu792wb97A_iE11z4t!J-EaD{kG~O&mG@#pxLW)<rsn+YVfWiR
zJZ*NS`AqQHvkcA4?+rV-&h*1*##@%&`|26JWu@o7I`@N>{w}uS_;r0o(!BgWpkJQ}
zt5nz+zDHE|yMgZwgI6!`8Wnb~-;a`?@B4z+F0kLZ)XmrTb-})$3-&!+@CF4QQQ!?L
zY@Qp@T#JqAeKwlQ)--FBYvcQ_lKZ}^<YNkMZu{5x#2WW~SLSPfc#XH$_>3B#S>wmn
zxbL}2|GwWUx$n2aPXV7=aP#q<R>rOW+=4rPUcnvromT7}_nlVBFRt-RYkXmiUtQzA
z!z%sVR^y9m{Jt9ZomDyi@f!DCRXP4_jlWpqFW2}hHST+<(%;)PK48F-<GzQAz4ya+
zP~onp@1IIOvc`S?RF3=ZspP(QD!K2S!j0#9r*Px>-l^oia|-wKaafJF7u@;NYJ6tF
zo$ouT(x307!kzCssgnCnD%}0%`>1g3eIHeF-$#XOe`~?DUtHt9gNnW5zJm&PeS8O1
za^F9d{Dm6#{Zqz`@B62c`~Ioq3u@f=PvyAppGxlgr*QM#j^}>KcPaQOj2~2R?GGuq
z<A)X8ao;(`pX0u33U}Q1OyQ0nQ*g(9&y;a_Z;j8d@sn!YcT3LC*|aU#vt=8a=c8xS
zb~N?a@QSAS+}6hPR<2FPcL3|7Jl}SNtH)+1u+M&N^dZ;An127>h2~i%_dCdkY1$u3
zw&uGVO&j&4X#SZ+Kb{|s#m^q#($Ai7{TxYU`q_)7jr!;&9?0C59v^$dOCP?pZ}}M7
z^f3;ujk-SEhx^gYXAHgLsrw^f{g>-I9<Cmn1Hjf%uHOW>`m#jyo*YPH>o>o5Ip1;P
zc^*0zUk8Ef@eW2)kIf`-J>FzA^~C!q*m$1Z&Uf5+6X_j`uS3Dco6_uC`{!femfqrr
z!Ob(iKMqbU+B&Z9L+Blg?@xg1zCVf9(p$cdfSXf%9|=z0+B&Z9!|9Kt#rIKQ?^T)i
zRJeL<rd2lOz3V_zPrT{i)Z%={jn_`^SbQA~uE*;{Q;*Ftl}#D%Q)udm*9A^}&Uf5+
zGw2<254&Gx(aa&d8|?bV&unn=*4A<T`2EMR%<BQ`JG>X{TE@>DaPraCas3=i?^xy?
z2iAA^@!<7nr&r&l=Yh3Rcm7;@wb-8kF6Vz5Zj4i_`6q(4QFs1)dbP~|40!#5pA0re
z>R15IdT8soeomraK#QN#!1@k99jwjG<k3fe22C4v=bu_>+B^PPn%ZlOokj2EpKI06
zq^bFxS?ru4^q-^cO!LnQV#l>}d^r6VXscCxIr=Zswx%7*lgasCqG_W(jJ7N79Gdx^
zOYgXUwsYTnnYIi~-SM+4EwR1=FJpZbUdH+wTpRVoIv=dB3+NqBtgnNOrSAB7^lJG$
zd;?tn%zP6~JvJACcc*RB>|1;PTVVCRs-|y)ou_V`3+dJ3<2zvEls>)-SC7plVB<__
z_N_SI1FI*__rcCnH_pZMYObAYdpXU$b7qy-72wM%UHePHYO%i(Y_8!~fiI>d-Veaq
zs5^fly;^d<7VN&+f)*dwfz|W5yB>TEO+7Y01RFPTegxJ|-8fg%t0m5j;Ch^!(9{#>
zX0UP86XzE2#k9n^6|9}Qac-bji~a3j_haJT(HvW{zB|#><MS@C^VIcu8@*b5{21)L
zUrbA!yTR(=_kfKXpZ9{bQP<}pdbRkxA6(xnKS5K^dOQGjJ=7EDr(p9-oCm?$sT=1$
zdbPxP7;Nt4p8OeHJ#iiZ8%I5H9tAI=CC+1D?bMC)5WQOBJOQq+(a+J;<MT=Id9?Wa
z1y~z(eLhaF7W-d<wNH$vn`2Af%U_|X$LBL(=c()SDSEZ|{59CUxh$Wy!Sv732G9o4
z9Lt)#0CwzoTIT)+?D}TzZ^4=CSaSFs_ywBd=I|`NTKxPT?7oekKY-)MvE=X)*s(H)
zKf+!6%>5HMa~(?#e+E0>adUW)UM*|+7qIth8(Q-ED_A}8{|0tnsK@5-VDD4n`~$3=
zx^Z5nS4*6Kg5Bq3oPWX96X)Mx<ESUjf56_m#Ca90ow{*ep;wFj>tJJNUETm^T^x(u
zn_%xp{JsU&R$afZ(W}M&9k8+D_g!%OIu^V4z^-xpYNV~Ye&427i~Uky*D-t`{5E*z
z4}xo>?tI@_sEy`aSP8qq;5F&Xbsd7Hu6^^J1~~qg23uSBFt|D0iN8MjW#HPVJKuL8
zYO!Aqyb!zGv*qDx<=$KYZl0-OI9NM%efl0nO`m=ES`loV%PYPTSk3P}tI&J-y+?gz
znwsBh#94<`!S#K-8k%}+R<CTP;Lkkd+W77^KJAm&8sIXoHQ{RJwKlz%d8w~OQ!_7d
zVy**r-y1V)xGtJ{Y}Tu6yocr`*T%f!(>{5v4>m9J7(wr49_ky=)XYPi7#o7!CtJ{x
z$3|%C#&n;^jhXrOvE3N#ean5{6s)!hEipC&?_7(qIhuODdw&4zTy^7)q*sfNEy3Q$
za=ku?rXHKE!1XvELQ_wit-;PyH;(V-)Z$}XaQ%6-9h!P<wg=ba?0}}8I6H!!r*0hI
z`KiUn&ft1IyP&DZ=EL&Zy6uXl-pA+C^JzD*^VE&wyF|74*aKXzXHRVln^E9;oY83N
ziL)2jdFsaT9i>|K!QNokGS9)WV1C$ryypPKZXd8~5xafC{50(x%kySGus$3&-WYnd
z_}L#^#`_4IpQazjVmBUK#yg;}b1Z*$n^5VS;X_~A8Fw7LWBTi3>_D)&_!&Eh-pkLJ
z`b3(VpD}TAI2hbUI~hCg*(9)<{?+AvzB1Q7b3Y0mNy~a20_LY#r(?030=^VIpTk4p
z+Nv+hpW%%AF)%;P&${Egf-j?)$7HzU**6~tZ_QYFejJXb9-B{q52NMzB-bY6p9Jfp
ze6}2crXHIk!TQj~JmlJBy^jJ9rR5Bm3RZLdr_+16{^}jH88p{ltiNgW*6I3c@0d1;
ze>Av$PIjWHXTH3gub-^@F<@iF->1Osw5*9-o7B?<wjT5EruQ;`^;xvpG;<Uie<pqM
z*WNL067N`W{rTO4rk?q7^Ur+y<lhT6M*PhIn}6<yT$|)S7hK+t<KSx6Gmk#?s2@*L
zGiPyP&Ih~ivsXV2Ry%=~JWd2>Z)M&|XzG2{v+pxtHRosE$>7Yhez|tmpIq!y&jN7n
z$0^|Cuii&fGk<aNI2Byxe;Qn^%>Q(_d6awq47hso|14O|`N{uGaORo6Ts!kmF7~ni
zTxDN=PCt*P9-Fhl*^{Z^3ux-O2VVrM`8>!Ob`D%$j>XrPz-Q62zvbFx{9JJIPpmJa
zsmJC#aQ0YizJjKne7*`+EAu%Yt}n;p>uX^1$$ph<m+`NI&8J+S3*qXq`3AVYKHo%B
zPd*oc)y$`_+JE1I>&vnD`Zo9iTGmIdUB)j4*VpGeXzH=~F1Wrvm!PR9pYMUy%6z^L
z*Oz1Qbt%|<vOaR{vhJ6IC)48h3b23HQtzwkyb`RwkmjDej9yJ2!?FDVcr|+Ya`acj
zy(js5mutY<s4vC1&olj83wJE<@~#6f!<f3`SJA5_*B^q-HT*{vPwqFswNZEe_4I1V
z^Coba=gn}}CVAch)<(U|^H#WH$@4a_d8!-dMtU{l^r7Dl9z)BSbq83@XXIV<UOw~H
z@1&{uj1i}{MPi!%ccPd342Z3LY<~<MN6R|g4OVk+<UQfNaP@m=`dmz}mVI(Rxc(jR
zCur)4^8naf66dF2wG(K0W;_UXo@2(jk6tZ49tOLA!hZ(#KE>xFU~SZ${}8=e>>mT`
zGw;eD2diaIJqlKf{S)9a=byvX%sDnsg57JW;TK>vZOy?kwb(xeE`9zIu2%Yd8t%S`
z&tHMnwAH6$YVr9j*f{x~_iHde>=NEb0AlwX*n5}n$<KqeRoCw`^lGvH4cJ)m`&%$S
zO}~!C?swqQ?~8D4)%E)Vy_)uY%>O;uSlQ!$0IRvj|48rU9#?;frsf_OXV3i!+)gW>
z&3}fgXFvP}toAZ3^Zp7xi&oD28(cl>{dcgM^E2-s;LNioxpsc8lAnEQd<9(A_)oZ+
zHU67EHLCxMre=-e#QYD~8h!p|tzU(!r^eU7YGsYD!_Bpv_Xb=&HNFW}bAIBz1<pKc
zlsE0jEBV={#&^JFjqk$MGX5Ued!PG^n}*t@oDF@{Gyv`#b>qDaR?GaQz~%gba5dMk
zjo!<BqCSYG<{FAq(_nCYpA11$&l(N|tCefGH2ebcDfh`RH1)pfY+DAb=KRE47Myw3
zB-hS;lKkvb<8t7##^vE^);OF#HL9;bQ?o{KVy+0T?~|3#)KlZiV70QwRp91Y?vqu~
z)KlYXU^V9_-s<4Yvqrgg)|mY4Q{x(7@2xeh1x`)sYtq!LNt`)rgUh|O4qQzi>(R%D
z`nohVeTenpcZ>DG<(a(!-2I(1djwn?b$<HK0e`2*v7Ff(fjzU;t=DI*n)N2GeR3ZO
zF7N5aa5Zz^l-|pItG)?M&D_PweKT;G`{r=hJh^`Wu8lfB{ki-5M~)@;Ey3olZtgzM
z)yzF{?UVZl!R5Md1y?iot?9kYUHwBeHFFp1!*>PSfL){TZQ<paw;f#V1Z?s=+8*va
z$Fjz1=9w7w$#VyAxyC!f)y#8edN1=---)JXp5o-S3)uAt|1iAFb62=pndfeB=Q)=3
zR5Q=Su#eB(!LCKl&^_Q#GQ*~@+Y`K)ft<~w;M%Gi$9tj{zoWtC9>06R<JYm+jRBW_
z_l9e$u3y(sP5VA_8VfG(|2}Xv*L6R7FV|IlUz(chD$es^9Jv0>-5*Uo&!dlk)qKaF
zdE?<ZW0U^@XzJM~6ToWD&%6V{nP*LM?Yt++&ptIy1eY})1Xr`hN%UUUsD3a_%^Jmt
zIT>7k=6)1SJvANzRx4|q0?!$n8V^NNPmLb~t2saM4g+VNHOjTK#^h(88b1y$YdjpT
zmhn%3%g@gz;c7XXj{rMI-FQB~)iVD`a5=vnuI3s}rOz6wA4OAh4aKQx8o0hsI?&X!
zhSR}n<r>a_=Zwu79*w5nSDnqBU^V9_-Z9|JvnIKAu3_@CPmP}fmo;|5)vR$AeQH#n
zNmH{%abk9Z>-%IjntEzH7OYm**aOcQn;Lu3)KlXeu$uD|Z!S3VtWmC=H6}m%)OZ{?
zXY)L8YEnO*re;mz%$W}^_tpt;HP_`t`uI@)G)+w(Vtsfvp9C(?x6i=c-#On-hHInF
zPyhM00ODBAw?44vo4WP-tW~q##I;ZEr+~|QdMaGa+)t<X^0Tjg8cogI#mW5)aGCpO
z;jVddKNGHvIzRonp9OI&xql99?&{|5^IXl`6W2bue;!<}``K_cbN?c}m$|Edfu?5e
zVtsfvp93z><}bm^Gw)ouT6yMu8SXsCvc_uWnHcuT^E`05#$SP}ndkZRUgoL(RhpW4
zij(Kpz~$L|0lduf>u|L)&kNzsb1dtrW}b;*pBUc&muK@g;oJf;<NaM>cM*6o138<&
z1=m*HINlSr`29AxJex0u$FF0t`wqDD`(3!U>iTsJ)wJ&;r%S-){r?_Z&2_z$-ph4W
z|2|F4brt9Na2dG%%)J~<J<p>nz-sppKl2vCbH*nBE78<*Ze0adbAIOi0GxT&B-hS+
zlKkvb<JI7@#%the)_5I#YE-|Lre=-e#JnC{f9C!WO+7XK2&`7tcmq6VY-+p_O+7W<
z1Xgo?;@u3+JZqF|XN}3vJ~iF~E^E9Mu9orJz~$%XcDP#3<~zX7Q8%8?Z?(+76I{-}
z3$ErGE~d{KsxP9cxrXA@^kZ;+pWKb6o;AD&tX8h!z3`l|S;PC#)cdNl`F^mP^AqnU
z;LNioxpuB$^0QBk4}i-We+pN##)s%rqxyq1HER?n=ELCnKKU7%dTM+GtX9_eC_HCu
zYJ3b$JvBZKR&##hJps-<Ym{qejmghGHU1o&v-ua`)TI6-P0gCbne!C5+*`kdtGO<}
zqK^;tr)g^X5bMLU`5AC|zC8<ff9HJrHC!8Ye)`Y1=OB*de0v`3`KE5YK5NyiH*xKg
z`wQUmp8iHoGxy)od-;4*|1C|;+{MZLMR1w>@8Pa_a{mKd8+CsAbAJiqSaSa(*xc34
z-RHTQxhJlDa{m*!T=zf2)y(}b^j_w!{xVI?+{OCvZ2l{_Je&UpFVDQc!_~?&?;mjI
zIhHk6Gtb1ZPoA%U%QgNdT+KZHP48u%>i?ptnWs2;{s&y1&9B1CJYR#Wm3h7ncb;Qe
zPc`#Q4Ew}*16-cXZ^F3+WX7Dr?k(_Q268sP4cAuPINlSr_<aXlp3U#V<JYm+y$3G+
z;;Nyox_(_lHSPO=1Hk3|Uka|~x(=fEa$VI2($rj6ah?xt;QBLnFq(RvM?=ACLui?|
zG&pB$@*jq#o^xv%u$uETZ&`5WS(98l?@98rPmRlg%Nm!5t6Af4`qZet0!_^t#fiBh
zxc<yt2~9mUt_)TyYg`4MGd4A@il&|#R|Bg#Kk-%vXP!06wX??LXP+9^0GBnc30KSb
zTHx~Yvo>5UXY)E>=cpUc=eJtsuL~~cuLoCi4L6|A8mg~PQ*#Z)sc8hbzE3tpQ_mW1
z1Xe58a3nluY}RmNH1)pfY~BQ{=KRFl6r6e1B-hS0On&yMaWimP<K}QRYuti9HL8Dr
zre=-e#M}~G-zOhLQ%{Xsfz`?yKLpPin;N%9Q%{ZCfYqFzc-w+A&l=_0S!43Em*?EI
zm(Qa&e|z~_6}ON7?U}nZg_nPyumhTUY<2|a^O)RsLQ~Hk-x=&Yb@TM!hpNTLhrzj5
z@v$qKdTe$B*K^q&O+9h;06S0JIR3j+wcO)9!9&^G`MaG_a5eAoUi7)g>Z56Du8}x-
zi~*a+7PRELH=25E#)9+tjm<u2>dAFqu=CWjCTiY`#I;ZE`+>{c$HCRi{Uh{VKC9LD
zr>U8{IC+f+XTKz`1JKlCGXY$$??5#5<USGXJoT)znz<*gePSF8_FT;0VNZgq`FGf<
zZ!+A!%l<x}=|1$2!nIR3#zFLI@i_(T{S7}9?7r%&o(UfVYoqS`L+I6F|8cNqYH~Up
zu2$ys33!>)C*j(u>+>*rwfH;|Y@GZKLOa;sNl@4R2zs^HPX(KE_%yIJC8rLsHtNnl
zie4@DGr;Biqv6ia?|62CDY5zQQMGkkKhx<Q(~oQYDX_l7yTGn@_)M_1Ucom$?Pr0V
zqn%^=JceF9xy=SYMDXxqE1u_I4_q5{=XcYq#eNRhJj3ULUBCQJ>2YAS_&Xl#9QEvh
zd0=(F(;1@|tQMaqfQ=jeY4Ez#7oR7>wNZEee0nwQU5C$rtvmBi1{=e5^1JZ@ur})X
zeT|dA>X~y2SS|ciuyJw^PXm*F^WQLQ>$th}(L0u0P6wCIhcn=6J|E7c_wxCm{#lxu
z&j)d0p9R)$_~*cF1kZYW9<Gggo)>3>>(AvcpsD9Rd=aeX{LDKCoO#wG*Uo21a<fl;
zUjkd7^?VtedeqOQsacOW`JV?a>-h@2tmmt6ZPb(3`QZBV?`vr4spkT)n)5U7>)_0@
z9=UeblicjhZ8*ta2rmE3_YJu7^Jl(q!nIN7NN(<*1Npt?i{Rg)Id1cEe`u3jzKvf0
z-KLAt)DNnjkKY07$1B=*D@}jVE~zyAPi(aQy8_=sbH4VD>)U^4t6u&$dHmgSuT5w>
zu_nvYZ%Xq%ZBC#6zRhLK8&dG=3huw{xv|D?uJKz7J{0@g3-0_o3-0(`1vjU~1$X@J
zf;)b1!5zQ9#viEh2Mccehim+i8h@<DpQ!OCYy7E#o6plV{%pnF4_ASIKwC(=lIH$v
z{?1~h``^EkUjz57vUz@~U5loEHO)QkeNu~$>%rd7#Q7n-9_L4B>hXC4*m>&u^j@op
zz0WsQ*n50Sg}uMGRoHubM}@twIj_z4$8i1W%X=y}SNGOEVCOpK{gtPV`@q)bnD<(q
zc|QSXp7&jzv7Z)W?g4qm9xBG%4|2ym7k&n|R^z!h<QaPuZ0(M@PvjYU9PC;+=AMyf
z?B`(D!!h@dJY&Bo#@tKtjQz40b6?3FbIpDQPVMe7`7ZodOB{O!Jd)mZc0Csr%{6qb
zToY@zMr$!=b2A5H>C=4uJnAoTe+^d4^X@sYn$0WEyF8<wN7u$H+6$FdKFfYnY2~x*
cw`k7S-to)uVNTD29ZOEX1FQL*6Z@R|KY~1(qW}N^

literal 0
HcmV?d00001

diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaBlend.glsl b/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaBlend.glsl
new file mode 100644
index 0000000000..a518cf25ea
--- /dev/null
+++ b/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaBlend.glsl
@@ -0,0 +1,1404 @@
+#version 430 core 
+#define SMAA_GLSL_4 1 
+// Specialization constants: the values written here are only the defaults.
+layout (constant_id = 0) const int SMAA_PRESET_LOW = 0;
+layout (constant_id = 1) const int SMAA_PRESET_MEDIUM = 0;
+layout (constant_id = 2) const int SMAA_PRESET_HIGH = 0;
+layout (constant_id = 3) const int SMAA_PRESET_ULTRA = 0;
+layout (constant_id = 4) const float METRIC_WIDTH = 1920.0;
+layout (constant_id = 5) const float METRIC_HEIGHT = 1080.0;
+// Render target metrics in the order SMAA reads them: (1/width, 1/height, width, height).
+#define SMAA_RT_METRICS float4(1.0 / METRIC_WIDTH, 1.0 / METRIC_HEIGHT, METRIC_WIDTH, METRIC_HEIGHT)
+// Compute workgroup size: 16 x 16 invocations.
+layout (local_size_x = 16, local_size_y = 16) in;
+/**
+ * Copyright (C) 2013 Jorge Jimenez (jorge@iryoku.com)
+ * Copyright (C) 2013 Jose I. Echevarria (joseignacioechevarria@gmail.com)
+ * Copyright (C) 2013 Belen Masia (bmasia@unizar.es)
+ * Copyright (C) 2013 Fernando Navarro (fernandn@microsoft.com)
+ * Copyright (C) 2013 Diego Gutierrez (diegog@unizar.es)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to
+ * do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software. As clarification, there
+ * is no requirement that the copyright notice and permission be included in
+ * binary distributions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+/**
+ *                  _______  ___  ___       ___           ___
+ *                 /       ||   \/   |     /   \         /   \
+ *                |   (---- |  \  /  |    /  ^  \       /  ^  \
+ *                 \   \    |  |\/|  |   /  /_\  \     /  /_\  \
+ *              ----)   |   |  |  |  |  /  _____  \   /  _____  \
+ *             |_______/    |__|  |__| /__/     \__\ /__/     \__\
+ * 
+ *                               E N H A N C E D
+ *       S U B P I X E L   M O R P H O L O G I C A L   A N T I A L I A S I N G
+ *
+ *                         http://www.iryoku.com/smaa/
+ *
+ * Hi, welcome aboard!
+ * 
+ * Here you'll find instructions to get the shader up and running as fast as
+ * possible.
+ *
+ * IMPORTANT NOTICE: when updating, remember to update both this file and the
+ * precomputed textures! They may change from version to version.
+ *
+ * The shader has three passes, chained together as follows:
+ *
+ *                           |input|------------------ 
+ *                              v                     |
+ *                    [ SMAA*EdgeDetection ]          |
+ *                              v                     |
+ *                          |edgesTex|                |
+ *                              v                     |
+ *              [ SMAABlendingWeightCalculation ]     |
+ *                              v                     |
+ *                          |blendTex|                |
+ *                              v                     |
+ *                [ SMAANeighborhoodBlending ] <------ 
+ *                              v
+ *                           |output|
+ *
+ * Note that each [pass] has its own vertex and pixel shader. Remember to use
+ * oversized triangles instead of quads to avoid overshading along the
+ * diagonal.
+ *
+ * You've three edge detection methods to choose from: luma, color or depth.
+ * They represent different quality/performance and anti-aliasing/sharpness
+ * tradeoffs, so our recommendation is for you to choose the one that best
+ * suits your particular scenario:
+ *
+ * - Depth edge detection is usually the fastest but it may miss some edges.
+ *
+ * - Luma edge detection is usually more expensive than depth edge detection,
+ *   but catches visible edges that depth edge detection can miss.
+ *
+ * - Color edge detection is usually the most expensive one but catches
+ *   chroma-only edges.
+ *
+ * For quickstarters: just use luma edge detection.
+ *
+ * The general advice is to not rush the integration process and ensure each
+ * step is done correctly (don't try to integrate SMAA T2x with predicated edge
+ * detection from the start!). Ok then, let's go!
+ *
+ *  1. The first step is to create two RGBA temporal render targets for holding
+ *     |edgesTex| and |blendTex|.
+ *
+ *     In DX10 or DX11, you can use a RG render target for the edges texture.
+ *     In the case of NVIDIA GPUs, using RG render targets seems to actually be
+ *     slower.
+ *
+ *     On the Xbox 360, you can use the same render target for resolving both
+ *     |edgesTex| and |blendTex|, as they aren't needed simultaneously.
+ *
+ *  2. Both temporal render targets |edgesTex| and |blendTex| must be cleared
+ *     each frame. Do not forget to clear the alpha channel!
+ *
+ *  3. The next step is loading the two supporting precalculated textures,
+ *     'areaTex' and 'searchTex'. You'll find them in the 'Textures' folder as
+ *     C++ headers, and also as regular DDS files. They'll be needed for the
+ *     'SMAABlendingWeightCalculation' pass.
+ *
+ *     If you use the C++ headers, be sure to load them in the format specified
+ *     inside of them.
+ *
+ *     You can also compress 'areaTex' and 'searchTex' using BC5 and BC4
+ *     respectively, if you have that option in your content processor pipeline.
+ *     When compressing them, you get an imperceptible quality decrease, and a
+ *     marginal performance increase.
+ *
+ *  4. All samplers must be set to linear filtering and clamp.
+ *
+ *     After you get the technique working, remember that 64-bit inputs have
+ *     half-rate linear filtering on GCN.
+ *
+ *     If SMAA is applied to 64-bit color buffers, switching to point filtering
+ *     when accessing them will increase the performance. Search for
+ *     'SMAASamplePoint' to see which textures may benefit from point
+ *     filtering, and where (which is basically the color input in the edge
+ *     detection and resolve passes).
+ *
+ *  5. All texture reads and buffer writes must be non-sRGB, with the exception
+ *     of the input read and the output write in
+ *     'SMAANeighborhoodBlending' (and only in this pass!). If sRGB reads in
+ *     this last pass are not possible, the technique will work anyway, but
+ *     will perform antialiasing in gamma space.
+ *
+ *     IMPORTANT: for best results the input read for the color/luma edge 
+ *     detection should *NOT* be sRGB.
+ *
+ *  6. Before including SMAA.h you'll have to setup the render target metrics,
+ *     the target and any optional configuration defines. Optionally you can
+ *     use a preset.
+ *
+ *     You have the following targets available: 
+ *         SMAA_HLSL_3
+ *         SMAA_HLSL_4
+ *         SMAA_HLSL_4_1
+ *         SMAA_GLSL_3 *
+ *         SMAA_GLSL_4 *
+ *
+ *         * (See SMAA_INCLUDE_VS and SMAA_INCLUDE_PS below).
+ *
+ *     And four presets:
+ *         SMAA_PRESET_LOW          (60% of the quality)
+ *         SMAA_PRESET_MEDIUM       (80% of the quality)
+ *         SMAA_PRESET_HIGH         (95% of the quality)
+ *         SMAA_PRESET_ULTRA        (99% of the quality)
+ *
+ *     For example:
+ *         #define SMAA_RT_METRICS float4(1.0 / 1280.0, 1.0 / 720.0, 1280.0, 720.0)
+ *         #define SMAA_HLSL_4
+ *         #define SMAA_PRESET_HIGH
+ *         #include "SMAA.h"
+ *
+ *     Note that SMAA_RT_METRICS doesn't need to be a macro, it can be a
+ *     uniform variable. The code is designed to minimize the impact of not
+ *     using a constant value, but it is still better to hardcode it.
+ *
+ *     Depending on how you encoded 'areaTex' and 'searchTex', you may have to
+ *     add (and customize) the following defines before including SMAA.h:
+ *          #define SMAA_AREATEX_SELECT(sample) sample.rg
+ *          #define SMAA_SEARCHTEX_SELECT(sample) sample.r
+ *
+ *     If your engine is already using porting macros, you can define
+ *     SMAA_CUSTOM_SL, and define the porting functions by yourself.
+ *
+ *  7. Then, you'll have to setup the passes as indicated in the scheme above.
+ *     You can take a look into SMAA.fx, to see how we did it for our demo.
+ *     Checkout the function wrappers, you may want to copy-paste them!
+ *
+ *  8. It's recommended to validate the produced |edgesTex| and |blendTex|.
+ *     You can use a screenshot from your engine to compare the |edgesTex|
+ *     and |blendTex| produced inside of the engine with the results obtained
+ *     with the reference demo.
+ *
+ *  9. After you get the last pass to work, it's time to optimize. You'll have
+ *     to initialize a stencil buffer in the first pass (discard is already in
+ *     the code), then mask execution by using it in the second pass. The last
+ *     pass should be executed in all pixels.
+ *
+ *
+ * After this point you can choose to enable predicated thresholding,
+ * temporal supersampling and motion blur integration:
+ *
+ * a) If you want to use predicated thresholding, take a look into
+ *    SMAA_PREDICATION; you'll need to pass an extra texture in the edge
+ *    detection pass.
+ *
+ * b) If you want to enable temporal supersampling (SMAA T2x):
+ *
+ * 1. The first step is to render using subpixel jitters. I won't go into
+ *    detail, but it's as simple as moving each vertex position in the
+ *    vertex shader, you can check how we do it in our DX10 demo.
+ *
+ * 2. Then, you must setup the temporal resolve. You may want to take a look
+ *    into SMAAResolve for resolving 2x modes. After you get it working, you'll
+ *    probably see ghosting everywhere. But fear not, you can enable the
+ *    CryENGINE temporal reprojection by setting the SMAA_REPROJECTION macro.
+ *    Check out SMAA_DECODE_VELOCITY if your velocity buffer is encoded.
+ *
+ * 3. The next step is to apply SMAA to each subpixel jittered frame, just as
+ *    done for 1x.
+ *
+ * 4. At this point you should already have something usable, but for best
+ *    results the proper area textures must be set depending on current jitter.
+ *    For this, the parameter 'subsampleIndices' of
+ *    'SMAABlendingWeightCalculationPS' must be set as follows, for our T2x
+ *    mode:
+ *
+ *    @SUBSAMPLE_INDICES
+ *
+ *    | S# |  Camera Jitter   |  subsampleIndices    |
+ *    +----+------------------+---------------------+
+ *    |  0 |  ( 0.25, -0.25)  |  float4(1, 1, 1, 0)  |
+ *    |  1 |  (-0.25,  0.25)  |  float4(2, 2, 2, 0)  |
+ *
+ *    These jitter positions assume a bottom-to-top y axis. S# stands for the
+ *    sample number.
+ *
+ * More information about temporal supersampling here:
+ *    http://iryoku.com/aacourse/downloads/13-Anti-Aliasing-Methods-in-CryENGINE-3.pdf
+ *
+ * c) If you want to enable spatial multisampling (SMAA S2x):
+ *
+ * 1. The scene must be rendered using MSAA 2x. The MSAA 2x buffer must be
+ *    created with:
+ *      - DX10:     see below (*)
+ *      - DX10.1:   D3D10_STANDARD_MULTISAMPLE_PATTERN or
+ *      - DX11:     D3D11_STANDARD_MULTISAMPLE_PATTERN
+ *
+ *    This allows to ensure that the subsample order matches the table in
+ *    @SUBSAMPLE_INDICES.
+ *
+ *    (*) In the case of DX10, we refer the reader to:
+ *      - SMAA::detectMSAAOrder and
+ *      - SMAA::msaaReorder
+ *
+ *    These functions allow to match the standard multisample patterns by
+ *    detecting the subsample order for a specific GPU, and reordering
+ *    them appropriately.
+ *
+ * 2. A shader must be run to output each subsample into a separate buffer
+ *    (DX10 is required). You can use SMAASeparate for this purpose, or just do
+ *    it in an existing pass (for example, in the tone mapping pass, which has
+ *    the advantage of feeding tone mapped subsamples to SMAA, which will yield
+ *    better results).
+ *
+ * 3. The full SMAA 1x pipeline must be run for each separated buffer, storing
+ *    the results in the final buffer. The second run should alpha blend with
+ *    the existing final buffer using a blending factor of 0.5.
+ *    'subsampleIndices' must be adjusted as in the SMAA T2x case (see point
+ *    b).
+ *
+ * d) If you want to enable temporal supersampling on top of SMAA S2x
+ *    (which actually is SMAA 4x):
+ *
+ * 1. SMAA 4x consists of temporally jittering SMAA S2x, so the first step is
+ *    to calculate SMAA S2x for current frame. In this case, 'subsampleIndices'
+ *    must be set as follows:
+ *
+ *    | F# | S# |   Camera Jitter    |    Net Jitter     |   subsampleIndices   |
+ *    +----+----+--------------------+-------------------+----------------------+
+ *    |  0 |  0 |  ( 0.125,  0.125)  |  ( 0.375, -0.125) |  float4(5, 3, 1, 3)  |
+ *    |  0 |  1 |  ( 0.125,  0.125)  |  (-0.125,  0.375) |  float4(4, 6, 2, 3)  |
+ *    +----+----+--------------------+-------------------+----------------------+
+ *    |  1 |  2 |  (-0.125, -0.125)  |  ( 0.125, -0.375) |  float4(3, 5, 1, 4)  |
+ *    |  1 |  3 |  (-0.125, -0.125)  |  (-0.375,  0.125) |  float4(6, 4, 2, 4)  |
+ *
+ *    These jitter positions assume a bottom-to-top y axis. F# stands for the
+ *    frame number. S# stands for the sample number.
+ *
+ * 2. After calculating SMAA S2x for current frame (with the new subsample
+ *    indices), previous frame must be reprojected as in SMAA T2x mode (see
+ *    point b).
+ *
+ * e) If motion blur is used, you may want to do the edge detection pass
+ *    together with motion blur. This has two advantages:
+ *
+ * 1. Pixels under heavy motion can be omitted from the edge detection process.
+ *    For these pixels we can just store "no edge", as motion blur will take
+ *    care of them.
+ * 2. The center pixel tap is reused.
+ *
+ * Note that in this case depth testing should be used instead of stenciling,
+ * as we have to write all the pixels in the motion blur pass.
+ *
+ * That's it!
+ */
+
+//-----------------------------------------------------------------------------
+// SMAA Presets
+
+/**
+ * A preset overrides the matching "Configurable Defines" macros further down.
+ * NOTE(review): defined() only sees macros, while this file declares SMAA_PRESET_*
+ * as specialization constants; the chain looks always-false here -- confirm.
+ */
+#if defined(SMAA_PRESET_LOW)
+#define SMAA_THRESHOLD 0.15
+#define SMAA_MAX_SEARCH_STEPS 4
+#define SMAA_DISABLE_DIAG_DETECTION
+#define SMAA_DISABLE_CORNER_DETECTION
+#elif defined(SMAA_PRESET_MEDIUM)
+#define SMAA_THRESHOLD 0.1
+#define SMAA_MAX_SEARCH_STEPS 8
+#define SMAA_DISABLE_DIAG_DETECTION
+#define SMAA_DISABLE_CORNER_DETECTION
+#elif defined(SMAA_PRESET_HIGH)
+#define SMAA_THRESHOLD 0.1
+#define SMAA_MAX_SEARCH_STEPS 16
+#define SMAA_MAX_SEARCH_STEPS_DIAG 8
+#define SMAA_CORNER_ROUNDING 25
+#elif defined(SMAA_PRESET_ULTRA)
+#define SMAA_THRESHOLD 0.05
+#define SMAA_MAX_SEARCH_STEPS 32
+#define SMAA_MAX_SEARCH_STEPS_DIAG 16
+#define SMAA_CORNER_ROUNDING 25
+#endif
+
+//-----------------------------------------------------------------------------
+// Configurable Defines
+
+/**
+ * SMAA_THRESHOLD specifies the threshold or sensitivity to edges.
+ * Lowering this value you will be able to detect more edges at the expense of
+ * performance. 
+ *
+ * Range: [0, 0.5]
+ *   0.1 is a reasonable value, and allows to catch most visible edges.
+ *   0.05 is a rather overkill value, that allows to catch 'em all.
+ *
+ *   If temporal supersampling is used, 0.2 could be a reasonable value, as low
+ *   contrast edges are properly filtered by just 2x.
+ */
+#ifndef SMAA_THRESHOLD
+#define SMAA_THRESHOLD 0.1
+#endif
+
+/**
+ * SMAA_DEPTH_THRESHOLD specifies the threshold for depth edge detection.
+ * 
+ * Range: depends on the depth range of the scene.
+ */
+#ifndef SMAA_DEPTH_THRESHOLD
+#define SMAA_DEPTH_THRESHOLD (0.1 * SMAA_THRESHOLD)
+#endif
+
+/**
+ * SMAA_MAX_SEARCH_STEPS specifies the maximum steps performed in the
+ * horizontal/vertical pattern searches, at each side of the pixel.
+ *
+ * Each step covers two pixels on each side of the pixel, so the maximum line
+ * length handled perfectly by, for example, 16 steps is 64 pixels (longer lines
+ * are still antialiased, they just won't look as good).
+ *
+ * Range: [0, 112]
+ */
+#ifndef SMAA_MAX_SEARCH_STEPS
+#define SMAA_MAX_SEARCH_STEPS 16
+#endif
+
+/**
+ * SMAA_MAX_SEARCH_STEPS_DIAG specifies the maximum steps performed in the
+ * diagonal pattern searches, at each side of the pixel. In this case we jump
+ * one pixel at time, instead of two.
+ *
+ * Range: [0, 20]
+ *
+ * On high-end machines it is cheap (between a 0.8x and 0.9x slower for 16 
+ * steps), but it can have a significant impact on older machines.
+ *
+ * Define SMAA_DISABLE_DIAG_DETECTION to disable diagonal processing.
+ */
+#ifndef SMAA_MAX_SEARCH_STEPS_DIAG
+#define SMAA_MAX_SEARCH_STEPS_DIAG 8
+#endif
+
+/**
+ * SMAA_CORNER_ROUNDING specifies how much sharp corners will be rounded.
+ *
+ * Range: [0, 100]
+ *
+ * Define SMAA_DISABLE_CORNER_DETECTION to disable corner processing.
+ */
+#ifndef SMAA_CORNER_ROUNDING
+#define SMAA_CORNER_ROUNDING 25
+#endif
+
+/**
+ * If a neighbor edge has SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR times more
+ * contrast than the current edge, the current edge will be discarded.
+ *
+ * This eliminates spurious crossing edges, and is based on the fact that
+ * strong contrast in one direction perceptually hides the contrast of the
+ * neighboring edges.
+ */
+#ifndef SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR
+#define SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR 2.0
+#endif
+
+/**
+ * Predicated thresholding allows to better preserve texture details and to
+ * improve performance, by decreasing the number of detected edges using an
+ * additional buffer like the light accumulation buffer, object ids or even the
+ * depth buffer (the depth buffer usage may be limited to indoor or short range
+ * scenes).
+ *
+ * It locally decreases the luma or color threshold if an edge is found in an
+ * additional buffer (so the global threshold can be higher).
+ *
+ * This method was developed by Playstation EDGE MLAA team, and used in 
+ * Killzone 3, by using the light accumulation buffer. More information here:
+ *     http://iryoku.com/aacourse/downloads/06-MLAA-on-PS3.pptx 
+ */
+#ifndef SMAA_PREDICATION
+#define SMAA_PREDICATION 0
+#endif
+
+/**
+ * Threshold to be used in the additional predication buffer. 
+ *
+ * Range: depends on the input, so you'll have to find the magic number that
+ * works for you.
+ */
+#ifndef SMAA_PREDICATION_THRESHOLD
+#define SMAA_PREDICATION_THRESHOLD 0.01
+#endif
+
+/**
+ * How much to scale the global threshold used for luma or color edge
+ * detection when using predication.
+ *
+ * Range: [1, 5]
+ */
+#ifndef SMAA_PREDICATION_SCALE
+#define SMAA_PREDICATION_SCALE 2.0
+#endif
+
+/**
+ * How much to locally decrease the threshold.
+ *
+ * Range: [0, 1]
+ */
+#ifndef SMAA_PREDICATION_STRENGTH
+#define SMAA_PREDICATION_STRENGTH 0.4
+#endif
+
+/**
+ * Temporal reprojection allows to remove ghosting artifacts when using
+ * temporal supersampling. We use the CryEngine 3 method which also introduces
+ * velocity weighting. This feature is of extreme importance for totally
+ * removing ghosting. More information here:
+ *    http://iryoku.com/aacourse/downloads/13-Anti-Aliasing-Methods-in-CryENGINE-3.pdf
+ *
+ * Note that you'll need to setup a velocity buffer for enabling reprojection.
+ * For static geometry, saving the previous depth buffer is a viable
+ * alternative.
+ */
+#ifndef SMAA_REPROJECTION
+#define SMAA_REPROJECTION 0
+#endif
+
+/**
+ * SMAA_REPROJECTION_WEIGHT_SCALE controls the velocity weighting. It allows to
+ * remove ghosting trails behind the moving object, which are not removed by
+ * just using reprojection. Using low values will exhibit ghosting, while using
+ * high values will disable temporal supersampling under motion.
+ *
+ * Behind the scenes, velocity weighting removes temporal supersampling when
+ * the velocity of the subsamples differs (meaning they are different objects).
+ *
+ * Range: [0, 80]
+ */
+#ifndef SMAA_REPROJECTION_WEIGHT_SCALE
+#define SMAA_REPROJECTION_WEIGHT_SCALE 30.0
+#endif
+
+/**
+ * On some compilers, discard cannot be used in vertex shaders. Thus, they need
+ * to be compiled separately.
+ */
+#ifndef SMAA_INCLUDE_VS
+#define SMAA_INCLUDE_VS 1
+#endif
+#ifndef SMAA_INCLUDE_PS
+#define SMAA_INCLUDE_PS 1
+#endif
+
+//-----------------------------------------------------------------------------
+// Texture Access Defines
+
+#ifndef SMAA_AREATEX_SELECT
+#if defined(SMAA_HLSL_3)
+#define SMAA_AREATEX_SELECT(sample) sample.ra
+#else
+#define SMAA_AREATEX_SELECT(sample) sample.rg
+#endif
+#endif
+
+#ifndef SMAA_SEARCHTEX_SELECT
+#define SMAA_SEARCHTEX_SELECT(sample) sample.r
+#endif
+
+#ifndef SMAA_DECODE_VELOCITY
+#define SMAA_DECODE_VELOCITY(sample) sample.rg
+#endif
+
+//-----------------------------------------------------------------------------
+// Non-Configurable Defines
+
+#define SMAA_AREATEX_MAX_DISTANCE 16
+#define SMAA_AREATEX_MAX_DISTANCE_DIAG 20
+#define SMAA_AREATEX_PIXEL_SIZE (1.0 / float2(160.0, 560.0))
+#define SMAA_AREATEX_SUBTEX_SIZE (1.0 / 7.0)
+#define SMAA_SEARCHTEX_SIZE float2(66.0, 33.0)
+#define SMAA_SEARCHTEX_PACKED_SIZE float2(64.0, 16.0)
+#define SMAA_CORNER_ROUNDING_NORM (float(SMAA_CORNER_ROUNDING) / 100.0)
+
+//-----------------------------------------------------------------------------
+// Porting Functions
+
+#if defined(SMAA_HLSL_3)
+#define SMAATexture2D(tex) sampler2D tex
+#define SMAATexturePass2D(tex) tex
+#define SMAASampleLevelZero(tex, coord) tex2Dlod(tex, float4(coord, 0.0, 0.0))
+#define SMAASampleLevelZeroPoint(tex, coord) tex2Dlod(tex, float4(coord, 0.0, 0.0))
+#define SMAASampleLevelZeroOffset(tex, coord, offset) tex2Dlod(tex, float4(coord + offset * SMAA_RT_METRICS.xy, 0.0, 0.0))
+#define SMAASample(tex, coord) tex2D(tex, coord)
+#define SMAASamplePoint(tex, coord) tex2D(tex, coord)
+#define SMAASampleOffset(tex, coord, offset) tex2D(tex, coord + offset * SMAA_RT_METRICS.xy)
+#define SMAA_FLATTEN [flatten]
+#define SMAA_BRANCH [branch]
+#endif
+#if defined(SMAA_HLSL_4) || defined(SMAA_HLSL_4_1)
+SamplerState LinearSampler { Filter = MIN_MAG_LINEAR_MIP_POINT; AddressU = Clamp; AddressV = Clamp; };
+SamplerState PointSampler { Filter = MIN_MAG_MIP_POINT; AddressU = Clamp; AddressV = Clamp; };
+#define SMAATexture2D(tex) Texture2D tex
+#define SMAATexturePass2D(tex) tex
+#define SMAASampleLevelZero(tex, coord) tex.SampleLevel(LinearSampler, coord, 0)
+#define SMAASampleLevelZeroPoint(tex, coord) tex.SampleLevel(PointSampler, coord, 0)
+#define SMAASampleLevelZeroOffset(tex, coord, offset) tex.SampleLevel(LinearSampler, coord, 0, offset)
+#define SMAASample(tex, coord) tex.Sample(LinearSampler, coord)
+#define SMAASamplePoint(tex, coord) tex.Sample(PointSampler, coord)
+#define SMAASampleOffset(tex, coord, offset) tex.Sample(LinearSampler, coord, offset)
+#define SMAA_FLATTEN [flatten]
+#define SMAA_BRANCH [branch]
+#define SMAATexture2DMS2(tex) Texture2DMS<float4, 2> tex
+#define SMAALoad(tex, pos, sample) tex.Load(pos, sample)
+#if defined(SMAA_HLSL_4_1)
+#define SMAAGather(tex, coord) tex.Gather(LinearSampler, coord, 0)
+#endif
+#endif
+#if defined(SMAA_GLSL_3) || defined(SMAA_GLSL_4) // GLSL mapping of the SMAA porting layer
+#define SMAATexture2D(tex) sampler2D tex
+#define SMAATexturePass2D(tex) tex
+#define SMAASampleLevelZero(tex, coord) textureLod(tex, coord, 0.0)
+#define SMAASampleLevelZeroPoint(tex, coord) textureLod(tex, coord, 0.0)
+#define SMAASampleLevelZeroOffset(tex, coord, offset) textureLodOffset(tex, coord, 0.0, offset)
+#define SMAASample(tex, coord) texture(tex, coord)
+#define SMAASamplePoint(tex, coord) texture(tex, coord)
+#define SMAASampleOffset(tex, coord, offset) textureOffset(tex, coord, offset) // the 3rd argument of texture() is an LOD bias, not a texel offset
+#define SMAA_FLATTEN
+#define SMAA_BRANCH
+#define lerp(a, b, t) mix(a, b, t)
+#define saturate(a) clamp(a, 0.0, 1.0)
+#if defined(SMAA_GLSL_4)
+#define mad(a, b, c) fma(a, b, c)
+#define SMAAGather(tex, coord) textureGather(tex, coord)
+#else
+#define mad(a, b, c) ((a) * (b) + (c)) // parenthesized so expression arguments keep their grouping
+#endif
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define int2 ivec2
+#define int3 ivec3
+#define int4 ivec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#endif
+
+#if !defined(SMAA_HLSL_3) && !defined(SMAA_HLSL_4) && !defined(SMAA_HLSL_4_1) && !defined(SMAA_GLSL_3) && !defined(SMAA_GLSL_4) && !defined(SMAA_CUSTOM_SL)
+#error you must define the shading language: SMAA_HLSL_*, SMAA_GLSL_* or SMAA_CUSTOM_SL
+#endif
+
+//-----------------------------------------------------------------------------
+// Misc functions
+
+/**
+ * Returns the (current, left, top) red-channel samples of 'tex' as one float3.
+ */
+float3 SMAAGatherNeighbours(float2 texcoord, float4 offset[3], SMAATexture2D(tex)) {
+    #if defined(SMAAGather)
+    // Single gather at 'texcoord' shifted by (-0.5, -0.5) texels; .grb reorders it.
+    float2 gatherCoord = texcoord + SMAA_RT_METRICS.xy * float2(-0.5, -0.5);
+    return SMAAGather(tex, gatherCoord).grb;
+    #else
+    // Fallback: three point samples, the neighbours coming from offset[0].
+    return float3(SMAASamplePoint(tex, texcoord).r,
+                  SMAASamplePoint(tex, offset[0].xy).r,
+                  SMAASamplePoint(tex, offset[0].zw).r);
+    #endif
+}
+
+/**
+ * Per-axis edge threshold, lowered where the predication buffer shows an edge.
+ */
+float2 SMAACalculatePredicatedThreshold(float2 texcoord, float4 offset[3], SMAATexture2D(predicationTex)) {
+    float3 taps = SMAAGatherNeighbours(texcoord, offset, SMAATexturePass2D(predicationTex));
+    // 1.0 on each axis (left, top) whose predication delta reaches the threshold.
+    float2 predicated = step(SMAA_PREDICATION_THRESHOLD, abs(taps.xx - taps.yz));
+    // Scaled global threshold, reduced by SMAA_PREDICATION_STRENGTH on flagged axes.
+    float2 attenuation = 1.0 - SMAA_PREDICATION_STRENGTH * predicated;
+    return SMAA_PREDICATION_SCALE * SMAA_THRESHOLD * attenuation;
+}
+
+/**
+ * Conditional move: per component, 'variable' takes 'value' where 'cond' is set.
+ */
+void SMAAMovc(bool2 cond, inout float2 variable, float2 value) {
+    variable.x = cond.x ? value.x : variable.x;
+    variable.y = cond.y ? value.y : variable.y;
+}
+
+void SMAAMovc(bool4 cond, inout float4 variable, float4 value) {
+    variable.xy = float2(cond.x ? value.x : variable.x, cond.y ? value.y : variable.y);
+    variable.zw = float2(cond.z ? value.z : variable.z, cond.w ? value.w : variable.w);
+}
+
+
+#if SMAA_INCLUDE_VS
+//-----------------------------------------------------------------------------
+// Vertex Shaders
+
+/**
+ * Edge detection setup: neighbour coordinates packed as two float2 per float4.
+ */
+void SMAAEdgeDetectionVS(float2 texcoord, out float4 offset[3]) {
+    float4 texel = SMAA_RT_METRICS.xyxy, centre = texcoord.xyxy;
+    offset[0] = mad(texel, float4(-1.0, 0.0, 0.0, -1.0), centre); // (-1, 0) and (0, -1) texels
+    offset[1] = mad(texel, float4( 1.0, 0.0, 0.0,  1.0), centre); // (+1, 0) and (0, +1) texels
+    offset[2] = mad(texel, float4(-2.0, 0.0, 0.0, -2.0), centre); // (-2, 0) and (0, -2) texels
+}
+
+/**
+ * Blend weight setup: pixel-space coordinate plus the search start/end offsets.
+ */
+void SMAABlendingWeightCalculationVS(float2 texcoord, out float2 pixcoord, out float4 offset[3]) {
+    // Texture coordinate scaled to pixels (SMAA_RT_METRICS.zw is width, height).
+    pixcoord = texcoord * SMAA_RT_METRICS.zw;
+
+    // Sub-texel start positions of the searches (see @PSEUDO_GATHER4):
+    float4 texel = SMAA_RT_METRICS.xyxy, centre = texcoord.xyxy;
+    offset[0] = mad(texel, float4(-0.25, -0.125,  1.25, -0.125), centre);
+    offset[1] = mad(texel, float4(-0.125, -0.25, -0.125,  1.25), centre);
+
+    // Search limits: two texels per step, SMAA_MAX_SEARCH_STEPS steps each way.
+    float4 reach = float4(-2.0, 2.0, -2.0, 2.0) * float(SMAA_MAX_SEARCH_STEPS);
+    float4 origin = float4(offset[0].xz, offset[1].yw);
+    offset[2] = mad(SMAA_RT_METRICS.xxyy, reach, origin);
+}
+
+/**
+ * Neighborhood blending setup: coordinates of the (+1, 0) and (0, +1) texels.
+ */
+void SMAANeighborhoodBlendingVS(float2 texcoord, out float4 offset) {
+    float4 texel = SMAA_RT_METRICS.xyxy;
+    offset = mad(texel, float4(1.0, 0.0, 0.0, 1.0), texcoord.xyxy);
+}
+#endif // SMAA_INCLUDE_VS
+
+#if SMAA_INCLUDE_PS
+//-----------------------------------------------------------------------------
+// Edge Detection Pixel Shaders (First Pass)
+
+/**
+ * Luma Edge Detection
+ *
+ * Flags left/top edges from luma differences and returns (-2, -2) when neither
+ * axis has one. Luma needs gamma-corrected colors: 'colorTex' should be non-sRGB.
+ *
+ * texcoord: coordinate of the current pixel.
+ * offset:   neighbour coordinates (xy / zw of each element):
+ *           [0] left / top, [1] right / bottom, [2] left-left / top-top.
+ * predicationTex: extra input, only compiled in when SMAA_PREDICATION is set.
+ * Returns (left edge, top edge) flags, each 0.0 or 1.0, or the (-2, -2) marker.
+ */
+float2 SMAALumaEdgeDetectionPS(float2 texcoord, float4 offset[3], SMAATexture2D(colorTex)
+                               #if SMAA_PREDICATION
+                               , SMAATexture2D(predicationTex)
+                               #endif
+                               ) {
+    // Per-axis threshold, predicated when the extra buffer is compiled in:
+    #if SMAA_PREDICATION
+    float2 threshold = SMAACalculatePredicatedThreshold(texcoord, offset, SMAATexturePass2D(predicationTex));
+    #else
+    float2 threshold = float2(SMAA_THRESHOLD, SMAA_THRESHOLD);
+    #endif
+
+    // Luma of the current pixel and of its left and top neighbours:
+    float3 lumaWeights = float3(0.2126, 0.7152, 0.0722);
+    float lumaCentre = dot(SMAASamplePoint(colorTex, texcoord).rgb, lumaWeights);
+    float2 lumaNear = float2(dot(SMAASamplePoint(colorTex, offset[0].xy).rgb, lumaWeights),
+                             dot(SMAASamplePoint(colorTex, offset[0].zw).rgb, lumaWeights));
+
+    // Threshold the left/top deltas:
+    float2 deltaNear = abs(lumaCentre - lumaNear);
+    float2 edges = step(threshold, deltaNear);
+
+    // Neither axis has an edge: hand back the (-2, -2) marker and stop early.
+    if (dot(edges, float2(1.0, 1.0)) == 0.0)
+        return float2(-2.0, -2.0);
+
+    // Deltas towards the right and bottom neighbours:
+    float2 lumaOpposite = float2(dot(SMAASamplePoint(colorTex, offset[1].xy).rgb, lumaWeights),
+                                 dot(SMAASamplePoint(colorTex, offset[1].zw).rgb, lumaWeights));
+    float2 deltaOpposite = abs(lumaCentre - lumaOpposite);
+
+    // Deltas between the left/top neighbours and the pixels one step beyond them:
+    float2 lumaFar = float2(dot(SMAASamplePoint(colorTex, offset[2].xy).rgb, lumaWeights),
+                            dot(SMAASamplePoint(colorTex, offset[2].zw).rgb, lumaWeights));
+    float2 deltaFar = abs(lumaNear - lumaFar);
+
+    // Largest delta anywhere in the sampled neighbourhood:
+    float2 maxDelta = max(max(deltaNear, deltaOpposite), deltaFar);
+    float finalDelta = max(maxDelta.x, maxDelta.y);
+
+    // Local contrast adaptation: an edge survives only while FACTOR times its
+    // own delta still reaches the largest delta found above.
+    edges.xy *= step(finalDelta, SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR * deltaNear);
+
+    return edges;
+}
+
+/**
+ * Color Edge Detection: flags left/top edges from the largest per-channel
+ * color difference and returns (-2, -2) when neither axis has an edge.
+ * 'offset' holds the [0] left/top, [1] right/bottom and [2] left-left/top-top
+ * coordinates. Colors must be gamma-corrected, so 'colorTex' should be non-sRGB.
+ */
+float2 SMAAColorEdgeDetectionPS(float2 texcoord,
+                                float4 offset[3],
+                                SMAATexture2D(colorTex)
+                                #if SMAA_PREDICATION
+                                , SMAATexture2D(predicationTex)
+                                #endif
+                                ) {
+    // Calculate the threshold (texture forwarded via SMAATexturePass2D, as in the luma variant):
+    #if SMAA_PREDICATION
+    float2 threshold = SMAACalculatePredicatedThreshold(texcoord, offset, SMAATexturePass2D(predicationTex));
+    #else
+    float2 threshold = float2(SMAA_THRESHOLD, SMAA_THRESHOLD);
+    #endif
+
+    // Calculate color deltas (largest absolute channel difference per neighbour):
+    float4 delta;
+    float3 C = SMAASamplePoint(colorTex, texcoord).rgb;
+
+    float3 Cleft = SMAASamplePoint(colorTex, offset[0].xy).rgb;
+    float3 t = abs(C - Cleft);
+    delta.x = max(max(t.r, t.g), t.b);
+
+    float3 Ctop  = SMAASamplePoint(colorTex, offset[0].zw).rgb;
+    t = abs(C - Ctop);
+    delta.y = max(max(t.r, t.g), t.b);
+
+    // We do the usual threshold:
+    float2 edges = step(threshold, delta.xy);
+
+    // No edge on either axis: return the (-2, -2) marker without sampling further.
+    if (dot(edges, float2(1.0, 1.0)) == 0.0)
+        return float2(-2.0, -2.0);
+
+    // Calculate right and bottom deltas:
+    float3 Cright = SMAASamplePoint(colorTex, offset[1].xy).rgb;
+    t = abs(C - Cright);
+    delta.z = max(max(t.r, t.g), t.b);
+
+    float3 Cbottom  = SMAASamplePoint(colorTex, offset[1].zw).rgb;
+    t = abs(C - Cbottom);
+    delta.w = max(max(t.r, t.g), t.b);
+
+    // Calculate the maximum delta in the direct neighborhood:
+    float2 maxDelta = max(delta.xy, delta.zw);
+
+    // Left-left and top-top deltas, measured from the left/top neighbours (not from C):
+    float3 Cleftleft  = SMAASamplePoint(colorTex, offset[2].xy).rgb;
+    t = abs(Cleft - Cleftleft);
+    delta.z = max(max(t.r, t.g), t.b);
+
+    float3 Ctoptop = SMAASamplePoint(colorTex, offset[2].zw).rgb;
+    t = abs(Ctop - Ctoptop);
+    delta.w = max(max(t.r, t.g), t.b);
+
+    // Calculate the final maximum delta:
+    maxDelta = max(maxDelta.xy, delta.zw);
+    float finalDelta = max(maxDelta.x, maxDelta.y);
+
+    // Local contrast adaptation: drop edges dominated by a stronger neighbouring delta.
+    edges.xy *= step(finalDelta, SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR * delta.xy);
+
+    return edges;
+}
+
+/**
+ * Depth Edge Detection: (-2.0, -2.0) when no delta reaches SMAA_DEPTH_THRESHOLD.
+ */
+float2 SMAADepthEdgeDetectionPS(float2 texcoord,
+                                float4 offset[3],
+                                SMAATexture2D(depthTex)) {
+    float3 depths = SMAAGatherNeighbours(texcoord, offset, SMAATexturePass2D(depthTex));
+    float2 depthDelta = abs(depths.xx - depths.yz);
+    float2 found = step(SMAA_DEPTH_THRESHOLD, depthDelta);
+
+    // Hand back the flags only if at least one of them is set:
+    if (dot(found, float2(1.0, 1.0)) != 0.0)
+        return found;
+    return float2(-2.0, -2.0);
+}
+
+//-----------------------------------------------------------------------------
+// Diagonal Search Functions
+
+#if !defined(SMAA_DISABLE_DIAG_DETECTION)
+
+/**
+ * Recovers two binary edge values from a single bilinear-filtered fetch.
+ */
+float2 SMAADecodeDiagBilinearAccess(float2 e) {
+    // The bilinear fetch of 'e' is taken with a 0.25 offset, and the R and G
+    // edges are the ones of interest:
+    //
+    // +---G---+-------+
+    // |   x o R   x   |
+    // +-------+-------+
+    //
+    // With one of these edges enabled, the filtered values are:
+    //   Red:   (0.75 * X + 0.25 * 1) => 0.25 or 1.0
+    //   Green: (0.75 * 1 + 0.25 * X) => 0.75 or 1.0
+    //
+    // Unpack them with a mad + mul + round. To see the curve, plot
+    // round(x * abs(5 * x - 5 * 0.75)) from 0 to 1 (e.g. on wolframalpha.com).
+    e.r *= abs(5.0 * e.r - 5.0 * 0.75);
+    return round(e);
+}
+
+float4 SMAADecodeDiagBilinearAccess(float4 e) {
+    e.rb *= abs(5.0 * e.rb - 5.0 * 0.75);
+    return round(e);
+}
+
+/**
+ * Diagonal pattern search: walks along 'dir' in SMAA_RT_METRICS.xy steps.
+ */
+float2 SMAASearchDiag1(SMAATexture2D(edgesTex), float2 texcoord, float2 dir, out float2 e) {
+    // xy: sample position, z: step counter (starts at -1), w: mean of the last fetched edges.
+    float4 walk = float4(texcoord, -1.0, 1.0);
+    float3 stride = float3(SMAA_RT_METRICS.xy, 1.0);
+    while (walk.z < float(SMAA_MAX_SEARCH_STEPS_DIAG - 1) && walk.w > 0.9) {
+        walk.xyz = mad(stride, float3(dir, 1.0), walk.xyz);
+        e = SMAASampleLevelZero(edgesTex, walk.xy).rg;
+        walk.w = dot(e, float2(0.5, 0.5));
+    }
+    return walk.zw;
+}
+
+float2 SMAASearchDiag2(SMAATexture2D(edgesTex), float2 texcoord, float2 dir, out float2 e) {
+    // Same walk as SMAASearchDiag1, but shifted by a quarter step in x so that a
+    // single bilinear fetch returns both edges (@SearchDiag2Optimization).
+    float4 walk = float4(texcoord, -1.0, 1.0);
+    walk.x += 0.25 * SMAA_RT_METRICS.x;
+    float3 stride = float3(SMAA_RT_METRICS.xy, 1.0);
+
+    while (walk.z < float(SMAA_MAX_SEARCH_STEPS_DIAG - 1) && walk.w > 0.9) {
+        walk.xyz = mad(stride, float3(dir, 1.0), walk.xyz);
+
+        // @SearchDiag2Optimization: fetch once, then decode the two packed edges.
+        e = SMAADecodeDiagBilinearAccess(SMAASampleLevelZero(edgesTex, walk.xy).rg);
+
+        // Non-optimized version:
+        // e.g = SMAASampleLevelZero(edgesTex, walk.xy).g;
+        // e.r = SMAASampleLevelZeroOffset(edgesTex, walk.xy, int2(1, 0)).r;
+
+        walk.w = dot(e, float2(0.5, 0.5));
+    }
+    return walk.zw;
+}
+
+/**
+ * Diagonal counterpart of SMAAArea: looks up the area for a given diagonal
+ * distance 'dist' and crossing edges 'e'.
+ */
+float2 SMAAAreaDiag(SMAATexture2D(areaTex), float2 dist, float2 e, float offset) {
+    float2 cell = mad(float2(SMAA_AREATEX_MAX_DISTANCE_DIAG, SMAA_AREATEX_MAX_DISTANCE_DIAG), e, dist);
+
+    // Scale and bias to map the cell into texel space:
+    float2 uv = mad(SMAA_AREATEX_PIXEL_SIZE, cell, 0.5 * SMAA_AREATEX_PIXEL_SIZE);
+
+    // Pick the sub-texture that matches the subpixel offset:
+    uv.y += SMAA_AREATEX_SUBTEX_SIZE * offset;
+
+    // Diagonal areas live in the second half of the texture:
+    uv.x += 0.5;
+
+    // Fetch the area:
+    return SMAA_AREATEX_SELECT(SMAASampleLevelZero(areaTex, uv));
+}
+
+/**
+ * Searches for diagonal patterns and returns the accumulated area weights.
+ */
+float2 SMAACalculateDiagWeights(SMAATexture2D(edgesTex), SMAATexture2D(areaTex), float2 texcoord, float2 e, float4 subsampleIndices) {
+    float2 weights = float2(0.0, 0.0);
+
+    // Search for the line ends along the (-1, 1) and (1, -1) directions:
+    float4 d;
+    float2 end;
+    if (e.r > 0.0) {
+        d.xz = SMAASearchDiag1(SMAATexturePass2D(edgesTex), texcoord, float2(-1.0,  1.0), end);
+        d.x += float(end.y > 0.9); // one extra step when the last fetched G edge is set
+    } else
+        d.xz = float2(0.0, 0.0);
+    d.yw = SMAASearchDiag1(SMAATexturePass2D(edgesTex), texcoord, float2(1.0, -1.0), end);
+
+    SMAA_BRANCH
+    if (d.x + d.y > 2.0) { // d.x + d.y + 1 > 3
+        // Fetch the crossing edges (two per bilinear fetch, decoded below):
+        float4 coords = mad(float4(-d.x + 0.25, d.x, d.y, -d.y - 0.25), SMAA_RT_METRICS.xyxy, texcoord.xyxy);
+        float4 c;
+        c.xy = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1,  0)).rg;
+        c.zw = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1,  0)).rg;
+        c.yxwz = SMAADecodeDiagBilinearAccess(c.xyzw);
+
+        // Non-optimized version:
+        // float4 coords = mad(float4(-d.x, d.x, d.y, -d.y), SMAA_RT_METRICS.xyxy, texcoord.xyxy);
+        // float4 c;
+        // c.x = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1,  0)).g;
+        // c.y = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2( 0,  0)).r;
+        // c.z = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1,  0)).g;
+        // c.w = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, -1)).r;
+
+        // Merge crossing edges at each side into a single value:
+        float2 cc = mad(float2(2.0, 2.0), c.xz, c.yw);
+
+        // Remove the crossing edge if we didn't find the end of the line:
+        SMAAMovc(bool2(step(0.9, d.zw)), cc, float2(0.0, 0.0));
+
+        // Fetch the areas for this line:
+        weights += SMAAAreaDiag(SMAATexturePass2D(areaTex), d.xy, cc, subsampleIndices.z);
+    }
+
+    // Search for the line ends along the (-1, -1) and (1, 1) directions:
+    d.xz = SMAASearchDiag2(SMAATexturePass2D(edgesTex), texcoord, float2(-1.0, -1.0), end);
+    if (SMAASampleLevelZeroOffset(edgesTex, texcoord, int2(1, 0)).r > 0.0) {
+        d.yw = SMAASearchDiag2(SMAATexturePass2D(edgesTex), texcoord, float2(1.0, 1.0), end);
+        d.y += float(end.y > 0.9); // one extra step when the last decoded G edge is set
+    } else
+        d.yw = float2(0.0, 0.0);
+
+    SMAA_BRANCH
+    if (d.x + d.y > 2.0) { // d.x + d.y + 1 > 3
+        // Fetch the crossing edges:
+        float4 coords = mad(float4(-d.x, -d.x, d.y, d.y), SMAA_RT_METRICS.xyxy, texcoord.xyxy);
+        float4 c;
+        c.x  = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1,  0)).g;
+        c.y  = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2( 0, -1)).r;
+        c.zw = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1,  0)).gr;
+        float2 cc = mad(float2(2.0, 2.0), c.xz, c.yw);
+
+        // Remove the crossing edge if we didn't find the end of the line:
+        SMAAMovc(bool2(step(0.9, d.zw)), cc, float2(0.0, 0.0));
+
+        // Fetch the areas for this line (note the .gr swizzle: the two weights are swapped here):
+        weights += SMAAAreaDiag(SMAATexturePass2D(areaTex), d.xy, cc, subsampleIndices.w).gr;
+    }
+
+    return weights;
+}
+#endif
+
+//-----------------------------------------------------------------------------
+// Horizontal/Vertical Search Functions
+
+/**
+ * Determines how much length to add in the last step of the searches.
+ * Takes the bilinearly interpolated edge 'e' (see @PSEUDO_GATHER4) and
+ * yields 0, 1 or 2, depending on which edges and crossing edges are
+ * active.
+ */
+float SMAASearchLength(SMAATexture2D(searchTex), float2 e, float offset) {
+    // Slope of the lookup: the texture is flipped vertically and the left/right
+    // cases each take half of it horizontally; the add targets texel centers.
+    float2 slope = SMAA_SEARCHTEX_SIZE * float2(0.5, -1.0);
+    slope += float2(-1.0, 1.0);
+
+    // Intercept: 'offset' selects the horizontal half; also nudged to texel centers.
+    float2 intercept = SMAA_SEARCHTEX_SIZE * float2(offset, 1.0);
+    intercept += float2(0.5, -0.5);
+
+    // Pixel coordinates -> texcoords. SMAA_SEARCHTEX_PACKED_SIZE is used
+    // because the texture is cropped.
+    slope *= 1.0 / SMAA_SEARCHTEX_PACKED_SIZE;
+    intercept *= 1.0 / SMAA_SEARCHTEX_PACKED_SIZE;
+
+    // Look the value up in the search texture:
+    return SMAA_SEARCHTEX_SELECT(SMAASampleLevelZero(searchTex, mad(slope, e, intercept)));
+}
+
+/**
+ * Horizontal/vertical search functions for the 2nd pass (this one: leftwards).
+ */
+float SMAASearchXLeft(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) {
+    /**
+     * @PSEUDO_GATHER4
+     * The incoming texcoord was offset by (-0.25, -0.125) in the vertex shader
+     * so that each fetch lands between edges and returns four edges in a row.
+     * Using a different offset per direction makes it possible to tell which
+     * of the four fetched edges are active.
+     */
+    float2 fetched = float2(0.0, 1.0);
+    while (texcoord.x > end &&
+           fetched.g > 0.8281 && // Is there some edge not activated?
+           fetched.r == 0.0) { // Or is there a crossing edge that breaks the line?
+        fetched = SMAASampleLevelZero(edgesTex, texcoord).rg;
+        texcoord = mad(-float2(2.0, 0.0), SMAA_RT_METRICS.xy, texcoord);
+    }
+
+    float lengthCode = SMAASearchLength(SMAATexturePass2D(searchTex), fetched, 0.0);
+    return mad(SMAA_RT_METRICS.x, mad(-(255.0 / 127.0), lengthCode, 3.25), texcoord.x);
+
+    // Non-optimized version:
+    // We correct the previous (-0.25, -0.125) offset we applied:
+    // texcoord.x += 0.25 * SMAA_RT_METRICS.x;
+
+    // The searches are biased by 1, so adjust the coords accordingly:
+    // texcoord.x += SMAA_RT_METRICS.x;
+
+    // Disambiguate the length added by the last step:
+    // texcoord.x += 2.0 * SMAA_RT_METRICS.x; // Undo last step
+    // texcoord.x -= SMAA_RT_METRICS.x * (255.0 / 127.0) * SMAASearchLength(SMAATexturePass2D(searchTex), fetched, 0.0);
+    // return mad(SMAA_RT_METRICS.x, offset, texcoord.x);
+}
+
+float SMAASearchXRight(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) {
+    // Walk right two SMAA_RT_METRICS.x steps per fetch until the line breaks or 'end' is reached.
+    float2 fetched = float2(0.0, 1.0);
+    while (texcoord.x < end && fetched.g > 0.8281 && fetched.r == 0.0) {
+        fetched = SMAASampleLevelZero(edgesTex, texcoord).rg;
+        texcoord = mad(float2(2.0, 0.0), SMAA_RT_METRICS.xy, texcoord);
+    }
+    float lengthCode = SMAASearchLength(SMAATexturePass2D(searchTex), fetched, 0.5);
+    float correction = mad(-(255.0 / 127.0), lengthCode, 3.25);
+    return mad(-SMAA_RT_METRICS.x, correction, texcoord.x);
+}
+
+float SMAASearchYUp(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) {
+    // Walk towards smaller y, two SMAA_RT_METRICS.y steps per fetch, until the line breaks or 'end' is reached.
+    float2 fetched = float2(1.0, 0.0);
+    while (texcoord.y > end && fetched.r > 0.8281 && fetched.g == 0.0) {
+        fetched = SMAASampleLevelZero(edgesTex, texcoord).rg;
+        texcoord = mad(-float2(0.0, 2.0), SMAA_RT_METRICS.xy, texcoord);
+    }
+    float lengthCode = SMAASearchLength(SMAATexturePass2D(searchTex), fetched.gr, 0.0);
+    float correction = mad(-(255.0 / 127.0), lengthCode, 3.25);
+    return mad(SMAA_RT_METRICS.y, correction, texcoord.y);
+}
+
+float SMAASearchYDown(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) {
+    // Walk towards larger y, two SMAA_RT_METRICS.y steps per fetch, until the line breaks or 'end' is reached.
+    float2 fetched = float2(1.0, 0.0);
+    while (texcoord.y < end && fetched.r > 0.8281 && fetched.g == 0.0) {
+        fetched = SMAASampleLevelZero(edgesTex, texcoord).rg;
+        texcoord = mad(float2(0.0, 2.0), SMAA_RT_METRICS.xy, texcoord);
+    }
+    float lengthCode = SMAASearchLength(SMAATexturePass2D(searchTex), fetched.gr, 0.5);
+    float correction = mad(-(255.0 / 127.0), lengthCode, 3.25);
+    return mad(-SMAA_RT_METRICS.y, correction, texcoord.y);
+}
+
+/**
+ * Given the distances and both crossing edges, looks up the areas at each
+ * side of the current edge.
+ */
+float2 SMAAArea(SMAATexture2D(areaTex), float2 dist, float e1, float e2, float offset) {
+    // Rounding prevents precision errors of bilinear filtering:
+    float2 crossing = round(4.0 * float2(e1, e2));
+    float2 cell = mad(float2(SMAA_AREATEX_MAX_DISTANCE, SMAA_AREATEX_MAX_DISTANCE), crossing, dist);
+
+    // Scale and bias to map the cell into texel space:
+    float2 uv = mad(SMAA_AREATEX_PIXEL_SIZE, cell, 0.5 * SMAA_AREATEX_PIXEL_SIZE);
+
+    // Pick the sub-texture that matches the subpixel offset:
+    uv.y = mad(SMAA_AREATEX_SUBTEX_SIZE, offset, uv.y);
+
+    return SMAA_AREATEX_SELECT(SMAASampleLevelZero(areaTex, uv));
+}
+
+//-----------------------------------------------------------------------------
+// Corner Detection Functions
+
+void SMAADetectHorizontalCornerPattern(SMAATexture2D(edgesTex), inout float2 weights, float4 texcoord, float2 d) {
+    #if !defined(SMAA_DISABLE_CORNER_DETECTION)
+    // nearer.x is set when d.x <= d.y, nearer.y when d.y <= d.x (both on a tie).
+    float2 nearer = step(d.xy, d.yx);
+    float2 soften = (1.0 - SMAA_CORNER_ROUNDING_NORM) * nearer;
+    soften /= nearer.x + nearer.y; // Reduce blending for pixels in the center of a line.
+
+    float2 keep = float2(1.0, 1.0);
+    keep.y -= soften.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(0, -2)).r;
+    keep.y -= soften.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(1, -2)).r;
+    keep.x -= soften.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(0,  1)).r;
+    keep.x -= soften.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(1,  1)).r;
+
+    weights *= saturate(keep);
+    #endif
+}
+
+void SMAADetectVerticalCornerPattern(SMAATexture2D(edgesTex), inout float2 weights, float4 texcoord, float2 d) {
+    #if !defined(SMAA_DISABLE_CORNER_DETECTION)
+    // nearer.x is set when d.x <= d.y, nearer.y when d.y <= d.x (both on a tie).
+    float2 nearer = step(d.xy, d.yx);
+    float2 soften = (1.0 - SMAA_CORNER_ROUNDING_NORM) * nearer;
+    soften /= nearer.x + nearer.y;
+
+    float2 keep = float2(1.0, 1.0);
+    keep.y -= soften.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(-2, 0)).g;
+    keep.y -= soften.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(-2, 1)).g;
+    keep.x -= soften.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2( 1, 0)).g;
+    keep.x -= soften.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2( 1, 1)).g;
+
+    weights *= saturate(keep);
+    #endif
+}
+
+//-----------------------------------------------------------------------------
+// Blending Weight Calculation Pixel Shader (Second Pass)
+
+float4 SMAABlendingWeightCalculationPS(float2 texcoord,
+                                       float2 pixcoord,
+                                       float4 offset[3],
+                                       SMAATexture2D(edgesTex),
+                                       SMAATexture2D(areaTex),
+                                       SMAATexture2D(searchTex),
+                                       float4 subsampleIndices) { // Just pass zero for SMAA 1x, see @SUBSAMPLE_INDICES.
+    float4 weights = float4(0.0, 0.0, 0.0, 0.0); // rg: filled by the north-edge branch, ba: by the west-edge branch
+
+    float2 e = SMAASample(edgesTex, texcoord).rg;
+
+    SMAA_BRANCH
+    if (e.g > 0.0) { // Edge at north
+        #if !defined(SMAA_DISABLE_DIAG_DETECTION)
+        // Diagonals have both north and west edges, so searching for them in
+        // one of the boundaries is enough.
+        weights.rg = SMAACalculateDiagWeights(SMAATexturePass2D(edgesTex), SMAATexturePass2D(areaTex), texcoord, e, subsampleIndices);
+
+        // We give priority to diagonals, so if we find a diagonal we skip
+        // horizontal/vertical processing.
+        SMAA_BRANCH
+        if (weights.r == -weights.g) { // weights.r + weights.g == 0.0
+        #endif
+
+        float2 d;
+
+        // Find the distance to the left:
+        float3 coords;
+        coords.x = SMAASearchXLeft(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[0].xy, offset[2].x);
+        coords.y = offset[1].y; // offset[1].y = texcoord.y - 0.25 * SMAA_RT_METRICS.y (@CROSSING_OFFSET)
+        d.x = coords.x;
+
+        // Now fetch the left crossing edges, two at a time using bilinear
+        // filtering. Sampling at -0.25 (see @CROSSING_OFFSET) makes it possible
+        // to discern what value each edge has:
+        float e1 = SMAASampleLevelZero(edgesTex, coords.xy).r;
+
+        // Find the distance to the right:
+        coords.z = SMAASearchXRight(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[0].zw, offset[2].y);
+        d.y = coords.z;
+
+        // We want the distances to be in pixel units (doing this here allows
+        // arithmetic and memory accesses to be interleaved better):
+        d = abs(round(mad(SMAA_RT_METRICS.zz, d, -pixcoord.xx)));
+
+        // SMAAArea below needs a sqrt, as the areas texture is compressed
+        // quadratically:
+        float2 sqrt_d = sqrt(d);
+
+        // Fetch the right crossing edges:
+        float e2 = SMAASampleLevelZeroOffset(edgesTex, coords.zy, int2(1, 0)).r;
+
+        // Ok, we know what this pattern looks like, now it is time to get
+        // the actual area:
+        weights.rg = SMAAArea(SMAATexturePass2D(areaTex), sqrt_d, e1, e2, subsampleIndices.y);
+
+        // Fix corners:
+        coords.y = texcoord.y;
+        SMAADetectHorizontalCornerPattern(SMAATexturePass2D(edgesTex), weights.rg, coords.xyzy, d);
+
+        #if !defined(SMAA_DISABLE_DIAG_DETECTION)
+        } else
+            e.r = 0.0; // Skip vertical processing.
+        #endif
+    }
+
+    SMAA_BRANCH
+    if (e.r > 0.0) { // Edge at west
+        float2 d;
+
+        // Find the distance to the top:
+        float3 coords;
+        coords.y = SMAASearchYUp(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[1].xy, offset[2].z);
+        coords.x = offset[0].x; // NOTE(review): comment used to name offset[1].x; code reads offset[0].x, presumably texcoord.x - 0.25 * SMAA_RT_METRICS.x (confirm in the VS)
+        d.x = coords.y;
+
+        // Fetch the top crossing edges:
+        float e1 = SMAASampleLevelZero(edgesTex, coords.xy).g;
+
+        // Find the distance to the bottom:
+        coords.z = SMAASearchYDown(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[1].zw, offset[2].w);
+        d.y = coords.z;
+
+        // We want the distances to be in pixel units:
+        d = abs(round(mad(SMAA_RT_METRICS.ww, d, -pixcoord.yy)));
+
+        // SMAAArea below needs a sqrt, as the areas texture is compressed
+        // quadratically:
+        float2 sqrt_d = sqrt(d);
+
+        // Fetch the bottom crossing edges:
+        float e2 = SMAASampleLevelZeroOffset(edgesTex, coords.xz, int2(0, 1)).g;
+
+        // Get the area for this direction:
+        weights.ba = SMAAArea(SMAATexturePass2D(areaTex), sqrt_d, e1, e2, subsampleIndices.x);
+
+        // Fix corners:
+        coords.x = texcoord.x;
+        SMAADetectVerticalCornerPattern(SMAATexturePass2D(edgesTex), weights.ba, coords.xyxz, d);
+    }
+
+    return weights;
+}
+
+//-----------------------------------------------------------------------------
+// Neighborhood Blending Pixel Shader (Third Pass)
+
+float4 SMAANeighborhoodBlendingPS(float2 texcoord,
+                                  float4 offset,
+                                  SMAATexture2D(colorTex),
+                                  SMAATexture2D(blendTex)
+                                  #if SMAA_REPROJECTION
+                                  , SMAATexture2D(velocityTex)
+                                  #endif
+                                  ) {
+    // Fetch the blending weights for current pixel:
+    float4 a;
+    a.x = SMAASample(blendTex, offset.xy).a; // Right
+    a.y = SMAASample(blendTex, offset.zw).g; // Top
+    a.wz = SMAASample(blendTex, texcoord).xz; // Bottom / Left
+
+    // Is there any blending weight with a value greater than 0.0?
+    SMAA_BRANCH
+    if (dot(a, float4(1.0, 1.0, 1.0, 1.0)) < 1e-5) { // No: return the color unblended
+        float4 color = SMAASampleLevelZero(colorTex, texcoord);
+
+        #if SMAA_REPROJECTION
+        float2 velocity = SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, texcoord));
+
+        // Pack velocity into the alpha channel:
+        color.a = sqrt(5.0 * length(velocity));
+        #endif
+
+        return color;
+    } else {
+        bool h = max(a.x, a.z) > max(a.y, a.w); // max(horizontal) > max(vertical)
+
+        // Calculate the blending offsets (vertical by default, replaced by horizontal when 'h' is set):
+        float4 blendingOffset = float4(0.0, a.y, 0.0, a.w);
+        float2 blendingWeight = a.yw;
+        SMAAMovc(bool4(h, h, h, h), blendingOffset, float4(a.x, 0.0, a.z, 0.0));
+        SMAAMovc(bool2(h, h), blendingWeight, a.xz);
+        blendingWeight /= dot(blendingWeight, float2(1.0, 1.0)); // Normalize so the two weights sum to 1
+
+        // Calculate the texture coordinates:
+        float4 blendingCoord = mad(blendingOffset, float4(SMAA_RT_METRICS.xy, -SMAA_RT_METRICS.xy), texcoord.xyxy);
+
+        // We exploit bilinear filtering to mix current pixel with the chosen
+        // neighbor:
+        float4 color = blendingWeight.x * SMAASampleLevelZero(colorTex, blendingCoord.xy);
+        color += blendingWeight.y * SMAASampleLevelZero(colorTex, blendingCoord.zw);
+
+        #if SMAA_REPROJECTION
+        // Antialias velocity for proper reprojection in a later stage:
+        float2 velocity = blendingWeight.x * SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, blendingCoord.xy));
+        velocity += blendingWeight.y * SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, blendingCoord.zw));
+
+        // Pack velocity into the alpha channel:
+        color.a = sqrt(5.0 * length(velocity));
+        #endif
+
+        return color;
+    }
+}
+
+//-----------------------------------------------------------------------------
+// Temporal Resolve Pixel Shader (Optional Pass)
+
+float4 SMAAResolvePS(float2 texcoord,
+                     SMAATexture2D(currentColorTex),
+                     SMAATexture2D(previousColorTex)
+                     #if SMAA_REPROJECTION
+                     , SMAATexture2D(velocityTex)
+                     #endif
+                     ) {
+    #if SMAA_REPROJECTION
+    // The velocity is assumed to have been computed for motion blur, so it is
+    // negated here to step back to where the pixel was in the previous frame:
+    float2 backstep = -SMAA_DECODE_VELOCITY(SMAASamplePoint(velocityTex, texcoord).rg);
+
+    // This frame's pixel:
+    float4 now = SMAASamplePoint(currentColorTex, texcoord);
+
+    // Previous frame's pixel at the reprojected coordinates:
+    float4 history = SMAASamplePoint(previousColorTex, texcoord + backstep);
+
+    // Give the history less weight when the packed velocities (alpha) differ:
+    float alphaGap = abs(now.a * now.a - history.a * history.a) / 5.0;
+    float historyWeight = 0.5 * saturate(1.0 - sqrt(alphaGap) * SMAA_REPROJECTION_WEIGHT_SCALE);
+
+    // Mix both frames using that weight:
+    return lerp(now, history, historyWeight);
+    #else
+    // No reprojection: plain 50/50 mix of both frames.
+    float4 now = SMAASamplePoint(currentColorTex, texcoord);
+    float4 history = SMAASamplePoint(previousColorTex, texcoord);
+    return lerp(now, history, 0.5);
+    #endif
+}
+
+//-----------------------------------------------------------------------------
+// Separate Multisamples Pixel Shader (Optional Pass)
+
+#ifdef SMAALoad
+void SMAASeparatePS(float4 position,
+                    float2 texcoord,
+                    out float4 target0,
+                    out float4 target1,
+                    SMAATexture2DMS2(colorTexMS)) {
+    int2 texel = int2(position.xy); // integer coordinate shared by both loads
+    target1 = SMAALoad(colorTexMS, texel, 1);
+    target0 = SMAALoad(colorTexMS, texel, 0);
+}
+#endif
+
+//-----------------------------------------------------------------------------
+#endif // SMAA_INCLUDE_PS
+
+layout(rgba8, binding = 0, set = 3) uniform image2D imgOutput; // written by main() via imageStore
+// Samplers handed to SMAABlendingWeightCalculationPS as edgesTex, areaTex and searchTex respectively:
+layout(binding = 1, set = 2) uniform sampler2D inputImg;
+layout(binding = 3, set = 2) uniform sampler2D samplerArea;
+layout(binding = 4, set = 2) uniform sampler2D samplerSearch;
+layout( binding = 2 ) uniform invResolution
+{
+    vec2 invResolution_data; // NOTE(review): main() divides texel centres by this, so it presumably holds the resolution, not its inverse - confirm against the host code
+};
+
+void main() {
+  // Each invocation processes its own 4x4 tile of output texels.
+  ivec2 tileOrigin = ivec2(gl_GlobalInvocationID.x * 4, gl_GlobalInvocationID.y * 4);
+  for (int dx = 0; dx < 4; dx++) {
+    for (int dy = 0; dy < 4; dy++) {
+      ivec2 texel = tileOrigin + ivec2(dx, dy);
+      vec2 uv = (texel + vec2(0.5)) / invResolution_data;
+
+      // Filled in by SMAABlendingWeightCalculationVS, consumed by the PS routine below.
+      vec2 pixCoord;
+      vec4 offset[3];
+      SMAABlendingWeightCalculationVS(uv, pixCoord, offset);
+
+      vec4 blendWeights = SMAABlendingWeightCalculationPS(uv, pixCoord, offset, inputImg, samplerArea, samplerSearch, ivec4(0));
+
+      imageStore(imgOutput, texel, blendWeights);
+    }
+  }
+}
diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaBlend.spv b/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaBlend.spv
new file mode 100644
index 0000000000000000000000000000000000000000..8efa011f77f3c49ed762e7465ae685ebd01b8348
GIT binary patch
literal 33728
zcmai-2b@*a`L(Yw17h#JjlCDFSOGymQBdq9IKqG^G6(}2D=4N|5-TP#i7kn-L}N78
zSYnDPc1_b5jnNca>>6YFp6AY4%$^hfzi(%D_FC(G-`)2<=UztW+h?8rjlK&t7HKTp
zIH<MJOiMTVG!_Ca+UVCTM~ogfdh?lG6F1*x+bwliqS3eMr_Yl37}i2Qu6@ch4WB}m
zLRVXwyi}F-@5}%CP~He>#iNXCr4Ra!+k426y*p=5*l_ky19zG|sXL?Vzc}TD&S?`V
z;H~tf(U{#nV@mr`Q#-n~7W$+%b(Vr2t5RC$(5W5MCQg|)`Jj#|laKD&y?yG0SyS7)
zrgTm_aNLGzX7;3Q4%_;Ootky8L|j+Li4!_IXDn!MWwhy2O0A`_3R>r+Ni#dTY;V!V
znpJzlIwo{Z>=-tsee%#LQ>RSpXrD1;!i0{QGn<X!)5=_|LD}8UMH}n>|KqmVsft^t
zh}*o18%GNhj<&t6Hk>(ds|8c%*l=l%X=CFiPVShwf5(Z@HUab=XWz!AXcMQ*X!db4
zW5|we8%pQ0r>%kiXIpNTdRtqe)!Q0~mbSJwhTO`I+?mq0hRo<_cT~L^nzU6lrk2JI
z__gEXklZ7K;1j3JWHq|yct^_aI$IhB^{mwrjbU)x$ZlvZSN8^)*=(=(L{{_P+StuH
zd(C4@V=!7zKUy0@;PtT%HICfc=-=3#a^|d~X0{(UeQL)jwkDecsX6vNDSPI5bjP&G
zT}Lk%|AMi%Hbz)SbKM<%_jpHAX1t@oJ>y-Zu`j%*-h)SXOzP@2Osn=>?QI>s#@E`|
z2d#Hot&K5g^)Ze$mLuwZIS5>|x1T5Pe;UNcHpdxPYjc0JdYcEJ)!RJK*k+rHG>!oG
zw0Xz@(?6p@+dIUVT6>40)!RD^t=`_@##Zf302l2I>pWrF|Fmd(<Bh4ccO+W9y>_&E
zdq)}DY;Qp0Sg`GRE}9K_z*)HNALp{AF$tfZz1Y&23@^{b(bmUd^bVRL<`5cp47g{%
zEZLX=x2<6vT^$p;Mt07aa#H8CuJ)<Bcg~pBF=K3dS69c3?$LJNpxZQWo$mGYVbFWL
zEsajJ6Pow$OxkO0Oo!LUdOVsJZYwvrd%CCn13PAPO_}ijZ@s&HFMxXcv(U=+XT$65
zpMcig{xXd-z_$N?K0C(#&$GiDBbmoZ_jx+`f7)qnoPt(=mYj;#GykoP)8O?noQ~GB
zwyljh@ZNi<e`BuNXWlLS8kb_*d-(qQjoN+uL8FH4?{&dk_iKE;M;|$S)QFLuJ@>?`
zlz9lcU*3T`x7s%1IbPa!fXCJjp58vA{kW!YYvWjLy`HBnjkD2u)}wFZ0(ke__ibDR
z?%wk)jf=rOb+t6+*Z8G1enXA>zEQ^CRO9!-o8#aHc8pKSt1-UP%eQai4Q&`5|3|%S
zEsc+B{L>om)2DYoT5G(2jW1E-OV#+&HNIAjuU+Ho)c9`ja@_mLo8#tQ5Le^us>RK!
z@e^wNq#8f9#!s*DFT-bYvh&<J4}R2?_Ud`qw{a~zwnxA{XQZX^7`$BDCwlR|jpvKl
zh5Gi6?E^1kTYB-njb-3zcbi)5w!M6L59!7GHij3mlWMV(d--zj_TsIL6XCtjN^9eM
zw5dF>n-9_IOkILDqjOgCy%wptC$B`C)IOoBlgryX_Sw&fd*L(UUh5Ut(pbNx_u1SK
zp1BwZF6V8V8sDzQhr@gJSxaM&ntjh+yl-Pa_{3@-9tiG!R<$$^@8#FhII_lP!e<^o
zqig)c?sGgFo@d)BHQ&=}d`^v@QRC;;`1v({QH@^(FUNd!&3;WU-nVfRd}1}`TfiCf
zoxS{88u!%rWAJj!Pa0c2&;L~OeWu2rt?}n-{N);dt;XNz#rrlsf=}(7z^+@k`8ja%
z)bS&xb{^F}b=0)kofDd$wnh!B#<*B(qxtUPeY-r^cMtdL>R|SM2QL}TPhQ@$8=y7k
zvgy;Wv5B$MX7ATAvvcY!Kf|H-qfK>FFWdNu?OpBN`(d-nZaV|PJb$ZJw6`?2Ml0`@
zZS-023~Om@SL55)_)f;}-Z?dAZ)f<FY13zQjXKU(u;%!7LF<0$pW!PUTEE7wJ=(bD
z>wMF<rLh}&+5V6k-@V3%_u{RMQSd3pO&&9=i#{~#?AO>A&3BsdWA__AZutJ=M~^v(
zKJ;(wSJU?%K5Wzhd=St!u4Ws_`$6}3`ZxBk*$x=J|9(R-j$l50*WeKd>gK*<$O&oM
z{jOk9@M`3ds9C>t;HDAm4K=RtroZdAcGJ?s+u*S~-`Y#A-8Suyb6?u!+PV6gcGsb_
z%eANf&ZqYH%e9-V-?_CdliWF#o8``<IQ?_(<f-pKxbv|xd0D*lYs0Etyd3}Ek8(xo
z>Q5<FUkko4rJ8vKA_g?|Zr??~^7z_@e(G)1vRGxa4sB-JSccL#$G<%PzbxfiWYZ=U
zUk+^lR)a53X^fhET7lAb)NN-)O8byDw7226I;DM4H?ED+IP22B_5$qO3}~zmw_gj9
z%M`vUJ=y?+e%7~P<)qd|%^OkLo_hL{wzosGZR^)}d&;`6K5fkSu7$0v%l<pwJt(aw
zw(hw8jnReP+qQ7?8C$kzOtSs)ecCxpoNd|nHhOsyrE``stpL_9S>LSNJg~|6Ig8RU
zyPsk|54=#t9m9NbqcLZ<f%wZk+l|H>!@@n=jm8&;xlg_gT8o@``Q>Dj^&9`4CYy6_
z_;;JUv4DTC$*<ULQ+no@t|1$vpEgTlt@{SDZN>Lyu=dhFzPF(}e`)t_FzJEtpLFw~
zsp}rNKI$ps`*73uioxNJH~I7?_pH1`F57y&X`geSXX`Dn^WfQW4c?}74b)Q~U(J%9
zUr_%5_@qgbc8h)WroG3%t(!AveuKtc1y?gCX7i@coL$4GHu)7p5_23}%^aWO!4E(5
z&`#kq;8!m3@{Wm_3D-wGKC{3NKQwwl%xw6TOB}Nx<^;Gt>WMiOyxyo~c1^wKz#XGG
z+J#`}(ft_hdNjFu>be_^G}yWRBiMbCwGjUa?NgF<Xn&fl=01(~BHX<d%{hG&&AQEL
ze{m|dZF96W(d<+DX&Z;5S(ojkpJ&^tM$6FfbGrEhspnj{KI-xLO4DZ^d$fC<zY15=
zFFuzveH!=0XFlA0pdO#EH+|;p9R7`NzDr^*gX^On?K-gQ;kXia2ig&zj9aiTe*q`m
zANw!6`JB}8AY32y_&f^kGq)$^DR`qXc5q@oggfWuURajHV?Ro5Gc@b9{=|+))5rL<
z_jwOqW6pwoe-d2H9G{cBeFlfmZ}P#-^W}P819v^$Betb>E1KMFoO|u3XrB~&?SU#z
zJ#mj#aiz9sOLuJcpY^X)Y29sh-;EofIleM`r~DV)JH<H9!N+j#C7Nnz{{)Vysqa*A
z-^tXpGo&Uq#<4VG(Xce_i-N7+F=njGR`xY$#dkN)jCE}ac1hK2OYZxen)b~rtiN`j
zS-#Ji!hK&Wx$kEt-w8&5y-ycvDn0*a{ps(>%3ojWl=}{6@;T->e1|iI`wmxf-{DH`
zJDe%=a{$~w$J&&qsq^hSotoKa>UK<O`edAI@c|0vW}jR$KSw#Q_R%@^vr@_ZycDkg
z>IL`N@8_l1<$l%)Uje>#!L|E2Cw94?bHerab53|0+|N1T`ujPh<bF;GH{Q=F;l}$p
zMLxt)^Rp1yeK!wn7sl*9>r?6TaFDma)%uc8sn)6$te$%Mfvv}RPCfn6)KkyGV70QI
z0dV!PwB=b@1Z>^vyRt&Ahnm>6@I3y9HCr6)+F8CwpO&Otifq!pbY-`+Kklz(z}9gy
zZTY^n99YeB^C3R+CCJug-D2By&C~7*mCe$2SE5{*Y_i=|D!XOcT@`E{X?HcSTH2Mb
zNVYE96+5r)zct9tLDt7{tXb))e=WG<nnR5CHn2YGsb_7l^<;hYTL(=&^{flFpQ&d(
zus-TppY_4kug$q}|EpPt>$f3TtrPBjvJqHqE`5`43|3nkU;F0Wvjy4v$T_wi{kE+1
zFI90{!Hre7UC)_X{I><GY4c3U2eH#V?>m53rBwHPd8QIKsM3t{tje|dxoJnRzS=yu
z^27}Wr#;W0Jhq`=>oU&sC{LZkz}BU0cS^Z<IHk{$L41PnoqG@X8I;y-9r}7Fdk=f}
z>TjI)fxgy1f^tvtsEWHDdx2dK?bbUQY|I3>zS`B?yY6H6pQZlpk-fp?p4bPj<}=Xt
zwW%4Ov*A9LdmgRNGWCrCmwR<ATrKr!Q#0PPZM^64W1cQMkaLH7&zlw_Z%;oKrd+(j
z-iu2V*xyUAo~0?>b7xV0fvo>LvVHSD_##>Ted_x%rOA6i{Yzvu>k(Vec9dt6tv~hn
z96YDe<8v<D{^;YJ$@OuJsnarj`wG~;<*eGbuaed6+Xa*+`=)+AS<SwQ)9!^}`<8m_
z+eKvc_*@LOZ~E9bxjyzSbz0Wv-}7Yhdj;p;IbW&3evhK=Gsxf5uy20X<Xn5cFCpvi
z`L<tQBhM%M**4lYD$UQgV>!$2-^<AQYSUNS7;M)0O|o{g=icZ0S=6uXTV&hR$8p*}
zwX}5wxZeJil~%TY6`H=<dfFe*?Az67+RY}8Uo%-|+}{T~?lFwo@mxz*Gn>ko#n{#Q
zTvzD>DQ)8#vT?3q8|C$6=h*wxvB{0e*nR-6kL|`vE5~*dn!eiVWBVbRcC)GMXU6s;
z^m1&sz}3vAGA3iYwbC7%ZQM*YE@S&K*s<k)kQ<Y+-43pg?T$(-$95;0zS`<zy9-Ub
zImy17(~o<>nPbPh4E#Q_diLM_m8NcT4<`Pn;EW^j51^?h{%2q{vu(S7_pQ#y&%yVT
zP0@Y<zMrgbv|oa)#}w^Bu$q23N52AFr*#;kPuAquV0~Q^=f>}0^v_s+19mL8k=<`{
zW6C>X5%`*9)9U2h5o>_G3)Z5vzU?S~OI9!M(1$8LK978c&!cGSp78;ckC7iHtDjkE
zPgMHjl#6l8>-Qwt@wJe(C;ln0?P^cEzXPXlj#aK-_WbWFyLCI}Kaj24tnCxVs+MP$
z`(|CTX>GFe(nh%s*?C!yGW+A{O3(gy2Cil{^&!Xi&tU6_@3UYvvq|5tFemo$FJ$e`
zmG5<G&yk%Yvva5RJXv3D8Q)*Qj?KAyo}Byj1+e|op1J!Q*gm<g*^4iN^))%4`lu!5
z?_gtIA}8h_U}J387=6?-_J4wnv5(I0zsUNUwYdgrSp&zmIoY%k*)`aZa^nj7EZd~O
zn-+L8u<Nu1rJoVoC|@P_ulU=P?^HZ{<TbeUWjy}|tC^y`0amlmIiqia_0{$|c|O_k
z+n%=Ef$vr}@3m;}Ra$umzK^D_Hhr~azkC4JZZ_GE^zA=j?WSlSg4MeF+pqcE>3`w+
zYD-_8i*CJ7<0G(hXX$6BPbfbnn|$A}GVPYx4-0`EulvFI^!)ZgQ@?|K+7I3WR`WaC
z_nDu5lqTPgw6&7etXrJA`-81}IdbY=7)|}Is_sR=YJUG~A5)LzyR5qHFABCD`>nri
z*tYt|$d1#t#kTVn<zm>Jo9vOr;hVxUze~W?OwpFCH080hY&}as^ws8^Tc2lOX}Dt>
z!#jk2%fQvlCJSb|dFN|i7C~*G8o+vu$y_Z5w!ZQ#FOO!N$$hDhT4GiJ8*?5xV^|R_
zZ5gAFT4GkJV#@Qsaus8LjnPNVv2K923fQxov9Aj69((tD;c9R-v&rIM*3Nla17f^B
z*6;UtITLGwwVO>Aw!ayrwyfP+V6}_cGw#oI;A(4=Gp=>P_Q}0w9diBLQ=S3$howB{
z#C?-7tyjgTZ|lR=%=Wu4*?UgCyyvz-H<kC?mhe1}w<@q}AAM`EecF~X_ng0TR^D?v
z6x=(rz2cddjp5dxwciA;W{S3Hr77p$+YF+ww({PyeQmkpwg9{5z00C)S!w0Hw-uVc
z+H&uu{efuOO?eO48t$LfXv_L-1GatlzvFYAw}q?Et=4@zu$u9S+a8=a*Ilk(`a1|*
zAH$B7R*qpOG<~&Y4A~1iqiHvrEHb`bAlgmQcC9q!+#|a|^wpNT+x1m<PY$MZKUjKq
z52f6lY|=igvRh{C!@-Wdyu0^+tLN?>0anZ1Jqm2{{?s;-tY+Kd)V(Lzy34zJFSvT{
z?$Kbi0c87_dMtBy?+vyc`>nri*tYr*vg5REvFq+#xewU6$vz(g_UvSS$HLW2(e|x0
z<;?kh5Ph|kclS8BW6RyWKUmFdvMBHF0}#~ggMG7JV=`9<f~~K-yAOgJXL4Waqn4P1
z!N!z#_aX4KWsE**i8&N(OnG-71~(>mw?1mlop<-)VDBUOGL%QayT{&rcYhA9W;R*m
z?j8>@UZ48ieI%N8v&q6fd#`EB+8qT})8_ppAH;d~9dQD9O-l8|swH+J*x20r9dNa^
z$(fT$V8@etU#_3~(evZpvMkToP<ZBJGPs<Nqv2}i^4?c3@BN+WmudTE?tZ`Bp~BwP
zg9^N3g}0;JiEN*CX=2`CD37IX@BNH>8r+!hPPp~?`O$fs4(F%2ACCj*ua9$RUD^}d
z1y)P!iC})3u`>bMPar2&d*=4@U}MWW`6Rgd-0Ds~8LZYuPTVQr#96OgKl|jpej3<$
z$n)TI_^D*|)G-Ha9iETG&4sI{?K8w=;}dr#IB~Wu*UvgU5A(pTQ|9Na%0GMl3vhkZ
z>wEr-Xxhyt3*QOU%e^tIX?0F}KKtxe;K3Dk?S~Y2XocG-cPG0~hEq2Gtc>z(+R6KY
zJ{J_+x-P2mi)(y-jbB>hH`MrzHGWgU+i35;8h@(BUn#i#d!yhB!#^sx_K$1)(;D}8
z%rk!dTMO>l>R)i}{*HO+@9&tG+}|-T`C2vZ@0gc%f5$w$jr#o^^OE~J=Ha&Q@0o`?
zUVqm-T)V$(UUGleyyX6_dCC1<^Ki%O@0!cq=kASj$T<h+f`?)&&+d6>>Un;D1+2C@
z+0S+7Q<{7}Yx^qsdt~>aIA`V(@DTiL*F7c|yT2|Z5325|uYvs@z`d`ZcK2NB`Fh2T
zb??g4|8IcZ-{mv)GPre`jL}CeG2a9mQ$8QR1viGDW{f^+iTO6zw#$3wa=3ct=?bvg
z0P?kbF7yt&60Gifh4YvCt^%juj#J;O(bVTwcjkA%YSxjs?}8KO*yQ^8+;Ux91J@${
z*MQ@nccAa1xd%+vp^sW(t_9cU=Q=cF_-V%IqgJ+S`&qLefQMAR-WPJ)@~*g^oVEWU
zIBTz;cJGhWb92Rwb^hh)=a0blvwSzY1#X=tWAsr=%&lN!d=JQc{1|QwKg}3@)Dm+W
z*qHK++zwZd&mEOdxj*hiQ=eO{=Urgq)U#)PA|~HW_H5ltY4U7oyN9gix{A}@{os0^
zeu}1kdbOV(0IQ|_pMljjC#RmDgX{JD0!=+WzpQ-9u|0^U9-m)*hR?6j)YG5efYtJy
z@NdCt_9ve?9|GG?`(PXT<TKjCV9!zcIrEW9D?g(>il(o&p3i9Af6nt5ns&3PDw<{X
z+~bHtu;tH&o`9>FO;yn>^IiRui0Z!kI|kb^&OPY!PrLPRfc<x1?>f2f+P{bElh2WV
z0IQiz76(^<2K5ZYczt}w^G<o1oY+5u^)Z_)vPREBo+6ib<zL{&nCz=QYKeIcY)pB^
zo`)O5PcudzwT$nt;BtH~z{~Od4X$S9(L}2~<9i9sczrUy7r`0d-@*ErO&00vKOpwC
zyx0E;H^$`n^ifO9zre<n_rc3>WB6&t=%bePdKGMp^}RwV_p^#?`5Jg#O7-}^4%R>W
z>EB>AeOvKuA^UvxJ51};SD(bc1@4aT{w(%3T+M894&(by&G%innwg*GT<0DCy-IV>
zz6rKo{q4gWl-g78```=6S%(k6YUZqA{tV+kXalOe7$b8XKZNts{EVsH_}dx1<NhyL
z`#EIW{)ke|wvGJ+Y=5+UOeq(CN@+}fzr>C#g}?FoX-^#sVN=W4<QZFEa5=UXxSH8S
zSToM|N^PmLA2|K;9aEltwJ_LsbN8bDKHmqRsoMwNLFIW4E&}$m;$X7vG~Zb%)7GNk
zv+2lO=FB-+46bIJ^=VV{d3-O}=Y?hBm&7)NIDLKJm5Y7X^>=sEk7dBy;OF_(PrL8G
zsb|@W8+#G;%QNQXz|LcNzbubtohD=SQA^AUU}MTNyCRw~$jumi)a--z=Stw*FV4lv
z@FA4BBUgdzqpp2*D)J7=^L|xyeN1@{uLf7w*XQsWmEGsCw$;f4$<C`d^U(&jZO672
zST6Q6fxrKpd0iKrdDTz5pA%BgdKJ&S$}>mngPo)D4&DIGI!(stqn4Nr!N!#L+D2%`
zAU9+5QA^CmVAr&K&ToRIo_X37tmgU7JMLz1_4+&R=4k1+<Ma${fu=r}eJtM+tY#hQ
z$5!CPIX1a|?rqn_HE=EBzco1iS>tWcyc<l`p^sW(wyk2y`Pr_D$^7V}R<>*VsbhPv
z?YiGHz8%oi<1?u8Dd%)YH1)aF9P9)(PCfIyGg$p@>UTfx3O2bPwe3Pyb1uYbZ!oyt
zry*$Sc@G<^znS*^e7HNf9y1J0Ju$<<YKhqcoH@_fN1&<4XXI!2j6ze-``(^l`(ypa
zYtQ@MUf}xs-snmz-}m-L(^p&leQzH$?PgQ8C7NZ<-WY5{u;uwY7OrMCl`(nW+ZUU<
z&rZi+JI1-!{JgK-`n|8lfpcH^j<CO+oM-6)U^TPJdUKZ@h-SP#xyyXt%RWB{T_0mI
z?+3&8BfBr{k6gd>=MZq&pF`#3^ye_Jn%PwL=Lj_8^+|sY2d6)u1M6c-f5yY@k7vOC
z$n{Hqjs%zeX_u4JpQFHPW`3xlXMQFi7_U$I<6WQFiRk*69fv9X>Oiw!<+EoJ+!&L6
z)kiHclflN6_r}q1W31m8ebgME>wOH^nDpsbu$tLq5#Onh()T#Hn%Vjtd*0usRa)NP
ztXF^gFojZk>g@!3Uekx^a5XbO&3g0xc05A;{cQ%C?fYj0+Ktcq+f1<byuWpU)oj}}
znGLo-_G=cUTzmp$`Tlky-1j%*_0yg@K3{1Wn>=GX8C;I-6u6q%lyx}?t}S(*22Q_D
z1<SL}bHKj8xz76gteT6aZXZskl;`<=1~~6;wqt$zWWLV?>tnV*ru1VTTKR5w7F^93
z+tQ}yb7)se?|jR|e*vtI-!Y5x9kYGjv)K~&GUfKv<?q*wsIcEHj;yfHfl*}Z-itEd
zC4ZUvd`AyIuj2Mg+qvY-&-q~M$UeFNuI3${-%q*_{#CLz>##nxjO8M*_0Fx{b1w#~
zUC8*-rds?j1zSh_zXn#DPfq=6@&6{+`qQ6pfz>V}r+&5ge;aK5@xL6b7JqGO`g_-3
z0d{^9e<j@Z!mk25=iyhwts`gpJ7D*RXWCf(^~s+*d>3pR`tj3zU(Mb7Jp}D0_kz6K
z3;TAPn*Z%>_QKw1!TVI$eKDrO?u)Tx_r-pc*$3BBclL!oj}+Ya#|p0fiHbYN#$HEu
zKVMI2-Io4&_75m;B%8F~RM{<aHh&1Vf8~4E&1mY*tNZ>(VEdi-saxRssQW(kV@i|n
zQ`&ANtJxQE+PV#Ff6Mo`+u`b|`%bXh9c25QdMxwj+joI&$8qRy8}{QTl<9{)cazoZ
zhdBMX2W&seG2aVUzm>D%cL(=@)%*-&KT?ln`f)$lcI=z}w&5J8-#~W$ZCmVIx~G2%
zcAsR=J^*+AvnPKB*GHY7X8XDKevY8sY;tdB4`|D~-mk#k6Xkb%zpk|MUGFz&`fBTW
z*X#a{;kRho%_iGQpC3ZA&pBrg!*k9a1RJZ5ebZ07bLd`lJ}l+gug+8IejM!D80)^5
zi{10Ch2KxOC!d6SAA8R9({6vX{SKV|JPOwLDYEr=wzOxieh;qC)gLOYoU1>g>8mYs
zmAm3kXxdGV!#+F>x2^2CXTbGy_GdKp^!r(G`fZ)op^x{Ew(|ZtsM#9t>6E#1#ua$~
z3VVMXP+{+m1Iezz!IZgwo~Q2e{#l3@qj2N<6x=#mD(-w5oBQXlVC%N@{`ni_i)54Q
z{ZeJO%)I;^Z2!u$_zyJo%;i79_B+2{^DnqQ>Yn>oC{3O{Z7-A6?29;Uy$ZI!<^A&-
zTs`;C>tMCqKdHwu=l<Vd+i@KF+lKvklQR9#=MA!&{Sc=gZ-MPcIp(+F>bZa30juTy
zNj;Y7$Gc$Lv2Xg@hI63)0@?YuZLxFd{qr8!eUkI`KHT}w{qq4_A9a3u-9P_9&~7%l
zSF;DS<xKq-JRe(m|9n(w<(c{zO<!#}Q@QItLDOzF*;e}eDVlxG{lnBRI5!_+Ggcq_
zrk{5ApL@~yu#{)NI!~#)FTT%LvF>}h^|<HT=tS<HR=D?%=S)BCImi9LImg~V`sV)e
zY-!J2^#|AIYT-&N=V}0&zS=TZxqlWx({9Rl42#0m9hdjfVqn|O9$Xw;KX*%@sb?HZ
zf-?^5vkrZ{kF=Hd(Gktod*4Tgpyh5lw8GvehgI0S<Z!ZU@;S=fNB%u&xqtNW?@9|d
z-oGm?T)Tf)n%s3V*8h&*^<9qAx-GqrR-jywY|_3`Ww*?{tPHk)<sGpKntJAPRj~ce
zov|8RA9e46H7HG<MQy8-)$EHnZLJBmzvVr!7F<1bw}I7iAEh44+yiTaZO3uwZyWYw
zUCQ)BpLNJ;_CuV0tOvFq<(SuptLHx20IZh#DD_yT9~**g$G+)r8_t3H@?_`Vw#Cln
z26S#Cu=^zEZ)3RgpZjPNxIXF}rC#^ZrfAyDCiiOgfVP~m&B66Ewne3tXKYI}eYNF`
z<v!X9O}p7-Tj}#aH2a+UXlr=RjemcHvHI9I{j|IP+>6eKWw~FSr_{Y2*tJPNwg;=3
zZNp@}o(XMY&qMjHGzjip<r&mZd(QZd;PlnIO5fa7o;&TC$DP3SdEB|u%6Z%cO<!%9
z$J|xBqG`|kc{l9_x2^2Q!QlEC9D=5veh&qw-_~gz`gk{KEAOTWv|%dWmBzz!Hyv4F
z@0Ruod#@Ztc6}yN=588J-R0dgso=&>F1U3ZQ*qbD*xXHffUVooyJ;llD6+}DuxDkr
z%)IOcwtwY09gU{$esM1M2HWqvEA0c<N8LMNETzeFscj5d&Ay1!*1ll-Ti#9k!PRp&
zjRULYZc06txfAvW+m7SV-!|;Wft2ZoJ_nH1?1wo0I0$S%$}t}dSI^ya2v{w5Q|hry
zKMn=kj(yYLHk<?X5oG7zw#Ck+chg~D_esv;;c(|acheDYebnoB)928%n@#T3>;Y{#
zQ%8d9XR5u@$}@Eon!eg{rgA4tK+|qE*;e{I5zRj5Zt8$*_dAjCU}N>MZ~AF>|G5{P
z4@-IWtMin)j|RIo>BkhXn%Oo?*6W$jCiXm(?@`CXy_=3C>!&?ud@4A7^={HPca!H%
zd*<;taD5)9Ra!ZZooM=M%RJ_8nvSO3HuD~JJY3x|d%k9XZ9DsOCfIo^_h%QHdd4ve
zoN-v6b?D<=r7d^WBGhv%+0;SK{WPh<&HJUo-YrKLcuIkf0lRinDRXa~NITwJ8OJGb
z<HJvdTSvZcJPododj1>B)4}Q|lk?wB&H+2N=A41EH|ByTqi5fpQTe&=P68XJk9)$t
zX;18#V70`~184qb0<@n%nON-^$5~*<pY{0y+`7v3`666B>+>aW>OU1=9r~n>FN0Hu
zZOV<YP49`b!LCvIe@^9}d*WQUKI-**;yg6%W|RGJU#Mqa7&Aj7={R!s#k2~$FFGsS
zMmfFEj|aOKW>RJ^oKM~Eh44#i{K|@FJQu*NC;Y;S=iFZeS2NpxQ+`+F;!4{R&3g28
z4()@s)OjgbEqm>2V72*V{q3V#*5vD8{mbXmH{j~=xvcUj*Z!Mm>T~&?&-45(uyN|?
z%eTR{m$Ps=SU<D(jpy_VuzuRoKQ*!Qw<e30eq05&zOt`Z!_{rmJ@g&0>nUG`^1E=?
zF?0MqxSHAIJm=3fzmI0TKK9M$;5FdfYuAGHG3EFFuY=nMZRzLrVEbP7^9Hzje$Vg+
zU^U|tcOy7)j$f`{>^FfOf4L4ngsW%#H-pRZ{|H`={}#BK+4DfLVEjLZ7_U#pe=9iS
zzYVO9DdWE#ZXdLzpLc*W4*TW)xD!o1<G%~6W_;rA1}Dz(%k_)>C*b<}--D)}@!tzB
z$A2HZ9RK}rHM6N)|DT~5uTRGR0663SIanW4#{UbrebAPE{u1ms%5{1WuAcG#3an;)
z;(iTIoa2}47yEC(_4WTPntI0n5V#!w!|-zakHFQ;_4R)Y&3Jt>{zt(X|KnhN%JDw|
zw-4IV&nLkdN7nx-H1&-CcVIQ+6Zd;?;vBzRKkKkRf2i!aPyPt^jGCkU3C;HxbDn!o
zqq)wTRCO4qU+R4Z?7g2qPx~`m%{cp^O|ASpES$R&$)@sI(gpXq;<IE{h1)1+7y1cc
zpCzBC%(LTJ>Mx%qFVy&J6;Gf40=J&<=PI6i?s>SH*|D4QEct7tZHZ<*`Z`Ygpe=R&
z4Xl=D$%|mM`DFd=qgvMQC9wYGz4v#xdVKy-`IKklpJ?iHt7pl-z{aVkFE4{_FYEjY
zSU+=~C9i_@)0Y0JiJialS@JsE`pUlk8?J7fK1<#JyPk5NC2zuA$IS6ta5b~ZdCs%s
zZ8YQc$ys_4hn%H%(DgAUcgeeO`=BlJ^&Z&1m;HPnuAZ~>0a(rW#Qg`HIL9y7FZK_?
zj=x-o|H9QX{*S=r_&<i1<NpM%X0G2QpQ0JBPsV>MIOAtZ8~T*vUkJ@UXiGo)fHMyJ
z<+G$OntI0H0#-9VajoFQIexi*vG)Vl*S|lSdd9ynxE%igcsc$>;A&=5x&Dix8Lv;q
z@827m@h=Y6ryTzhaQmPw{ah04ILdWe3QaxZUmC1teBzb?C(iN9^^1L3aDDxkLsQTA
zmj{>QUjbf@e?_>OxxW4@qZzMH#_!*en(?m!)~6i*s&M<DE&W^#oN;9RS4UIN_}2id
z8K1Z{!HIMHa{a8s{;XBmbDy-q>(7$4(dOf8&a-44H2dtc#5#=AFZHeq_TJC4WIec=
zarQ%-TKO#TZaIT&I*FX;$H@ggrNS#vo?77zDNif#=>?us;Wo;-WS=)@Qu@46Tc19Z
z&l~?8k&^pwiIm)bOC;R*?F#-je0afq_U%#d?cn}fB8k`Tza<i`{lJ22KfK`Dk1V+M
znFZHAyWrYSDY$n3U6HcAIW>Mp!HxId6^Xy&J-^`EFDkg>zpCKcuP(UlUsG`H{<|Wn
zU;8Zu*Y3YBQrhn+xc0{iZv2x4*Y3YDQpP`1<ImRka|Jive`6%^j^BS{BwYI&6?ad&
zmo^~h-q;B2zRa1}817!q=lD(F`lx$vc<%B{-4so`*@UTiZrq>Ra=teMtCjcS=5Y0V
zuet?T&HCS`pId=VeqX6=OR}10L!AC@4Yt3I&2u1+ZCkMIYV%CUwRwiO2hXkIJRkBw
zJh8lQb^s5fbY48C)@!_e8Pg!Jb2$%t?#UhD#+qzXAGO5n1UANbNX*V~V={O8s3m3>
zaN4o&K2vu^Q;*MXl}|apgVEIUo!$_zTE0IX3br0~=X)5X$@$i{JNYuQ^DVZmHtZw7
z-f5SUGnXUL)Uz)}fz`~WjBO9Nwjunsk>_A9uyx%=o=493Mx$$U4!w)y{+^^`-y7`j
zp}975$kwS(=3*bP_uD*d#><UKn`6MXc>}qeud(RbQqR6%>(R%0<od)uuCmX=o__C-
zrXHUIz}}tnsygKQ=)VElLExOdx8MiEZ8P(D2w2T*avpOo4@EOxpPb9=sl(8;o9w&y
zwVM5M4UYiV*YI;_>S=pCSk0XG1?%#z*OvKg2WJh-`8*0;n=waH%Cm+Ozz5Nftf6)4
zlX;s6b{_n_YUAa`#NJWZGxw9w)Z;T5?A&Ml<oe{h^P|D@so(FzqfM!_^1Jh6(Dc=&
zueQw5v1r=O&XL*vc!x|yv;FYnz`hqGb{bsGoa9+Gwi8Wz-eabNUHgG#+q7<d@_qF2
zm0cfwqtB>xiiSSUrF~LQzq-I`iJ1kqo;AtZjG0NPZjAf(1aP@;PlT8I_VaKxvvXp7
z`K)ykn(_KLR`2`rv)0Mz`j~9ncLlZd_f)X`(RK=@T$^{p>0tM|w$mu(vCRcrm$o^S
z^6Zr}<;0i!cOII${%26i^L#i9e5~z}?SpMur+%sP3*hz1^YF>J{36^~llAJOmY6Sr
zjj<nz`7+!X=g=5^)Dm+x*ymr__jAzH<8v<9^~qU~>*L(`Ec^=CcC$CX3RW|l()M|9
zZN_<DT>!S;{C6)Gg4N6><Ffwt{d{zNjCbuX2D|sPT|_DOjxgpD@C-`#ne7^@Ph#hT
z)icIR!S(t68k&A4+t5cXF<%GQ=ldII#$?R;sQKrC!zg_oUPhTZz6sutJg;g)?w&6F
zjgxOhX}o3PzXeXbV{;#W8%;g$Mwf%t@@{k`*yOWX+ZALr>lRyg8}{#j^Q^OO*ZR9?
z>RIdWfz`~WjQeW1w%qgI2j`ymUQS=HMc3y1UPCF*J%1he2z+zTTc<u5`}N?f$k~^2
zW76ggVB7RvV_wnb574!xo*Ti|GY=p8A=fANAA-|P`h7E+dVGEazKJ}qsza`i{@!`F
zg3CMa$8hV+Jl+OYGn<^pJgaX<GhQF}w(FSleFvKM%=;~1HT%_u?_J>f8s3ejp0<Ai
zRx?|_U&}ZTcfz%0KJNu*4VS|wb8#QKHe>FglxGd^2T#B^YiOPNWZr%Xb{_JqmKziM
z1BE^9{R~Y#K0gOL_t^(>eezuX1-SmK{$-_=&*}%!^wm~>R{si3yV?0M+aK?tU!&Q6
z_;0}Fv--DiHM8qv&a?U<H0^m-KV10?#K$(RTc12%AF1rtp>Om@E1jaDk85F{)YGrW
z!D@+l0&G1#tF;;P7^S)~?%Sup<-YwLyxh0HhpU-Q?w>rX|A1z^K6zGO!^CCp{t;at
z`(WEog4NRBr@{6|+n*@q+Pvfb40f+;dxlaT+h4%erR`ZtdG^Zl;JL)lBgg-*XzKbu
zM=8&<`UP;F)%L-*tW&?#`8V+TWcP5!_9EO^llAJOmYA2o#(0hr^LMy0nGb!`67vtR
z&+2*P^!=Y`>hbv(*!7uLwJX<0fA8;Cz_y#S|0-C`T%P@x;o6LQi@4Xp)|+Sbzrku|
zlYKT>hvR(>T_59J`!~VvJ#BAL%JZy#3+$h-xX*0YSbY-vHdsAld<X0p%K3g5uAj*^
p^ifO9dthVA`F<a6Oy*l3HJ`OtQu;i!%-nqdwhn*SK<w`t{67JuZruO?

literal 0
HcmV?d00001

diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaEdge.glsl b/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaEdge.glsl
new file mode 100644
index 0000000000..668b97d5dc
--- /dev/null
+++ b/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaEdge.glsl
@@ -0,0 +1,1402 @@
+#version 430 core 
+#define SMAA_GLSL_4 1 
+
+layout (constant_id = 0) const int SMAA_PRESET_LOW = 0; // Specialization constants 0-3: one int per SMAA quality preset, each defaulting to 0.
+layout (constant_id = 1) const int SMAA_PRESET_MEDIUM = 0; // These four are GLSL constants, not preprocessor macros.
+layout (constant_id = 2) const int SMAA_PRESET_HIGH = 0;
+layout (constant_id = 3) const int SMAA_PRESET_ULTRA = 0;
+layout (constant_id = 4) const float METRIC_WIDTH = 1920.0; // Specialization constant 4: render-target width, default 1920.
+layout (constant_id = 5) const float METRIC_HEIGHT = 1080.0; // Specialization constant 5: render-target height, default 1080. Both feed SMAA_RT_METRICS as (1/w, 1/h, w, h).
+
+#define SMAA_RT_METRICS float4(1.0 / METRIC_WIDTH, 1.0 / METRIC_HEIGHT, METRIC_WIDTH, METRIC_HEIGHT)
+
+layout (local_size_x = 16, local_size_y = 16) in;
+/**
+ * Copyright (C) 2013 Jorge Jimenez (jorge@iryoku.com)
+ * Copyright (C) 2013 Jose I. Echevarria (joseignacioechevarria@gmail.com)
+ * Copyright (C) 2013 Belen Masia (bmasia@unizar.es)
+ * Copyright (C) 2013 Fernando Navarro (fernandn@microsoft.com)
+ * Copyright (C) 2013 Diego Gutierrez (diegog@unizar.es)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to
+ * do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software. As clarification, there
+ * is no requirement that the copyright notice and permission be included in
+ * binary distributions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+/**
+ *                  _______  ___  ___       ___           ___
+ *                 /       ||   \/   |     /   \         /   \
+ *                |   (---- |  \  /  |    /  ^  \       /  ^  \
+ *                 \   \    |  |\/|  |   /  /_\  \     /  /_\  \
+ *              ----)   |   |  |  |  |  /  _____  \   /  _____  \
+ *             |_______/    |__|  |__| /__/     \__\ /__/     \__\
+ * 
+ *                               E N H A N C E D
+ *       S U B P I X E L   M O R P H O L O G I C A L   A N T I A L I A S I N G
+ *
+ *                         http://www.iryoku.com/smaa/
+ *
+ * Hi, welcome aboard!
+ * 
+ * Here you'll find instructions to get the shader up and running as fast as
+ * possible.
+ *
+ * IMPORTANT NOTICE: when updating, remember to update both this file and the
+ * precomputed textures! They may change from version to version.
+ *
+ * The shader has three passes, chained together as follows:
+ *
+ *                           |input|------------------ 
+ *                              v                     |
+ *                    [ SMAA*EdgeDetection ]          |
+ *                              v                     |
+ *                          |edgesTex|                |
+ *                              v                     |
+ *              [ SMAABlendingWeightCalculation ]     |
+ *                              v                     |
+ *                          |blendTex|                |
+ *                              v                     |
+ *                [ SMAANeighborhoodBlending ] <------ 
+ *                              v
+ *                           |output|
+ *
+ * Note that each [pass] has its own vertex and pixel shader. Remember to use
+ * oversized triangles instead of quads to avoid overshading along the
+ * diagonal.
+ *
+ * You've three edge detection methods to choose from: luma, color or depth.
+ * They represent different quality/performance and anti-aliasing/sharpness
+ * tradeoffs, so our recommendation is for you to choose the one that best
+ * suits your particular scenario:
+ *
+ * - Depth edge detection is usually the fastest but it may miss some edges.
+ *
+ * - Luma edge detection is usually more expensive than depth edge detection,
+ *   but catches visible edges that depth edge detection can miss.
+ *
+ * - Color edge detection is usually the most expensive one but catches
+ *   chroma-only edges.
+ *
+ * For quickstarters: just use luma edge detection.
+ *
+ * The general advice is to not rush the integration process and ensure each
+ * step is done correctly (don't try to integrate SMAA T2x with predicated edge
+ * detection from the start!). Ok then, let's go!
+ *
+ *  1. The first step is to create two RGBA temporal render targets for holding
+ *     |edgesTex| and |blendTex|.
+ *
+ *     In DX10 or DX11, you can use a RG render target for the edges texture.
+ *     In the case of NVIDIA GPUs, using RG render targets seems to actually be
+ *     slower.
+ *
+ *     On the Xbox 360, you can use the same render target for resolving both
+ *     |edgesTex| and |blendTex|, as they aren't needed simultaneously.
+ *
+ *  2. Both temporal render targets |edgesTex| and |blendTex| must be cleared
+ *     each frame. Do not forget to clear the alpha channel!
+ *
+ *  3. The next step is loading the two supporting precalculated textures,
+ *     'areaTex' and 'searchTex'. You'll find them in the 'Textures' folder as
+ *     C++ headers, and also as regular DDS files. They'll be needed for the
+ *     'SMAABlendingWeightCalculation' pass.
+ *
+ *     If you use the C++ headers, be sure to load them in the format specified
+ *     inside of them.
+ *
+ *     You can also compress 'areaTex' and 'searchTex' using BC5 and BC4
+ *     respectively, if you have that option in your content processor pipeline.
+ *     When compressing them, you get a non-perceptible quality decrease, and a
+ *     marginal performance increase.
+ *
+ *  4. All samplers must be set to linear filtering and clamp.
+ *
+ *     After you get the technique working, remember that 64-bit inputs have
+ *     half-rate linear filtering on GCN.
+ *
+ *     If SMAA is applied to 64-bit color buffers, switching to point filtering
+ *     when accessing them will increase the performance. Search for
+ *     'SMAASamplePoint' to see which textures may benefit from point
+ *     filtering, and where (which is basically the color input in the edge
+ *     detection and resolve passes).
+ *
+ *  5. All texture reads and buffer writes must be non-sRGB, with the exception
+ *     of the input read and the output write in
+ *     'SMAANeighborhoodBlending' (and only in this pass!). If sRGB reads in
+ *     this last pass are not possible, the technique will work anyway, but
+ *     will perform antialiasing in gamma space.
+ *
+ *     IMPORTANT: for best results the input read for the color/luma edge 
+ *     detection should *NOT* be sRGB.
+ *
+ *  6. Before including SMAA.h you'll have to setup the render target metrics,
+ *     the target and any optional configuration defines. Optionally you can
+ *     use a preset.
+ *
+ *     You have the following targets available: 
+ *         SMAA_HLSL_3
+ *         SMAA_HLSL_4
+ *         SMAA_HLSL_4_1
+ *         SMAA_GLSL_3 *
+ *         SMAA_GLSL_4 *
+ *
+ *         * (See SMAA_INCLUDE_VS and SMAA_INCLUDE_PS below).
+ *
+ *     And four presets:
+ *         SMAA_PRESET_LOW          (%60 of the quality)
+ *         SMAA_PRESET_MEDIUM       (%80 of the quality)
+ *         SMAA_PRESET_HIGH         (%95 of the quality)
+ *         SMAA_PRESET_ULTRA        (%99 of the quality)
+ *
+ *     For example:
+ *         #define SMAA_RT_METRICS float4(1.0 / 1280.0, 1.0 / 720.0, 1280.0, 720.0)
+ *         #define SMAA_HLSL_4
+ *         #define SMAA_PRESET_HIGH
+ *         #include "SMAA.h"
+ *
+ *     Note that SMAA_RT_METRICS doesn't need to be a macro, it can be a
+ *     uniform variable. The code is designed to minimize the impact of not
+ *     using a constant value, but it is still better to hardcode it.
+ *
+ *     Depending on how you encoded 'areaTex' and 'searchTex', you may have to
+ *     add (and customize) the following defines before including SMAA.h:
+ *          #define SMAA_AREATEX_SELECT(sample) sample.rg
+ *          #define SMAA_SEARCHTEX_SELECT(sample) sample.r
+ *
+ *     If your engine is already using porting macros, you can define
+ *     SMAA_CUSTOM_SL, and define the porting functions by yourself.
+ *
+ *  7. Then, you'll have to setup the passes as indicated in the scheme above.
+ *     You can take a look into SMAA.fx, to see how we did it for our demo.
+ *     Checkout the function wrappers, you may want to copy-paste them!
+ *
+ *  8. It's recommended to validate the produced |edgesTex| and |blendTex|.
+ *     You can use a screenshot from your engine to compare the |edgesTex|
+ *     and |blendTex| produced inside of the engine with the results obtained
+ *     with the reference demo.
+ *
+ *  9. After you get the last pass to work, it's time to optimize. You'll have
+ *     to initialize a stencil buffer in the first pass (discard is already in
+ *     the code), then mask execution by using it in the second pass. The last
+ *     pass should be executed in all pixels.
+ *
+ *
+ * After this point you can choose to enable predicated thresholding,
+ * temporal supersampling and motion blur integration:
+ *
+ * a) If you want to use predicated thresholding, take a look into
+ *    SMAA_PREDICATION; you'll need to pass an extra texture in the edge
+ *    detection pass.
+ *
+ * b) If you want to enable temporal supersampling (SMAA T2x):
+ *
+ * 1. The first step is to render using subpixel jitters. I won't go into
+ *    detail, but it's as simple as moving each vertex position in the
+ *    vertex shader, you can check how we do it in our DX10 demo.
+ *
+ * 2. Then, you must setup the temporal resolve. You may want to take a look
+ *    into SMAAResolve for resolving 2x modes. After you get it working, you'll
+ *    probably see ghosting everywhere. But fear not, you can enable the
+ *    CryENGINE temporal reprojection by setting the SMAA_REPROJECTION macro.
+ *    Check out SMAA_DECODE_VELOCITY if your velocity buffer is encoded.
+ *
+ * 3. The next step is to apply SMAA to each subpixel jittered frame, just as
+ *    done for 1x.
+ *
+ * 4. At this point you should already have something usable, but for best
+ *    results the proper area textures must be set depending on current jitter.
+ *    For this, the parameter 'subsampleIndices' of
+ *    'SMAABlendingWeightCalculationPS' must be set as follows, for our T2x
+ *    mode:
+ *
+ *    @SUBSAMPLE_INDICES
+ *
+ *    | S# |  Camera Jitter   |  subsampleIndices    |
+ *    +----+------------------+---------------------+
+ *    |  0 |  ( 0.25, -0.25)  |  float4(1, 1, 1, 0)  |
+ *    |  1 |  (-0.25,  0.25)  |  float4(2, 2, 2, 0)  |
+ *
+ *    These jitter positions assume a bottom-to-top y axis. S# stands for the
+ *    sample number.
+ *
+ * More information about temporal supersampling here:
+ *    http://iryoku.com/aacourse/downloads/13-Anti-Aliasing-Methods-in-CryENGINE-3.pdf
+ *
+ * c) If you want to enable spatial multisampling (SMAA S2x):
+ *
+ * 1. The scene must be rendered using MSAA 2x. The MSAA 2x buffer must be
+ *    created with:
+ *      - DX10:     see below (*)
+ *      - DX10.1:   D3D10_STANDARD_MULTISAMPLE_PATTERN or
+ *      - DX11:     D3D11_STANDARD_MULTISAMPLE_PATTERN
+ *
+ *    This allows to ensure that the subsample order matches the table in
+ *    @SUBSAMPLE_INDICES.
+ *
+ *    (*) In the case of DX10, we refer the reader to:
+ *      - SMAA::detectMSAAOrder and
+ *      - SMAA::msaaReorder
+ *
+ *    These functions allow to match the standard multisample patterns by
+ *    detecting the subsample order for a specific GPU, and reordering
+ *    them appropriately.
+ *
+ * 2. A shader must be run to output each subsample into a separate buffer
+ *    (DX10 is required). You can use SMAASeparate for this purpose, or just do
+ *    it in an existing pass (for example, in the tone mapping pass, which has
+ *    the advantage of feeding tone mapped subsamples to SMAA, which will yield
+ *    better results).
+ *
+ * 3. The full SMAA 1x pipeline must be run for each separated buffer, storing
+ *    the results in the final buffer. The second run should alpha blend with
+ *    the existing final buffer using a blending factor of 0.5.
+ *    'subsampleIndices' must be adjusted as in the SMAA T2x case (see point
+ *    b).
+ *
+ * d) If you want to enable temporal supersampling on top of SMAA S2x
+ *    (which actually is SMAA 4x):
+ *
+ * 1. SMAA 4x consists on temporally jittering SMAA S2x, so the first step is
+ *    to calculate SMAA S2x for current frame. In this case, 'subsampleIndices'
+ *    must be set as follows:
+ *
+ *    | F# | S# |   Camera Jitter    |    Net Jitter     |   subsampleIndices   |
+ *    +----+----+--------------------+-------------------+----------------------+
+ *    |  0 |  0 |  ( 0.125,  0.125)  |  ( 0.375, -0.125) |  float4(5, 3, 1, 3)  |
+ *    |  0 |  1 |  ( 0.125,  0.125)  |  (-0.125,  0.375) |  float4(4, 6, 2, 3)  |
+ *    +----+----+--------------------+-------------------+----------------------+
+ *    |  1 |  2 |  (-0.125, -0.125)  |  ( 0.125, -0.375) |  float4(3, 5, 1, 4)  |
+ *    |  1 |  3 |  (-0.125, -0.125)  |  (-0.375,  0.125) |  float4(6, 4, 2, 4)  |
+ *
+ *    These jitter positions assume a bottom-to-top y axis. F# stands for the
+ *    frame number. S# stands for the sample number.
+ *
+ * 2. After calculating SMAA S2x for current frame (with the new subsample
+ *    indices), previous frame must be reprojected as in SMAA T2x mode (see
+ *    point b).
+ *
+ * e) If motion blur is used, you may want to do the edge detection pass
+ *    together with motion blur. This has two advantages:
+ *
+ * 1. Pixels under heavy motion can be omitted from the edge detection process.
+ *    For these pixels we can just store "no edge", as motion blur will take
+ *    care of them.
+ * 2. The center pixel tap is reused.
+ *
+ * Note that in this case depth testing should be used instead of stenciling,
+ * as we have to write all the pixels in the motion blur pass.
+ *
+ * That's it!
+ */
+
+//-----------------------------------------------------------------------------
+// SMAA Presets
+
+/**
+ * Note that if you use one of these presets, the following configuration
+ * macros will be ignored if set in the "Configurable Defines" section.
+ */
+
+#if defined(SMAA_PRESET_LOW) // NOTE(review): defined() only tests preprocessor macros, but SMAA_PRESET_* are declared at the top of this file as specialization constants; presumably no branch of this chain is ever taken and the #ifndef fallbacks below (0.1 / 16 / 8 / 25, same as the HIGH preset) always apply - TODO confirm and rework preset selection.
+#define SMAA_THRESHOLD 0.15
+#define SMAA_MAX_SEARCH_STEPS 4
+#define SMAA_DISABLE_DIAG_DETECTION
+#define SMAA_DISABLE_CORNER_DETECTION
+#elif defined(SMAA_PRESET_MEDIUM)
+#define SMAA_THRESHOLD 0.1
+#define SMAA_MAX_SEARCH_STEPS 8
+#define SMAA_DISABLE_DIAG_DETECTION
+#define SMAA_DISABLE_CORNER_DETECTION
+#elif defined(SMAA_PRESET_HIGH)
+#define SMAA_THRESHOLD 0.1
+#define SMAA_MAX_SEARCH_STEPS 16
+#define SMAA_MAX_SEARCH_STEPS_DIAG 8
+#define SMAA_CORNER_ROUNDING 25
+#elif defined(SMAA_PRESET_ULTRA)
+#define SMAA_THRESHOLD 0.05
+#define SMAA_MAX_SEARCH_STEPS 32
+#define SMAA_MAX_SEARCH_STEPS_DIAG 16
+#define SMAA_CORNER_ROUNDING 25
+#endif // end of preset selection; anything left undefined here is filled in by the #ifndef defaults in "Configurable Defines"
+
+//-----------------------------------------------------------------------------
+// Configurable Defines
+
+/**
+ * SMAA_THRESHOLD specifies the threshold or sensitivity to edges.
+ * Lowering this value you will be able to detect more edges at the expense of
+ * performance. 
+ *
+ * Range: [0, 0.5]
+ *   0.1 is a reasonable value, and allows to catch most visible edges.
+ *   0.05 is a rather overkill value, that allows to catch 'em all.
+ *
+ *   If temporal supersampling is used, 0.2 could be a reasonable value, as low
+ *   contrast edges are properly filtered by just 2x.
+ */
+#ifndef SMAA_THRESHOLD
+#define SMAA_THRESHOLD 0.1
+#endif
+
+/**
+ * SMAA_DEPTH_THRESHOLD specifies the threshold for depth edge detection.
+ * 
+ * Range: depends on the depth range of the scene.
+ */
+#ifndef SMAA_DEPTH_THRESHOLD
+#define SMAA_DEPTH_THRESHOLD (0.1 * SMAA_THRESHOLD)
+#endif
+
+/**
+ * SMAA_MAX_SEARCH_STEPS specifies the maximum steps performed in the
+ * horizontal/vertical pattern searches, at each side of the pixel.
+ *
+ * In number of pixels, it's actually the double. So the maximum line length
+ * perfectly handled by, for example 16, is 64 (by perfectly, we meant that
+ * longer lines won't look as good, but still antialiased).
+ *
+ * Range: [0, 112]
+ */
+#ifndef SMAA_MAX_SEARCH_STEPS
+#define SMAA_MAX_SEARCH_STEPS 16
+#endif
+
+/**
+ * SMAA_MAX_SEARCH_STEPS_DIAG specifies the maximum steps performed in the
+ * diagonal pattern searches, at each side of the pixel. In this case we jump
+ * one pixel at time, instead of two.
+ *
+ * Range: [0, 20]
+ *
+ * On high-end machines it is cheap (between a 0.8x and 0.9x slower for 16 
+ * steps), but it can have a significant impact on older machines.
+ *
+ * Define SMAA_DISABLE_DIAG_DETECTION to disable diagonal processing.
+ */
+#ifndef SMAA_MAX_SEARCH_STEPS_DIAG
+#define SMAA_MAX_SEARCH_STEPS_DIAG 8
+#endif
+
+/**
+ * SMAA_CORNER_ROUNDING specifies how much sharp corners will be rounded.
+ *
+ * Range: [0, 100]
+ *
+ * Define SMAA_DISABLE_CORNER_DETECTION to disable corner processing.
+ */
+#ifndef SMAA_CORNER_ROUNDING
+#define SMAA_CORNER_ROUNDING 25
+#endif
+
+/**
+ * If there is a neighbor edge that has SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR
+ * times bigger contrast than current edge, current edge will be discarded.
+ *
+ * This allows to eliminate spurious crossing edges, and is based on the fact
+ * that, if there is too much contrast in a direction, that will hide
+ * perceptually contrast in the other neighbors.
+ */
+#ifndef SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR
+#define SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR 2.0
+#endif
+
+/**
+ * Predicated thresholding allows to better preserve texture details and to
+ * improve performance, by decreasing the number of detected edges using an
+ * additional buffer like the light accumulation buffer, object ids or even the
+ * depth buffer (the depth buffer usage may be limited to indoor or short range
+ * scenes).
+ *
+ * It locally decreases the luma or color threshold if an edge is found in an
+ * additional buffer (so the global threshold can be higher).
+ *
+ * This method was developed by Playstation EDGE MLAA team, and used in 
+ * Killzone 3, by using the light accumulation buffer. More information here:
+ *     http://iryoku.com/aacourse/downloads/06-MLAA-on-PS3.pptx 
+ */
+#ifndef SMAA_PREDICATION
+#define SMAA_PREDICATION 0
+#endif
+
+/**
+ * Threshold to be used in the additional predication buffer. 
+ *
+ * Range: depends on the input, so you'll have to find the magic number that
+ * works for you.
+ */
+#ifndef SMAA_PREDICATION_THRESHOLD
+#define SMAA_PREDICATION_THRESHOLD 0.01
+#endif
+
+/**
+ * How much to scale the global threshold used for luma or color edge
+ * detection when using predication.
+ *
+ * Range: [1, 5]
+ */
+#ifndef SMAA_PREDICATION_SCALE
+#define SMAA_PREDICATION_SCALE 2.0
+#endif
+
+/**
+ * How much to locally decrease the threshold.
+ *
+ * Range: [0, 1]
+ */
+#ifndef SMAA_PREDICATION_STRENGTH
+#define SMAA_PREDICATION_STRENGTH 0.4
+#endif
+
+/**
+ * Temporal reprojection allows to remove ghosting artifacts when using
+ * temporal supersampling. We use the CryEngine 3 method which also introduces
+ * velocity weighting. This feature is of extreme importance for totally
+ * removing ghosting. More information here:
+ *    http://iryoku.com/aacourse/downloads/13-Anti-Aliasing-Methods-in-CryENGINE-3.pdf
+ *
+ * Note that you'll need to setup a velocity buffer for enabling reprojection.
+ * For static geometry, saving the previous depth buffer is a viable
+ * alternative.
+ */
+#ifndef SMAA_REPROJECTION
+#define SMAA_REPROJECTION 0
+#endif
+
+/**
+ * SMAA_REPROJECTION_WEIGHT_SCALE controls the velocity weighting. It allows to
+ * remove ghosting trails behind the moving object, which are not removed by
+ * just using reprojection. Using low values will exhibit ghosting, while using
+ * high values will disable temporal supersampling under motion.
+ *
+ * Behind the scenes, velocity weighting removes temporal supersampling when
+ * the velocity of the subsamples differs (meaning they are different objects).
+ *
+ * Range: [0, 80]
+ */
+#ifndef SMAA_REPROJECTION_WEIGHT_SCALE
+#define SMAA_REPROJECTION_WEIGHT_SCALE 30.0
+#endif
+
+/**
+ * On some compilers, discard cannot be used in vertex shaders. Thus, they need
+ * to be compiled separately.
+ */
+#ifndef SMAA_INCLUDE_VS
+#define SMAA_INCLUDE_VS 1
+#endif
+#ifndef SMAA_INCLUDE_PS
+#define SMAA_INCLUDE_PS 1
+#endif
+
+//-----------------------------------------------------------------------------
+// Texture Access Defines
+
+#ifndef SMAA_AREATEX_SELECT
+#if defined(SMAA_HLSL_3)
+#define SMAA_AREATEX_SELECT(sample) sample.ra
+#else
+#define SMAA_AREATEX_SELECT(sample) sample.rg
+#endif
+#endif
+
+#ifndef SMAA_SEARCHTEX_SELECT
+#define SMAA_SEARCHTEX_SELECT(sample) sample.r
+#endif
+
+#ifndef SMAA_DECODE_VELOCITY
+#define SMAA_DECODE_VELOCITY(sample) sample.rg
+#endif
+
+//-----------------------------------------------------------------------------
+// Non-Configurable Defines
+
+#define SMAA_AREATEX_MAX_DISTANCE 16
+#define SMAA_AREATEX_MAX_DISTANCE_DIAG 20
+#define SMAA_AREATEX_PIXEL_SIZE (1.0 / float2(160.0, 560.0))
+#define SMAA_AREATEX_SUBTEX_SIZE (1.0 / 7.0)
+#define SMAA_SEARCHTEX_SIZE float2(66.0, 33.0)
+#define SMAA_SEARCHTEX_PACKED_SIZE float2(64.0, 16.0)
+#define SMAA_CORNER_ROUNDING_NORM (float(SMAA_CORNER_ROUNDING) / 100.0)
+
+//-----------------------------------------------------------------------------
+// Porting Functions: per-language mappings for the SMAA* texture, sampling and attribute macros.
+
+#if defined(SMAA_HLSL_3)
+#define SMAATexture2D(tex) sampler2D tex
+#define SMAATexturePass2D(tex) tex
+#define SMAASampleLevelZero(tex, coord) tex2Dlod(tex, float4(coord, 0.0, 0.0))
+#define SMAASampleLevelZeroPoint(tex, coord) tex2Dlod(tex, float4(coord, 0.0, 0.0))
+#define SMAASampleLevelZeroOffset(tex, coord, offset) tex2Dlod(tex, float4(coord + offset * SMAA_RT_METRICS.xy, 0.0, 0.0)) // Offset emulated by shifting the coordinate.
+#define SMAASample(tex, coord) tex2D(tex, coord)
+#define SMAASamplePoint(tex, coord) tex2D(tex, coord)
+#define SMAASampleOffset(tex, coord, offset) tex2D(tex, coord + offset * SMAA_RT_METRICS.xy) // Offset emulated by shifting the coordinate.
+#define SMAA_FLATTEN [flatten]
+#define SMAA_BRANCH [branch]
+#endif
+#if defined(SMAA_HLSL_4) || defined(SMAA_HLSL_4_1)
+SamplerState LinearSampler { Filter = MIN_MAG_LINEAR_MIP_POINT; AddressU = Clamp; AddressV = Clamp; };
+SamplerState PointSampler { Filter = MIN_MAG_MIP_POINT; AddressU = Clamp; AddressV = Clamp; };
+#define SMAATexture2D(tex) Texture2D tex
+#define SMAATexturePass2D(tex) tex
+#define SMAASampleLevelZero(tex, coord) tex.SampleLevel(LinearSampler, coord, 0)
+#define SMAASampleLevelZeroPoint(tex, coord) tex.SampleLevel(PointSampler, coord, 0)
+#define SMAASampleLevelZeroOffset(tex, coord, offset) tex.SampleLevel(LinearSampler, coord, 0, offset)
+#define SMAASample(tex, coord) tex.Sample(LinearSampler, coord)
+#define SMAASamplePoint(tex, coord) tex.Sample(PointSampler, coord)
+#define SMAASampleOffset(tex, coord, offset) tex.Sample(LinearSampler, coord, offset)
+#define SMAA_FLATTEN [flatten]
+#define SMAA_BRANCH [branch]
+#define SMAATexture2DMS2(tex) Texture2DMS<float4, 2> tex
+#define SMAALoad(tex, pos, sample) tex.Load(pos, sample)
+#if defined(SMAA_HLSL_4_1)
+#define SMAAGather(tex, coord) tex.Gather(LinearSampler, coord, 0)
+#endif
+#endif
+#if defined(SMAA_GLSL_3) || defined(SMAA_GLSL_4)
+#define SMAATexture2D(tex) sampler2D tex
+#define SMAATexturePass2D(tex) tex
+#define SMAASampleLevelZero(tex, coord) textureLod(tex, coord, 0.0)
+#define SMAASampleLevelZeroPoint(tex, coord) textureLod(tex, coord, 0.0)
+#define SMAASampleLevelZeroOffset(tex, coord, offset) textureLodOffset(tex, coord, 0.0, offset)
+#define SMAASample(tex, coord) texture(tex, coord)
+#define SMAASamplePoint(tex, coord) texture(tex, coord)
+#define SMAASampleOffset(tex, coord, offset) textureOffset(tex, coord, offset) // texture()'s third argument is an LOD bias, not a texel offset.
+#define SMAA_FLATTEN
+#define SMAA_BRANCH
+#define lerp(a, b, t) mix(a, b, t)
+#define saturate(a) clamp(a, 0.0, 1.0)
+#if defined(SMAA_GLSL_4)
+#define mad(a, b, c) fma(a, b, c)
+#define SMAAGather(tex, coord) textureGather(tex, coord)
+#else
+#define mad(a, b, c) ((a) * (b) + (c)) // Arguments are parenthesized so compound expressions expand safely.
+#endif
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define int2 ivec2
+#define int3 ivec3
+#define int4 ivec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#endif
+
+#if !defined(SMAA_HLSL_3) && !defined(SMAA_HLSL_4) && !defined(SMAA_HLSL_4_1) && !defined(SMAA_GLSL_3) && !defined(SMAA_GLSL_4) && !defined(SMAA_CUSTOM_SL)
+#error you must define the shading language: SMAA_HLSL_*, SMAA_GLSL_* or SMAA_CUSTOM_SL
+#endif
+
+//-----------------------------------------------------------------------------
+// Misc functions
+
+/**
+ * Samples the current pixel together with its left and top neighbours.
+ */
+float3 SMAAGatherNeighbours(float2 texcoord,
+                            float4 offset[3],
+                            SMAATexture2D(tex)) {
+    #ifdef SMAAGather
+    return SMAAGather(tex, texcoord + SMAA_RT_METRICS.xy * float2(-0.5, -0.5)).grb;
+    #else
+    float center = SMAASamplePoint(tex, texcoord).r;
+    float west   = SMAASamplePoint(tex, offset[0].xy).r;
+    float north  = SMAASamplePoint(tex, offset[0].zw).r;
+    return float3(center, west, north);
+    #endif
+}
+
+/**
+ * Derives the per-axis edge threshold from the contrast found in the predication texture.
+ */
+float2 SMAACalculatePredicatedThreshold(float2 texcoord,
+                                        float4 offset[3],
+                                        SMAATexture2D(predicationTex)) {
+    float3 samples = SMAAGatherNeighbours(texcoord, offset, SMAATexturePass2D(predicationTex));
+    float2 contrast = abs(samples.xx - samples.yz);
+    float2 predicated = step(SMAA_PREDICATION_THRESHOLD, contrast);
+    return SMAA_PREDICATION_SCALE * SMAA_THRESHOLD * (1.0 - SMAA_PREDICATION_STRENGTH * predicated);
+}
+
+/** Conditional move: each component of 'variable' takes 'value' where 'cond' is true. */
+void SMAAMovc(bool2 cond, inout float2 variable, float2 value) {
+    SMAA_FLATTEN if (cond.x) { variable.x = value.x; }
+    SMAA_FLATTEN if (cond.y) { variable.y = value.y; }
+}
+
+void SMAAMovc(bool4 cond, inout float4 variable, float4 value) {
+    SMAA_FLATTEN if (cond.x) { variable.x = value.x; }
+    SMAA_FLATTEN if (cond.y) { variable.y = value.y; }
+    SMAA_FLATTEN if (cond.z) { variable.z = value.z; }
+    SMAA_FLATTEN if (cond.w) { variable.w = value.w; }
+}
+
+
+#if SMAA_INCLUDE_VS
+//-----------------------------------------------------------------------------
+// Vertex Shaders
+
+/**
+ * Edge Detection Vertex Shader: precomputes the neighbour coordinates read by the edge detection passes.
+ */
+void SMAAEdgeDetectionVS(float2 texcoord, out float4 offset[3]) {
+    float4 center = texcoord.xyxy;
+    offset[0] = mad(SMAA_RT_METRICS.xyxy, float4(-1.0, 0.0, 0.0, -1.0), center); // xy: left,      zw: top
+    offset[1] = mad(SMAA_RT_METRICS.xyxy, float4( 1.0, 0.0, 0.0,  1.0), center); // xy: right,     zw: bottom
+    offset[2] = mad(SMAA_RT_METRICS.xyxy, float4(-2.0, 0.0, 0.0, -2.0), center); // xy: left-left, zw: top-top
+}
+
+/**
+ * Blend Weight Calculation Vertex Shader
+ * Outputs the scaled coordinate 'pixcoord' plus the search start/end offsets used by the second pass.
+ */
+void SMAABlendingWeightCalculationVS(float2 texcoord, out float2 pixcoord, out float4 offset[3]) {
+    float4 center = texcoord.xyxy;
+    pixcoord = texcoord * SMAA_RT_METRICS.zw;
+
+    // Start coordinates of the four searches (see @PSEUDO_GATHER4):
+    offset[0] = mad(SMAA_RT_METRICS.xyxy, float4(-0.25, -0.125,  1.25, -0.125), center);
+    offset[1] = mad(SMAA_RT_METRICS.xyxy, float4(-0.125, -0.25, -0.125,  1.25), center);
+
+    // Coordinates at which the search loops stop:
+    float4 searchSpan = float4(-2.0, 2.0, -2.0, 2.0) * float(SMAA_MAX_SEARCH_STEPS);
+    float4 searchStart = float4(offset[0].xz, offset[1].yw);
+    offset[2] = mad(SMAA_RT_METRICS.xxyy, searchSpan, searchStart);
+}
+
+/**
+ * Neighborhood Blending Vertex Shader: precomputes the two neighbour coordinates read by the third pass.
+ */
+void SMAANeighborhoodBlendingVS(float2 texcoord, out float4 offset) {
+    float4 shift = float4( 1.0, 0.0, 0.0,  1.0); // xy: +1 step in x, zw: +1 step in y
+    offset = mad(SMAA_RT_METRICS.xyxy, shift, texcoord.xyxy);
+}
+#endif // SMAA_INCLUDE_VS
+
+#if SMAA_INCLUDE_PS
+//-----------------------------------------------------------------------------
+// Edge Detection Pixel Shaders (First Pass)
+
+/**
+ * Luma Edge Detection
+ *
+ * Returns the left/top edge mask, or (-2, -2) when neither edge passes the threshold.
+ * Luma edge detection needs gamma-corrected colors, so 'colorTex' should be a non-sRGB texture.
+ */
+float2 SMAALumaEdgeDetectionPS(float2 texcoord,
+                               float4 offset[3],
+                               SMAATexture2D(colorTex)
+                               #if SMAA_PREDICATION
+                               , SMAATexture2D(predicationTex)
+                               #endif
+                               ) {
+    // Per-axis threshold, adjusted by predication when it is enabled:
+    #if SMAA_PREDICATION
+    float2 threshold = SMAACalculatePredicatedThreshold(texcoord, offset, SMAATexturePass2D(predicationTex));
+    #else
+    float2 threshold = float2(SMAA_THRESHOLD, SMAA_THRESHOLD);
+    #endif
+
+    // Luma of the current pixel and of its left/top neighbours:
+    float3 lumaWeights = float3(0.2126, 0.7152, 0.0722);
+    float lumaCenter = dot(SMAASamplePoint(colorTex, texcoord).rgb, lumaWeights);
+
+    float lumaLeft = dot(SMAASamplePoint(colorTex, offset[0].xy).rgb, lumaWeights);
+    float lumaTop  = dot(SMAASamplePoint(colorTex, offset[0].zw).rgb, lumaWeights);
+
+    // Threshold the contrast against the left and top neighbours:
+    float2 nearLuma = float2(lumaLeft, lumaTop);
+    float2 edgeDelta = abs(lumaCenter - nearLuma);
+    float2 edges = step(threshold, edgeDelta);
+
+    // No edge at all: hand back the (-2, -2) sentinel instead of an edge mask.
+    if (dot(edges, float2(1.0, 1.0)) == 0.0)
+        return float2(-2.0, -2.0);
+
+    // Contrast against the right and bottom neighbours:
+    float lumaRight = dot(SMAASamplePoint(colorTex, offset[1].xy).rgb, lumaWeights);
+    float lumaBottom  = dot(SMAASamplePoint(colorTex, offset[1].zw).rgb, lumaWeights);
+    float2 oppositeDelta = abs(lumaCenter - float2(lumaRight, lumaBottom));
+
+    // Strongest contrast in the direct neighbourhood:
+    float2 maxDelta = max(edgeDelta, oppositeDelta);
+
+    // Contrast of left/top against their own left-left/top-top neighbours:
+    float lumaLeftLeft = dot(SMAASamplePoint(colorTex, offset[2].xy).rgb, lumaWeights);
+    float lumaTopTop = dot(SMAASamplePoint(colorTex, offset[2].zw).rgb, lumaWeights);
+    float2 outerDelta = abs(nearLuma - float2(lumaLeftLeft, lumaTopTop));
+
+    // Strongest contrast overall:
+    maxDelta = max(maxDelta, outerDelta);
+    float finalDelta = max(maxDelta.x, maxDelta.y);
+
+    // Local contrast adaptation: drop edges that are weak next to the strongest one.
+    edges *= step(finalDelta, SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR * edgeDelta);
+
+    return edges;
+}
+
+/**
+ * Color Edge Detection
+ * Returns the left/top edge mask, or (-2, -2) when neither edge passes the threshold.
+ * IMPORTANT NOTICE: color edge detection requires gamma-corrected colors, and
+ * thus 'colorTex' should be a non-sRGB texture.
+ */
+float2 SMAAColorEdgeDetectionPS(float2 texcoord,
+                                float4 offset[3],
+                                SMAATexture2D(colorTex)
+                                #if SMAA_PREDICATION
+                                , SMAATexture2D(predicationTex)
+                                #endif
+                                ) {
+    // Calculate the threshold (the texture is forwarded through SMAATexturePass2D, as in the luma variant):
+    #if SMAA_PREDICATION
+    float2 threshold = SMAACalculatePredicatedThreshold(texcoord, offset, SMAATexturePass2D(predicationTex));
+    #else
+    float2 threshold = float2(SMAA_THRESHOLD, SMAA_THRESHOLD);
+    #endif
+
+    // Calculate color deltas (each delta is the largest per-channel difference):
+    float4 delta;
+    float3 C = SMAASamplePoint(colorTex, texcoord).rgb;
+
+    float3 Cleft = SMAASamplePoint(colorTex, offset[0].xy).rgb;
+    float3 t = abs(C - Cleft);
+    delta.x = max(max(t.r, t.g), t.b);
+
+    float3 Ctop  = SMAASamplePoint(colorTex, offset[0].zw).rgb;
+    t = abs(C - Ctop);
+    delta.y = max(max(t.r, t.g), t.b);
+
+    // We do the usual threshold:
+    float2 edges = step(threshold, delta.xy);
+
+    // No edge at all: return the (-2, -2) sentinel instead of an edge mask.
+    if (dot(edges, float2(1.0, 1.0)) == 0.0)
+        return float2(-2.0, -2.0);
+
+    // Calculate right and bottom deltas:
+    float3 Cright = SMAASamplePoint(colorTex, offset[1].xy).rgb;
+    t = abs(C - Cright);
+    delta.z = max(max(t.r, t.g), t.b);
+
+    float3 Cbottom  = SMAASamplePoint(colorTex, offset[1].zw).rgb;
+    t = abs(C - Cbottom);
+    delta.w = max(max(t.r, t.g), t.b);
+
+    // Calculate the maximum delta in the direct neighborhood:
+    float2 maxDelta = max(delta.xy, delta.zw);
+
+    // Calculate left-left and top-top deltas, measured from the left/top neighbours as in the luma variant:
+    float3 Cleftleft  = SMAASamplePoint(colorTex, offset[2].xy).rgb;
+    t = abs(Cleft - Cleftleft);
+    delta.z = max(max(t.r, t.g), t.b);
+
+    float3 Ctoptop = SMAASamplePoint(colorTex, offset[2].zw).rgb;
+    t = abs(Ctop - Ctoptop);
+    delta.w = max(max(t.r, t.g), t.b);
+
+    // Calculate the final maximum delta:
+    maxDelta = max(maxDelta.xy, delta.zw);
+    float finalDelta = max(maxDelta.x, maxDelta.y);
+
+    // Local contrast adaptation: drop edges that are weak next to the strongest one.
+    edges.xy *= step(finalDelta, SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR * delta.xy);
+
+    return edges;
+}
+
+/**
+ * Depth Edge Detection: left/top edge mask from depth contrast, or (-2, -2) when there is no edge.
+ */
+float2 SMAADepthEdgeDetectionPS(float2 texcoord,
+                                float4 offset[3],
+                                SMAATexture2D(depthTex)) {
+    float3 depth = SMAAGatherNeighbours(texcoord, offset, SMAATexturePass2D(depthTex));
+    float2 contrast = abs(depth.xx - depth.yz);
+    float2 edges = step(SMAA_DEPTH_THRESHOLD, contrast);
+
+    if (dot(edges, float2(1.0, 1.0)) == 0.0) {
+        return float2(-2.0, -2.0); // Sentinel: no edge at this pixel.
+    }
+    return edges;
+}
+
+//-----------------------------------------------------------------------------
+// Diagonal Search Functions
+
+#if !defined(SMAA_DISABLE_DIAG_DETECTION)
+
+/**
+ * Decodes two binary edge values out of a single bilinear-filtered fetch.
+ */
+float2 SMAADecodeDiagBilinearAccess(float2 e) {
+    // The bilinear fetch of 'e' has a 0.25 offset, and we are
+    // interested in the R and G edges:
+    //
+    // +---G---+-------+
+    // |   x o R   x   |
+    // +-------+-------+
+    //
+    // Then, if one of these edges is enabled:
+    //   Red:   (0.75 * X + 0.25 * 1) => 0.25 or 1.0
+    //   Green: (0.75 * 1 + 0.25 * X) => 0.75 or 1.0
+    // The values are unpacked with (mad + mul + round):
+    // wolframalpha.com: round(x * abs(5 * x - 5 * 0.75)) plot 0 to 1
+    float red = e.r;
+    e.r = red * abs(5.0 * red - 5.0 * 0.75);
+    return round(e);
+}
+
+float4 SMAADecodeDiagBilinearAccess(float4 e) {
+    e.rb *= abs(5.0 * e.rb - 5.0 * 0.75);
+    return round(e);
+}
+
+/**
+ * Diagonal pattern search: steps 'texcoord' along 'dir' by SMAA_RT_METRICS.xy per iteration.
+ * Returns (step counter, last edge weight); 'e' receives the last edge sample that was read.
+ */
+float2 SMAASearchDiag1(SMAATexture2D(edgesTex), float2 texcoord, float2 dir, out float2 e) {
+    float4 coord = float4(texcoord, -1.0, 1.0); // xy: sample position, z: step counter, w: edge weight
+    float3 t = float3(SMAA_RT_METRICS.xy, 1.0); // per-iteration increment for xy and for the counter
+    while (coord.z < float(SMAA_MAX_SEARCH_STEPS_DIAG - 1) &&
+           coord.w > 0.9) {
+        coord.xyz = mad(t, float3(dir, 1.0), coord.xyz);
+        e = SMAASampleLevelZero(edgesTex, coord.xy).rg;
+        coord.w = dot(e, float2(0.5, 0.5)); // average of both edges; stays above 0.9 only while both are set
+    }
+    return coord.zw;
+}
+
+float2 SMAASearchDiag2(SMAATexture2D(edgesTex), float2 texcoord, float2 dir, out float2 e) { // Like SMAASearchDiag1, but decodes both edges from one bilinear fetch.
+    float4 coord = float4(texcoord, -1.0, 1.0); // xy: sample position, z: step counter, w: edge weight
+    coord.x += 0.25 * SMAA_RT_METRICS.x; // See @SearchDiag2Optimization
+    float3 t = float3(SMAA_RT_METRICS.xy, 1.0); // per-iteration increment for xy and for the counter
+    while (coord.z < float(SMAA_MAX_SEARCH_STEPS_DIAG - 1) &&
+           coord.w > 0.9) {
+        coord.xyz = mad(t, float3(dir, 1.0), coord.xyz);
+
+        // @SearchDiag2Optimization
+        // Fetch both edges at once using bilinear filtering:
+        e = SMAASampleLevelZero(edgesTex, coord.xy).rg;
+        e = SMAADecodeDiagBilinearAccess(e);
+
+        // Non-optimized version:
+        // e.g = SMAASampleLevelZero(edgesTex, coord.xy).g;
+        // e.r = SMAASampleLevelZeroOffset(edgesTex, coord.xy, int2(1, 0)).r;
+
+        coord.w = dot(e, float2(0.5, 0.5)); // average of both edges; stays above 0.9 only while both are set
+    }
+    return coord.zw;
+}
+
+/**
+ * Diagonal counterpart of SMAAArea: looks up the area for a given diagonal
+ * distance 'dist' and crossing edges 'e'.
+ */
+float2 SMAAAreaDiag(SMAATexture2D(areaTex), float2 dist, float2 e, float offset) {
+    float2 areaCoord = mad(float2(SMAA_AREATEX_MAX_DISTANCE_DIAG, SMAA_AREATEX_MAX_DISTANCE_DIAG), e, dist);
+
+    // Scale and bias to map to texel space:
+    areaCoord = mad(SMAA_AREATEX_PIXEL_SIZE, areaCoord, 0.5 * SMAA_AREATEX_PIXEL_SIZE);
+
+    // The diagonal areas live in the second half of the texture:
+    areaCoord.x += 0.5;
+
+    // Select the sub-texture that matches the subpixel offset:
+    areaCoord.y += SMAA_AREATEX_SUBTEX_SIZE * offset;
+
+    // Fetch the area:
+    return SMAA_AREATEX_SELECT(SMAASampleLevelZero(areaTex, areaCoord));
+}
+
+/**
+ * Searches for diagonal patterns in both diagonal directions and returns the accumulated blending weights.
+ */
+float2 SMAACalculateDiagWeights(SMAATexture2D(edgesTex), SMAATexture2D(areaTex), float2 texcoord, float2 e, float4 subsampleIndices) {
+    float2 weights = float2(0.0, 0.0);
+
+    // Search for the line ends:
+    float4 d; // xy: distances to both line ends, zw: edge weights returned by the searches
+    float2 end;
+    if (e.r > 0.0) {
+        d.xz = SMAASearchDiag1(SMAATexturePass2D(edgesTex), texcoord, float2(-1.0,  1.0), end);
+        d.x += float(end.y > 0.9);
+    } else
+        d.xz = float2(0.0, 0.0);
+    d.yw = SMAASearchDiag1(SMAATexturePass2D(edgesTex), texcoord, float2(1.0, -1.0), end);
+
+    SMAA_BRANCH
+    if (d.x + d.y > 2.0) { // d.x + d.y + 1 > 3
+        // Fetch the crossing edges:
+        float4 coords = mad(float4(-d.x + 0.25, d.x, d.y, -d.y - 0.25), SMAA_RT_METRICS.xyxy, texcoord.xyxy);
+        float4 c;
+        c.xy = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1,  0)).rg;
+        c.zw = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1,  0)).rg;
+        c.yxwz = SMAADecodeDiagBilinearAccess(c.xyzw);
+
+        // Non-optimized version:
+        // float4 coords = mad(float4(-d.x, d.x, d.y, -d.y), SMAA_RT_METRICS.xyxy, texcoord.xyxy);
+        // float4 c;
+        // c.x = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1,  0)).g;
+        // c.y = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2( 0,  0)).r;
+        // c.z = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1,  0)).g;
+        // c.w = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, -1)).r;
+
+        // Merge crossing edges at each side into a single value:
+        float2 cc = mad(float2(2.0, 2.0), c.xz, c.yw);
+
+        // Remove the crossing edge if we didn't find the end of the line:
+        SMAAMovc(bool2(step(0.9, d.zw)), cc, float2(0.0, 0.0));
+
+        // Fetch the areas for this line:
+        weights += SMAAAreaDiag(SMAATexturePass2D(areaTex), d.xy, cc, subsampleIndices.z);
+    }
+
+    // Search for the line ends in the other diagonal direction:
+    d.xz = SMAASearchDiag2(SMAATexturePass2D(edgesTex), texcoord, float2(-1.0, -1.0), end);
+    if (SMAASampleLevelZeroOffset(edgesTex, texcoord, int2(1, 0)).r > 0.0) {
+        d.yw = SMAASearchDiag2(SMAATexturePass2D(edgesTex), texcoord, float2(1.0, 1.0), end);
+        d.y += float(end.y > 0.9);
+    } else
+        d.yw = float2(0.0, 0.0);
+
+    SMAA_BRANCH
+    if (d.x + d.y > 2.0) { // d.x + d.y + 1 > 3
+        // Fetch the crossing edges:
+        float4 coords = mad(float4(-d.x, -d.x, d.y, d.y), SMAA_RT_METRICS.xyxy, texcoord.xyxy);
+        float4 c;
+        c.x  = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1,  0)).g;
+        c.y  = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2( 0, -1)).r;
+        c.zw = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1,  0)).gr;
+        float2 cc = mad(float2(2.0, 2.0), c.xz, c.yw);
+
+        // Remove the crossing edge if we didn't find the end of the line:
+        SMAAMovc(bool2(step(0.9, d.zw)), cc, float2(0.0, 0.0));
+
+        // Fetch the areas for this line (components swapped for this direction):
+        weights += SMAAAreaDiag(SMAATexturePass2D(areaTex), d.xy, cc, subsampleIndices.w).gr;
+    }
+
+    return weights;
+}
+#endif
+
+//-----------------------------------------------------------------------------
+// Horizontal/Vertical Search Functions
+
+/**
+ * Determines how much length to add in the last step of a search. It takes
+ * the bilinearly interpolated edge (see @PSEUDO_GATHER4) and looks up the
+ * search texture, which yields 0, 1 or 2 depending on which edges and
+ * crossing edges are active.
+ */
+float SMAASearchLength(SMAATexture2D(searchTex), float2 e, float offset) {
+    // Scale: the texture is flipped vertically, and the left and right cases
+    // each take half of the space horizontally.
+    float2 texScale = SMAA_SEARCHTEX_SIZE * float2(0.5, -1.0);
+    texScale += float2(-1.0,  1.0); // Adjust to access texel centers.
+    texScale *= 1.0 / SMAA_SEARCHTEX_PACKED_SIZE; // Pixel coordinates to texcoords.
+
+    // Bias: 'offset' selects the horizontal half of the texture.
+    float2 texBias = SMAA_SEARCHTEX_SIZE * float2(offset, 1.0);
+    texBias += float2( 0.5, -0.5); // Adjust to access texel centers.
+    texBias *= 1.0 / SMAA_SEARCHTEX_PACKED_SIZE; // Pixel coordinates to texcoords.
+
+    // (SMAA_SEARCHTEX_PACKED_SIZE is used for the conversion because the
+    // texture is cropped.)
+    float2 lookup = mad(texScale, e, texBias);
+    // Lookup the search texture:
+    return SMAA_SEARCHTEX_SELECT(SMAASampleLevelZero(searchTex, lookup));
+}
+
+/**
+ * Horizontal/vertical search functions for the 2nd pass. This one walks left until 'end' and returns the x coordinate found.
+ */
+float SMAASearchXLeft(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) {
+    /**
+     * @PSEUDO_GATHER4
+     * This texcoord has been offset by (-0.25, -0.125) in the vertex shader to
+     * sample between edges, thus fetching four edges in a row.
+     * Sampling with different offsets in each direction allows us to disambiguate
+     * which edges are active from the four fetched ones.
+     */
+    float2 e = float2(0.0, 1.0); // Initial value that lets the loop condition pass on the first iteration.
+    while (texcoord.x > end && 
+           e.g > 0.8281 && // Is there some edge not activated?
+           e.r == 0.0) { // Or is there a crossing edge that breaks the line?
+        e = SMAASampleLevelZero(edgesTex, texcoord).rg;
+        texcoord = mad(-float2(2.0, 0.0), SMAA_RT_METRICS.xy, texcoord); // Step by 2 * SMAA_RT_METRICS.x to the left.
+    }
+
+    float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.0), 3.25);
+    return mad(SMAA_RT_METRICS.x, offset, texcoord.x);
+
+    // Non-optimized version:
+    // We correct the previous (-0.25, -0.125) offset we applied:
+    // texcoord.x += 0.25 * SMAA_RT_METRICS.x;
+
+    // The searches are biased by 1, so adjust the coords accordingly:
+    // texcoord.x += SMAA_RT_METRICS.x;
+
+    // Disambiguate the length added by the last step:
+    // texcoord.x += 2.0 * SMAA_RT_METRICS.x; // Undo last step
+    // texcoord.x -= SMAA_RT_METRICS.x * (255.0 / 127.0) * SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.0);
+    // return mad(SMAA_RT_METRICS.x, offset, texcoord.x);
+}
+
+float SMAASearchXRight(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { // Mirror of SMAASearchXLeft: walks right until 'end'.
+    float2 e = float2(0.0, 1.0); // Initial value that lets the loop condition pass on the first iteration.
+    while (texcoord.x < end && 
+           e.g > 0.8281 && // Is there some edge not activated?
+           e.r == 0.0) { // Or is there a crossing edge that breaks the line?
+        e = SMAASampleLevelZero(edgesTex, texcoord).rg;
+        texcoord = mad(float2(2.0, 0.0), SMAA_RT_METRICS.xy, texcoord); // Step by 2 * SMAA_RT_METRICS.x to the right.
+    }
+    float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.5), 3.25); // 0.5 selects the other half of the search texture.
+    return mad(-SMAA_RT_METRICS.x, offset, texcoord.x);
+}
+
+float SMAASearchYUp(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { // Vertical counterpart of SMAASearchXLeft: walks towards smaller y until 'end'.
+    float2 e = float2(1.0, 0.0); // Initial value that lets the loop condition pass on the first iteration.
+    while (texcoord.y > end && 
+           e.r > 0.8281 && // Is there some edge not activated?
+           e.g == 0.0) { // Or is there a crossing edge that breaks the line?
+        e = SMAASampleLevelZero(edgesTex, texcoord).rg;
+        texcoord = mad(-float2(0.0, 2.0), SMAA_RT_METRICS.xy, texcoord); // Step by 2 * SMAA_RT_METRICS.y towards smaller y.
+    }
+    float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e.gr, 0.0), 3.25); // Channels are swapped for the vertical case.
+    return mad(SMAA_RT_METRICS.y, offset, texcoord.y);
+}
+
+float SMAASearchYDown(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { // Mirror of SMAASearchYUp: walks towards larger y until 'end'.
+    float2 e = float2(1.0, 0.0); // Initial value that lets the loop condition pass on the first iteration.
+    while (texcoord.y < end && 
+           e.r > 0.8281 && // Is there some edge not activated?
+           e.g == 0.0) { // Or is there a crossing edge that breaks the line?
+        e = SMAASampleLevelZero(edgesTex, texcoord).rg;
+        texcoord = mad(float2(0.0, 2.0), SMAA_RT_METRICS.xy, texcoord); // Step by 2 * SMAA_RT_METRICS.y towards larger y.
+    }
+    float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e.gr, 0.5), 3.25); // Swapped channels; 0.5 selects the other half of the search texture.
+    return mad(-SMAA_RT_METRICS.y, offset, texcoord.y);
+}
+
+/**
+ * Given the distances to both line ends and the two crossing edges, looks up
+ * the areas at each side of the current edge.
+ */
+float2 SMAAArea(SMAATexture2D(areaTex), float2 dist, float e1, float e2, float offset) {
+    // Rounding guards against precision errors of the bilinear filtering:
+    float2 pattern = round(4.0 * float2(e1, e2));
+    float2 areaCoord = mad(float2(SMAA_AREATEX_MAX_DISTANCE, SMAA_AREATEX_MAX_DISTANCE), pattern, dist);
+    // Scale and bias to map to texel space:
+    areaCoord = mad(SMAA_AREATEX_PIXEL_SIZE, areaCoord, 0.5 * SMAA_AREATEX_PIXEL_SIZE);
+
+    // Select the sub-texture that matches the subpixel offset:
+    areaCoord.y = mad(SMAA_AREATEX_SUBTEX_SIZE, offset, areaCoord.y);
+
+    // Fetch the area:
+    return SMAA_AREATEX_SELECT(SMAASampleLevelZero(areaTex, areaCoord));
+}
+
+//-----------------------------------------------------------------------------
+// Corner Detection Functions
+
+void SMAADetectHorizontalCornerPattern(SMAATexture2D(edgesTex), inout float2 weights, float4 texcoord, float2 d) {
+    #if !defined(SMAA_DISABLE_CORNER_DETECTION)
+    float2 nearEnd = step(d.xy, d.yx);
+    float2 strength = (1.0 - SMAA_CORNER_ROUNDING_NORM) * nearEnd;
+    // Both ends equally close (center of a line): halve the strength to reduce blending.
+    strength /= nearEnd.x + nearEnd.y;
+
+    float2 keep = float2(1.0, 1.0);
+    keep.x -= strength.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(0,  1)).r;
+    keep.x -= strength.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(1,  1)).r;
+    keep.y -= strength.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(0, -2)).r;
+    keep.y -= strength.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(1, -2)).r;
+
+    weights *= saturate(keep);
+    #endif
+}
+
+void SMAADetectVerticalCornerPattern(SMAATexture2D(edgesTex), inout float2 weights, float4 texcoord, float2 d) {
+    #if !defined(SMAA_DISABLE_CORNER_DETECTION)
+    float2 nearEnd = step(d.xy, d.yx);
+    float2 strength = (1.0 - SMAA_CORNER_ROUNDING_NORM) * nearEnd;
+    // Both ends equally close (center of a line): halve the strength to reduce blending.
+    strength /= nearEnd.x + nearEnd.y;
+
+    float2 keep = float2(1.0, 1.0);
+    keep.x -= strength.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2( 1, 0)).g;
+    keep.x -= strength.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2( 1, 1)).g;
+    keep.y -= strength.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(-2, 0)).g;
+    keep.y -= strength.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(-2, 1)).g;
+
+    weights *= saturate(keep);
+    #endif
+}
+
+//-----------------------------------------------------------------------------
+// Blending Weight Calculation Pixel Shader (Second Pass)
+
+float4 SMAABlendingWeightCalculationPS(float2 texcoord,
+                                       float2 pixcoord,
+                                       float4 offset[3],
+                                       SMAATexture2D(edgesTex),
+                                       SMAATexture2D(areaTex),
+                                       SMAATexture2D(searchTex),
+                                       float4 subsampleIndices) { // Just pass zero for SMAA 1x, see @SUBSAMPLE_INDICES.
+    float4 weights = float4(0.0, 0.0, 0.0, 0.0); // rg: filled for a north edge, ba: filled for a west edge.
+
+    float2 e = SMAASample(edgesTex, texcoord).rg;
+
+    SMAA_BRANCH
+    if (e.g > 0.0) { // Edge at north
+        #if !defined(SMAA_DISABLE_DIAG_DETECTION)
+        // Diagonals have both north and west edges, so searching for them in
+        // one of the boundaries is enough.
+        weights.rg = SMAACalculateDiagWeights(SMAATexturePass2D(edgesTex), SMAATexturePass2D(areaTex), texcoord, e, subsampleIndices);
+
+        // We give priority to diagonals, so if we find a diagonal we skip 
+        // horizontal/vertical processing.
+        SMAA_BRANCH
+        if (weights.r == -weights.g) { // weights.r + weights.g == 0.0
+        #endif
+
+        float2 d;
+
+        // Find the distance to the left:
+        float3 coords;
+        coords.x = SMAASearchXLeft(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[0].xy, offset[2].x);
+        coords.y = offset[1].y; // offset[1].y = texcoord.y - 0.25 * SMAA_RT_METRICS.y (@CROSSING_OFFSET)
+        d.x = coords.x;
+
+        // Now fetch the left crossing edges, two at a time using bilinear
+        // filtering. Sampling at -0.25 (see @CROSSING_OFFSET) makes it possible
+        // to discern what value each edge has:
+        float e1 = SMAASampleLevelZero(edgesTex, coords.xy).r;
+
+        // Find the distance to the right:
+        coords.z = SMAASearchXRight(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[0].zw, offset[2].y);
+        d.y = coords.z;
+
+        // We want the distances to be in pixel units (doing this here allows
+        // better interleaving of arithmetic and memory accesses):
+        d = abs(round(mad(SMAA_RT_METRICS.zz, d, -pixcoord.xx)));
+
+        // SMAAArea below needs a sqrt, as the areas texture is compressed
+        // quadratically:
+        float2 sqrt_d = sqrt(d);
+
+        // Fetch the right crossing edges:
+        float e2 = SMAASampleLevelZeroOffset(edgesTex, coords.zy, int2(1, 0)).r;
+
+        // Ok, we know what this pattern looks like, now it is time to get
+        // the actual area:
+        weights.rg = SMAAArea(SMAATexturePass2D(areaTex), sqrt_d, e1, e2, subsampleIndices.y);
+
+        // Fix corners:
+        coords.y = texcoord.y;
+        SMAADetectHorizontalCornerPattern(SMAATexturePass2D(edgesTex), weights.rg, coords.xyzy, d);
+
+        #if !defined(SMAA_DISABLE_DIAG_DETECTION)
+        } else
+            e.r = 0.0; // Skip vertical processing.
+        #endif
+    }
+
+    SMAA_BRANCH
+    if (e.r > 0.0) { // Edge at west
+        float2 d;
+
+        // Find the distance to the top:
+        float3 coords;
+        coords.y = SMAASearchYUp(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[1].xy, offset[2].z);
+        coords.x = offset[0].x; // offset[0].x = texcoord.x - 0.25 * SMAA_RT_METRICS.x;
+        d.x = coords.y;
+
+        // Fetch the top crossing edges:
+        float e1 = SMAASampleLevelZero(edgesTex, coords.xy).g;
+
+        // Find the distance to the bottom:
+        coords.z = SMAASearchYDown(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[1].zw, offset[2].w);
+        d.y = coords.z;
+
+        // We want the distances to be in pixel units:
+        d = abs(round(mad(SMAA_RT_METRICS.ww, d, -pixcoord.yy)));
+
+        // SMAAArea below needs a sqrt, as the areas texture is compressed 
+        // quadratically:
+        float2 sqrt_d = sqrt(d);
+
+        // Fetch the bottom crossing edges:
+        float e2 = SMAASampleLevelZeroOffset(edgesTex, coords.xz, int2(0, 1)).g;
+
+        // Get the area for this direction:
+        weights.ba = SMAAArea(SMAATexturePass2D(areaTex), sqrt_d, e1, e2, subsampleIndices.x);
+
+        // Fix corners:
+        coords.x = texcoord.x;
+        SMAADetectVerticalCornerPattern(SMAATexturePass2D(edgesTex), weights.ba, coords.xyxz, d);
+    }
+
+    return weights;
+}
+
+//-----------------------------------------------------------------------------
+// Neighborhood Blending Pixel Shader (Third Pass)
+
+float4 SMAANeighborhoodBlendingPS(float2 texcoord,
+                                  float4 offset,
+                                  SMAATexture2D(colorTex),
+                                  SMAATexture2D(blendTex)
+                                  #if SMAA_REPROJECTION
+                                  , SMAATexture2D(velocityTex)
+                                  #endif
+                                  ) {
+    // Fetch the blending weights for current pixel:
+    float4 a;
+    a.x = SMAASample(blendTex, offset.xy).a; // Right
+    a.y = SMAASample(blendTex, offset.zw).g; // Top
+    a.wz = SMAASample(blendTex, texcoord).xz; // Bottom / Left
+
+    // Is there any blending weight with a value greater than 0.0?
+    SMAA_BRANCH
+    if (dot(a, float4(1.0, 1.0, 1.0, 1.0)) < 1e-5) { // No: return the unblended color.
+        float4 color = SMAASampleLevelZero(colorTex, texcoord);
+
+        #if SMAA_REPROJECTION
+        float2 velocity = SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, texcoord));
+
+        // Pack velocity into the alpha channel:
+        color.a = sqrt(5.0 * length(velocity));
+        #endif
+
+        return color;
+    } else {
+        bool h = max(a.x, a.z) > max(a.y, a.w); // max(horizontal) > max(vertical)
+
+        // Calculate the blending offsets (vertical by default, horizontal when 'h' is set):
+        float4 blendingOffset = float4(0.0, a.y, 0.0, a.w);
+        float2 blendingWeight = a.yw;
+        SMAAMovc(bool4(h, h, h, h), blendingOffset, float4(a.x, 0.0, a.z, 0.0));
+        SMAAMovc(bool2(h, h), blendingWeight, a.xz);
+        blendingWeight /= dot(blendingWeight, float2(1.0, 1.0)); // Normalize so that both weights sum to 1.
+
+        // Calculate the texture coordinates:
+        float4 blendingCoord = mad(blendingOffset, float4(SMAA_RT_METRICS.xy, -SMAA_RT_METRICS.xy), texcoord.xyxy);
+
+        // We exploit bilinear filtering to mix current pixel with the chosen
+        // neighbor:
+        float4 color = blendingWeight.x * SMAASampleLevelZero(colorTex, blendingCoord.xy);
+        color += blendingWeight.y * SMAASampleLevelZero(colorTex, blendingCoord.zw);
+
+        #if SMAA_REPROJECTION
+        // Antialias velocity for proper reprojection in a later stage:
+        float2 velocity = blendingWeight.x * SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, blendingCoord.xy));
+        velocity += blendingWeight.y * SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, blendingCoord.zw));
+
+        // Pack velocity into the alpha channel:
+        color.a = sqrt(5.0 * length(velocity));
+        #endif
+
+        return color;
+    }
+}
+
+//-----------------------------------------------------------------------------
+// Temporal Resolve Pixel Shader (Optional Pass)
+
+float4 SMAAResolvePS(float2 texcoord,
+                     SMAATexture2D(currentColorTex),
+                     SMAATexture2D(previousColorTex)
+                     #if SMAA_REPROJECTION
+                     , SMAATexture2D(velocityTex)
+                     #endif
+                     ) {
+    #if SMAA_REPROJECTION
+    // Velocity is assumed to be calculated for motion blur, so we need to invert it for reprojection.
+    // The whole sample is handed to SMAA_DECODE_VELOCITY, matching the other call sites of the macro:
+    float2 velocity = -SMAA_DECODE_VELOCITY(SMAASamplePoint(velocityTex, texcoord));
+
+    // Fetch current pixel:
+    float4 current = SMAASamplePoint(currentColorTex, texcoord);
+
+    // Reproject current coordinates and fetch previous pixel:
+    float4 previous = SMAASamplePoint(previousColorTex, texcoord + velocity);
+
+    // Attenuate the previous pixel if the velocity is different (alpha holds sqrt(5 * |velocity|)):
+    float delta = abs(current.a * current.a - previous.a * previous.a) / 5.0;
+    float weight = 0.5 * saturate(1.0 - sqrt(delta) * SMAA_REPROJECTION_WEIGHT_SCALE);
+
+    // Blend the pixels according to the calculated weight:
+    return lerp(current, previous, weight);
+    #else
+    // Without reprojection, return an even blend of the current and previous pixels:
+    float4 current = SMAASamplePoint(currentColorTex, texcoord);
+    float4 previous = SMAASamplePoint(previousColorTex, texcoord);
+    return lerp(current, previous, 0.5);
+    #endif
+}
+
+//-----------------------------------------------------------------------------
+// Separate Multisamples Pixel Shader (Optional Pass)
+
+#ifdef SMAALoad // compiled only when the porting layer defines the SMAALoad macro
+void SMAASeparatePS(float4 position, // only .xy is read; truncated to the integer texel to load
+                    float2 texcoord, // not read by this function
+                    out float4 target0, // receives sample index 0
+                    out float4 target1, // receives sample index 1
+                    SMAATexture2DMS2(colorTexMS)) {
+    int2 pos = int2(position.xy);
+    target0 = SMAALoad(colorTexMS, pos, 0);
+    target1 = SMAALoad(colorTexMS, pos, 1);
+}
+#endif
+
+//-----------------------------------------------------------------------------
+#endif // SMAA_INCLUDE_PS
+
+layout(rgba8, binding = 0, set = 3) uniform image2D imgOutput; // destination written by main() through imageStore
+// Colour input handed to SMAAColorEdgeDetectionPS by main():
+layout(binding = 1, set = 2) uniform sampler2D inputImg;
+layout( binding = 2 ) uniform invResolution // NOTE(review): main() divides texel centres by this value, so despite the name it presumably holds the size in texels - confirm against the host code
+{
+    vec2 invResolution_data;
+};
+
+void main()
+{
+    ivec2 tileOrigin = ivec2(gl_GlobalInvocationID.xy) * 4; // each invocation covers a 4x4 texel tile; kept in integers to avoid float round trips
+    for (int i = 0; i < 4; i++)
+    {
+        for (int j = 0; j < 4; j++)
+        {
+            ivec2 texelCoord = tileOrigin + ivec2(i, j);
+            vec2 coord = (vec2(texelCoord) + 0.5) / invResolution_data; // texel centre scaled by the uniform; NOTE(review): no check against imageSize(imgOutput) - confirm the dispatch size makes that safe
+            vec4 offset[3];
+            SMAAEdgeDetectionVS(coord, offset);
+            vec2 oColor = SMAAColorEdgeDetectionPS(coord, offset, inputImg);
+            if (oColor != float2(-2.0, -2.0)) // (-2, -2) appears to be the "nothing to write" sentinel - confirm in SMAAColorEdgeDetectionPS
+            {
+              imageStore(imgOutput, texelCoord, vec4(oColor, 0.0, 1.0));
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaEdge.spv b/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaEdge.spv
new file mode 100644
index 0000000000000000000000000000000000000000..1062a9e3abd0624aec557d24c06e1a161dec6874
GIT binary patch
literal 8464
zcmZ9Q33OG}6^1X6m!RNOkwH<3D54-DqKJqPl8{7$K@tSC7N6k-Vv{^d9+X<!;za3W
zCp%f&+IhCxTJ5ZzTdSSt(c00DcCFT_)%N>t&aRhtmfQ31e^2L}eeS(4Oc+1ADVs1Z
zJ20D=Z5o&5<FIUeCY+Qt=5t4PZ})=H+Li^2&seC#!P$h|PoG2ZDKR%7!{xyd4UZy+
z!pj|&WA!Z0HG$ai;9ChU;Zck$u?qcr*R-^>Z`oRDtJEs}wZZDhMZI%(4J=x=Yhdvu
zr(e2^xDs`<tXA3GU#;%g0?cz+qdG7!TB)fYlpTp~uGZ>Mb;o`Q*Y1~abkS+c#&VRh
zX@L5?)A!3;%4VSVo3pR7dn~RoI~K5}y>DGt>*ftzZGD}=H$TmHws&=O_MyA}Rm`<*
zJ1V2wszYv|#JO}Zx8}GZ>lP1H2G~k&>t!Cp4cSKUmda4AoZA|*0p`|PbvqNj4cV2<
z70z!o>HC;lcMNXbmY+*QcC-5B)mp7OjDIP+4Sl%0yDg>6xjXf-{{Pp|klias9ZMPa
zAo{@INO>qlx`%(l4ORQGPR#GY)}hTEL)FX6LtP`gs{LiV+0}-RzKz_0!GsS1Uy<Wd
zHXTe4DnqSf-E~iM;G)?j?$&%`M|Q2Nj8=zsnw8i_YRWtIvu)l|_OPtEAlc1%I{4}P
z(vU4i-(KEP9wuJ<nRWV7{qxJDy_B7eJ~*;{XRT{^t2K>TRu{8;KfRQ78Z*4LXJ?JL
zJf<=0h4Y?nUc0Wnx4mz3ch3fLHD!Hu{F?T*uJvoI)0C~RvvqQ}#@5i3T~udV-`%&a
z1!D*6^PXY`jp@xv_ZimivpWfyiF8sa&T}@pqk}V)^BLCP^E@uMJgu8E?4FVKqOKh}
zkM}&Z`$jRJx^{=UXTyK_Q1?vK6?M-+9Qob5dgR-P;(mN)564TtW~|!9Q$dZ)vyf@z
zYhqT{HVb_svz(&+0A}r;-x2)(K;~J9V_wp&Xa1S!lbMZ?Gv7hX=8|vbd=F-}mUXnv
z!257!Ym7sRBlM%lJ_Un*<~$-f$u$!>mDw7RQ@d-mr?U|AxK{O}5r?`pkIA(?D;6c)
z`;ui3w}sxE>o0B#{}Xb3&)uQV$@MIo&kEf4iHJV(ir6*JV|I=D#9B@PyS9j31a>_U
zD-Pc!V9zb;o&#pw6S2!t<RVrazOCT<@4IjL{QODEGT%pXial;;Zba;%YhBIkTIJ1Q
z&o?0s?H4m^Hzw?tg568#Ww7gwntQ<RN11}<8s@O=1-stjnjb=EtaN?<0=R})-y>ja
z$uGmM?sF{X`W{Qzm{`xh!QNr>jmL2&*!AgOLU%uM`a2&4r@!`T$!?r>_k9wWu_E?x
zGP;~{3@i1WVNT;u0q1AI?0VIGk3{`b6V_k5_t*D`!}Yd+Jum%zURI;KhU5Yo8`@*d
z9m%M#In{mFIK12L(RYm_bl){a-FHn<_g&+NGu?phZ;<({#$NdDkyE@6XL8?SO{jVQ
z9)l@?U6aqN-_O{$eefGu)csC|ZXf(shHkvy$<TeD_>Bu)yWhCbwfl_=UAy17qVD%B
zbp8FFskg9d@73docg^QvIeYMqJ%PwyhmZPy66c+}DxLL{iH|%_p_|8Ziah@X%SRp?
zELY4k4o-fpXUoh_+05MXRcg7`hQy!CiF>~4V$b$y^vCHz6WCrjKSJCA%m*S4?URz-
z+4Xr}CWFm!JGH%!2ZQB&R-V8|eIjBmbBnEO52Nm($>wa`XD}a*IIKG**`1^A5nyvf
z-Kk)?sH=V$VlL~7J!jujGr*of^vmbwNI3b(e-zk#d4Aewg5|Czhx*ZAW8_EDSsVkF
zk2=SKtrPt;ZWf$;)M*C0kEnATST5?!1{)(Eb&dziN54-1TU*|<^?8u9#(3rv5xKYv
za}#F`@5VfIIp0^V{Z!_Ki0>nNWFGxaOZ>hRw+P)>dF%Sz$%X$JU^#6*bLvYuJ)i$2
zVE@jM_xUr%-vehQ-58%+^~iS)IP&>Ssz=<pVEv8r`BY!Zs$KUA@R7{MX!AX)pU=6r
z$iEUScNhC>VP1vE?Irdm*600hP5h(8w}TztbNMzz&if`d?`q~Qq%-OE<2=NEMBVej
z*1eFr-jQywoFlk3iIZN-uKdk#0gArb=Aio>?Wd;otfAfKC)VVw?z80@on3>^aSzyg
zxQ8{UuLaBbd&INSCTG0QsqsEr>b@h?og?3c;6^0!tpm$NK5cTw$9u<j#FN>$>?|a{
z6W$SrzY&+P4&TrI-7EH8?6)Rg3$yn!f5yO`g}>W^J1%j}%yE9R!Ltx;`fBUpWIfyC
z5$%eDrRBpp`ZgErS|a8ou$<y3#!P1A<~=#_{;speiHLDNcm5unkGLnFJI`I+nApbx
zaQ)s+O`PkGy)6XmtF3--r@?7g9L06U-cE-v?rkx;oZ=|P#NN(GynC~TvvIMvGr{gH
z?tr>6vA46p^?O^IxZ>W<hSOJD{oedX4DE`exX#$yx$woktw5Jk9L1Q}+seedCu=N6
zjElXsfZdzF$?co^N~Cz7eHZy2a!f_yK2J;7`#e2ifA7seyxT`H$8V!mVDqT2X6~rd
zyX*Acg6{p=SkR3hDCj<0R~B^b`|9+Y3%dTd6?FaYtkdr;=of<@OuA>}UfK}*GKIMl
zanI4GF0f~_F}?531M4F{9+`vm@Lc&f$@z$O#bFQiv5se?J?g9hTPJ#V0oZeoZ=KI+
z4<g?~ZO?mc;^ZCXk9A!LHm5%I>skk=U2zoGWv{fmF89!fxYp3ugY~(VcZ2qe5Pjq)
z^QM#AfS9uZ(e8fjxxD-H%r8OYHX-J2WPT<hA3o0l>vJg*KAXYvlQ{unE<>y}7cpLY
z#9t0JzKleCKUhBEw}2!5GO+fDuYkRquF*ZH>u)aa##W@*vu)`0JsX6RkDgrtmQ(D@
z1SER)Y_K`?sqfhkoOZ?Gn)PvA1I*f8m*+o>6ni#;u21x=ntbG=XWPN%v}f9*XYz3#
z&jHu>YzLft_>3l>V$W)D@^R)n!ExrsYmfL{;QF3D7fwFncY`C|T-qc4d0=~HefOZQ
zzh~3L{Cs3p(tW0`LiCURydZJ%4(nLg*cT>!A+ve(wTAB}ZBgSzV7VL9xBqIe+#ci(
zxNDfNMdaro{%*UT+2QXtZPy`kzH7x#G4D;feB8m8fQxtVrRdX<Q<9H+d>JAidwhA~
z<Q?uY?%*rH=G3SD4&DH#U2zx}cW@tAd)&cSf{S<XRp|P}9eg!dANjb0uK}CWJE%SG
zpnUZAwP5eZCd68e%r_$P;qyAMKE*rudUW}?LvH|EYc68E_K1HY*!VIMHQof4kNBIw
z5pOQ-5&veecglG8pss)P>@DD8&)$k&-?O*D$w$v_0m~_#dGWpRc64*zg4Fly9dOze
z$2cT<b~9Le^z5DBV$a@%u21yr-C%v>qi62{o70|YkDkfLdAt`~-?R6@$%oJTlTWc{
zAApmOGyfpiTK3F%?GgVWaDC4{3@0D)w}K<yT-qc4BVc<r32_hV`g=A$L$@Q^qaPnl
zoV>%l;rlUg(f1B?ImNTLcdn_4`Qu2e$(;IH|F&d{J98I$@y>h#eLAuPiMpQz%f}i&
z1(s7B)`{o*(_nMz<C^9m@w??SVC{;-xVSTSg0)+-o%yp!@y>h>U7xr!cPAhDxHI>F
z&FP)d9(U#*aP;HzVEefVv6j#I7ZCaI`65`K;+^>tx_tEf%V2AHXN=b#@m~QOUq+(F
zSHbcT|21&Ln@fAde;sU}jdu^~`kQMC^EZ$gi2n1K??dW)c7NidXWvAZQ#^C;O7!eo
zU~}rDt=O|~qia_ju2~<~buY8_=-GF`#h!f^U7zUL_rUtdN6)?wHm5z)9zBzf^Y{VS
z^DOr4hv@R*^P}Wb?Aedu<m1eL0**5`UVFs<6kOl4pTWsT{LjG=Z!YZ-{|m4^i|1Tj
zf6u0g`Im^#p1RNduMqvCKMy2MKKf%_V}FfAUGqGEcuwZi7HjwoSgv?ZzeSf1pWh{)
z;yL{uPCoYe2XNFgUc2=kWY*>$8kzry$i<%i1TOCB&**ZB!<;v8XWi3Z5bX~k{@b41
uUlDUEvG>1$ja!IlH!jZf@8Bzu>r?#0U^!!4uQs{x{|8vk-#lXf?d5;9EWxh;

literal 0
HcmV?d00001

diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaNeighbour.glsl b/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaNeighbour.glsl
new file mode 100644
index 0000000000..df30d727b3
--- /dev/null
+++ b/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaNeighbour.glsl
@@ -0,0 +1,1403 @@
+#version 430 core 
+#define SMAA_GLSL_4 1 
+// Specialization constants (constant_id 0-5); the initialisers below are only the defaults:
+layout (constant_id = 0) const int SMAA_PRESET_LOW = 0;
+layout (constant_id = 1) const int SMAA_PRESET_MEDIUM = 0;
+layout (constant_id = 2) const int SMAA_PRESET_HIGH = 0;
+layout (constant_id = 3) const int SMAA_PRESET_ULTRA = 0;
+layout (constant_id = 4) const float METRIC_WIDTH = 1920.0;
+layout (constant_id = 5) const float METRIC_HEIGHT = 1080.0;
+// (1 / width, 1 / height, width, height), built from METRIC_WIDTH and METRIC_HEIGHT:
+#define SMAA_RT_METRICS float4(1.0 / METRIC_WIDTH, 1.0 / METRIC_HEIGHT, METRIC_WIDTH, METRIC_HEIGHT)
+// Work group size: 16 x 16 invocations.
+layout (local_size_x = 16, local_size_y = 16) in;
+/**
+ * Copyright (C) 2013 Jorge Jimenez (jorge@iryoku.com)
+ * Copyright (C) 2013 Jose I. Echevarria (joseignacioechevarria@gmail.com)
+ * Copyright (C) 2013 Belen Masia (bmasia@unizar.es)
+ * Copyright (C) 2013 Fernando Navarro (fernandn@microsoft.com)
+ * Copyright (C) 2013 Diego Gutierrez (diegog@unizar.es)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * this software and associated documentation files (the "Software"), to deal in
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to
+ * do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software. As clarification, there
+ * is no requirement that the copyright notice and permission be included in
+ * binary distributions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+/**
+ *                  _______  ___  ___       ___           ___
+ *                 /       ||   \/   |     /   \         /   \
+ *                |   (---- |  \  /  |    /  ^  \       /  ^  \
+ *                 \   \    |  |\/|  |   /  /_\  \     /  /_\  \
+ *              ----)   |   |  |  |  |  /  _____  \   /  _____  \
+ *             |_______/    |__|  |__| /__/     \__\ /__/     \__\
+ * 
+ *                               E N H A N C E D
+ *       S U B P I X E L   M O R P H O L O G I C A L   A N T I A L I A S I N G
+ *
+ *                         http://www.iryoku.com/smaa/
+ *
+ * Hi, welcome aboard!
+ * 
+ * Here you'll find instructions to get the shader up and running as fast as
+ * possible.
+ *
+ * IMPORTANT NOTICE: when updating, remember to update both this file and the
+ * precomputed textures! They may change from version to version.
+ *
+ * The shader has three passes, chained together as follows:
+ *
+ *                           |input|------------------ 
+ *                              v                     |
+ *                    [ SMAA*EdgeDetection ]          |
+ *                              v                     |
+ *                          |edgesTex|                |
+ *                              v                     |
+ *              [ SMAABlendingWeightCalculation ]     |
+ *                              v                     |
+ *                          |blendTex|                |
+ *                              v                     |
+ *                [ SMAANeighborhoodBlending ] <------ 
+ *                              v
+ *                           |output|
+ *
+ * Note that each [pass] has its own vertex and pixel shader. Remember to use
+ * oversized triangles instead of quads to avoid overshading along the
+ * diagonal.
+ *
+ * You've three edge detection methods to choose from: luma, color or depth.
+ * They represent different quality/performance and anti-aliasing/sharpness
+ * tradeoffs, so our recommendation is for you to choose the one that best
+ * suits your particular scenario:
+ *
+ * - Depth edge detection is usually the fastest but it may miss some edges.
+ *
+ * - Luma edge detection is usually more expensive than depth edge detection,
+ *   but catches visible edges that depth edge detection can miss.
+ *
+ * - Color edge detection is usually the most expensive one but catches
+ *   chroma-only edges.
+ *
+ * For quickstarters: just use luma edge detection.
+ *
+ * The general advice is to not rush the integration process and ensure each
+ * step is done correctly (don't try to integrate SMAA T2x with predicated edge
+ * detection from the start!). Ok then, let's go!
+ *
+ *  1. The first step is to create two RGBA temporal render targets for holding
+ *     |edgesTex| and |blendTex|.
+ *
+ *     In DX10 or DX11, you can use a RG render target for the edges texture.
+ *     In the case of NVIDIA GPUs, using RG render targets seems to actually be
+ *     slower.
+ *
+ *     On the Xbox 360, you can use the same render target for resolving both
+ *     |edgesTex| and |blendTex|, as they aren't needed simultaneously.
+ *
+ *  2. Both temporal render targets |edgesTex| and |blendTex| must be cleared
+ *     each frame. Do not forget to clear the alpha channel!
+ *
+ *  3. The next step is loading the two supporting precalculated textures,
+ *     'areaTex' and 'searchTex'. You'll find them in the 'Textures' folder as
+ *     C++ headers, and also as regular DDS files. They'll be needed for the
+ *     'SMAABlendingWeightCalculation' pass.
+ *
+ *     If you use the C++ headers, be sure to load them in the format specified
+ *     inside of them.
+ *
+ *     You can also compress 'areaTex' and 'searchTex' using BC5 and BC4
+ *     respectively, if you have that option in your content processor pipeline.
+ *     When compressing then, you get a non-perceptible quality decrease, and a
+ *     marginal performance increase.
+ *
+ *  4. All samplers must be set to linear filtering and clamp.
+ *
+ *     After you get the technique working, remember that 64-bit inputs have
+ *     half-rate linear filtering on GCN.
+ *
+ *     If SMAA is applied to 64-bit color buffers, switching to point filtering
+ *     when accessing them will increase the performance. Search for
+ *     'SMAASamplePoint' to see which textures may benefit from point
+ *     filtering, and where (which is basically the color input in the edge
+ *     detection and resolve passes).
+ *
+ *  5. All texture reads and buffer writes must be non-sRGB, with the exception
+ *     of the input read and the output write in
+ *     'SMAANeighborhoodBlending' (and only in this pass!). If sRGB reads in
+ *     this last pass are not possible, the technique will work anyway, but
+ *     will perform antialiasing in gamma space.
+ *
+ *     IMPORTANT: for best results the input read for the color/luma edge 
+ *     detection should *NOT* be sRGB.
+ *
+ *  6. Before including SMAA.h you'll have to setup the render target metrics,
+ *     the target and any optional configuration defines. Optionally you can
+ *     use a preset.
+ *
+ *     You have the following targets available: 
+ *         SMAA_HLSL_3
+ *         SMAA_HLSL_4
+ *         SMAA_HLSL_4_1
+ *         SMAA_GLSL_3 *
+ *         SMAA_GLSL_4 *
+ *
+ *         * (See SMAA_INCLUDE_VS and SMAA_INCLUDE_PS below).
+ *
+ *     And four presets:
+ *         SMAA_PRESET_LOW          (%60 of the quality)
+ *         SMAA_PRESET_MEDIUM       (%80 of the quality)
+ *         SMAA_PRESET_HIGH         (%95 of the quality)
+ *         SMAA_PRESET_ULTRA        (%99 of the quality)
+ *
+ *     For example:
+ *         #define SMAA_RT_METRICS float4(1.0 / 1280.0, 1.0 / 720.0, 1280.0, 720.0)
+ *         #define SMAA_HLSL_4
+ *         #define SMAA_PRESET_HIGH
+ *         #include "SMAA.h"
+ *
+ *     Note that SMAA_RT_METRICS doesn't need to be a macro, it can be a
+ *     uniform variable. The code is designed to minimize the impact of not
+ *     using a constant value, but it is still better to hardcode it.
+ *
+ *     Depending on how you encoded 'areaTex' and 'searchTex', you may have to
+ *     add (and customize) the following defines before including SMAA.h:
+ *          #define SMAA_AREATEX_SELECT(sample) sample.rg
+ *          #define SMAA_SEARCHTEX_SELECT(sample) sample.r
+ *
+ *     If your engine is already using porting macros, you can define
+ *     SMAA_CUSTOM_SL, and define the porting functions by yourself.
+ *
+ *  7. Then, you'll have to setup the passes as indicated in the scheme above.
+ *     You can take a look into SMAA.fx, to see how we did it for our demo.
+ *     Checkout the function wrappers, you may want to copy-paste them!
+ *
+ *  8. It's recommended to validate the produced |edgesTex| and |blendTex|.
+ *     You can use a screenshot from your engine to compare the |edgesTex|
+ *     and |blendTex| produced inside of the engine with the results obtained
+ *     with the reference demo.
+ *
+ *  9. After you get the last pass to work, it's time to optimize. You'll have
+ *     to initialize a stencil buffer in the first pass (discard is already in
+ *     the code), then mask execution by using it the second pass. The last
+ *     pass should be executed in all pixels.
+ *
+ *
+ * After this point you can choose to enable predicated thresholding,
+ * temporal supersampling and motion blur integration:
+ *
+ * a) If you want to use predicated thresholding, take a look into
+ *    SMAA_PREDICATION; you'll need to pass an extra texture in the edge
+ *    detection pass.
+ *
+ * b) If you want to enable temporal supersampling (SMAA T2x):
+ *
+ * 1. The first step is to render using subpixel jitters. I won't go into
+ *    detail, but it's as simple as moving each vertex position in the
+ *    vertex shader, you can check how we do it in our DX10 demo.
+ *
+ * 2. Then, you must setup the temporal resolve. You may want to take a look
+ *    into SMAAResolve for resolving 2x modes. After you get it working, you'll
+ *    probably see ghosting everywhere. But fear not, you can enable the
+ *    CryENGINE temporal reprojection by setting the SMAA_REPROJECTION macro.
+ *    Check out SMAA_DECODE_VELOCITY if your velocity buffer is encoded.
+ *
+ * 3. The next step is to apply SMAA to each subpixel jittered frame, just as
+ *    done for 1x.
+ *
+ * 4. At this point you should already have something usable, but for best
+ *    results the proper area textures must be set depending on current jitter.
+ *    For this, the parameter 'subsampleIndices' of
+ *    'SMAABlendingWeightCalculationPS' must be set as follows, for our T2x
+ *    mode:
+ *
+ *    @SUBSAMPLE_INDICES
+ *
+ *    | S# |  Camera Jitter   |  subsampleIndices    |
+ *    +----+------------------+---------------------+
+ *    |  0 |  ( 0.25, -0.25)  |  float4(1, 1, 1, 0)  |
+ *    |  1 |  (-0.25,  0.25)  |  float4(2, 2, 2, 0)  |
+ *
+ *    These jitter positions assume a bottom-to-top y axis. S# stands for the
+ *    sample number.
+ *
+ * More information about temporal supersampling here:
+ *    http://iryoku.com/aacourse/downloads/13-Anti-Aliasing-Methods-in-CryENGINE-3.pdf
+ *
+ * c) If you want to enable spatial multisampling (SMAA S2x):
+ *
+ * 1. The scene must be rendered using MSAA 2x. The MSAA 2x buffer must be
+ *    created with:
+ *      - DX10:     see below (*)
+ *      - DX10.1:   D3D10_STANDARD_MULTISAMPLE_PATTERN or
+ *      - DX11:     D3D11_STANDARD_MULTISAMPLE_PATTERN
+ *
+ *    This allows to ensure that the subsample order matches the table in
+ *    @SUBSAMPLE_INDICES.
+ *
+ *    (*) In the case of DX10, we refer the reader to:
+ *      - SMAA::detectMSAAOrder and
+ *      - SMAA::msaaReorder
+ *
+ *    These functions allow to match the standard multisample patterns by
+ *    detecting the subsample order for a specific GPU, and reordering
+ *    them appropriately.
+ *
+ * 2. A shader must be run to output each subsample into a separate buffer
+ *    (DX10 is required). You can use SMAASeparate for this purpose, or just do
+ *    it in an existing pass (for example, in the tone mapping pass, which has
+ *    the advantage of feeding tone mapped subsamples to SMAA, which will yield
+ *    better results).
+ *
+ * 3. The full SMAA 1x pipeline must be run for each separated buffer, storing
+ *    the results in the final buffer. The second run should alpha blend with
+ *    the existing final buffer using a blending factor of 0.5.
+ *    'subsampleIndices' must be adjusted as in the SMAA T2x case (see point
+ *    b).
+ *
+ * d) If you want to enable temporal supersampling on top of SMAA S2x
+ *    (which actually is SMAA 4x):
+ *
+ * 1. SMAA 4x consists on temporally jittering SMAA S2x, so the first step is
+ *    to calculate SMAA S2x for current frame. In this case, 'subsampleIndices'
+ *    must be set as follows:
+ *
+ *    | F# | S# |   Camera Jitter    |    Net Jitter     |   subsampleIndices   |
+ *    +----+----+--------------------+-------------------+----------------------+
+ *    |  0 |  0 |  ( 0.125,  0.125)  |  ( 0.375, -0.125) |  float4(5, 3, 1, 3)  |
+ *    |  0 |  1 |  ( 0.125,  0.125)  |  (-0.125,  0.375) |  float4(4, 6, 2, 3)  |
+ *    +----+----+--------------------+-------------------+----------------------+
+ *    |  1 |  2 |  (-0.125, -0.125)  |  ( 0.125, -0.375) |  float4(3, 5, 1, 4)  |
+ *    |  1 |  3 |  (-0.125, -0.125)  |  (-0.375,  0.125) |  float4(6, 4, 2, 4)  |
+ *
+ *    These jitter positions assume a bottom-to-top y axis. F# stands for the
+ *    frame number. S# stands for the sample number.
+ *
+ * 2. After calculating SMAA S2x for current frame (with the new subsample
+ *    indices), previous frame must be reprojected as in SMAA T2x mode (see
+ *    point b).
+ *
+ * e) If motion blur is used, you may want to do the edge detection pass
+ *    together with motion blur. This has two advantages:
+ *
+ * 1. Pixels under heavy motion can be omitted from the edge detection process.
+ *    For these pixels we can just store "no edge", as motion blur will take
+ *    care of them.
+ * 2. The center pixel tap is reused.
+ *
+ * Note that in this case depth testing should be used instead of stenciling,
+ * as we have to write all the pixels in the motion blur pass.
+ *
+ * That's it!
+ */
+
+//-----------------------------------------------------------------------------
+// SMAA Presets
+
+/**
+ * Note that if you use one of these presets, the following configuration
+ * macros will be ignored if set in the "Configurable Defines" section.
+ */
+// NOTE(review): this shader declares SMAA_PRESET_* as specialization constants (constant_id 0-3), not macros; defined() only tests macros, so unless they are also #defined at build time no branch below is taken and the defaults apply - confirm.
+#if defined(SMAA_PRESET_LOW)
+#define SMAA_THRESHOLD 0.15
+#define SMAA_MAX_SEARCH_STEPS 4
+#define SMAA_DISABLE_DIAG_DETECTION
+#define SMAA_DISABLE_CORNER_DETECTION
+#elif defined(SMAA_PRESET_MEDIUM)
+#define SMAA_THRESHOLD 0.1
+#define SMAA_MAX_SEARCH_STEPS 8
+#define SMAA_DISABLE_DIAG_DETECTION
+#define SMAA_DISABLE_CORNER_DETECTION
+#elif defined(SMAA_PRESET_HIGH)
+#define SMAA_THRESHOLD 0.1
+#define SMAA_MAX_SEARCH_STEPS 16
+#define SMAA_MAX_SEARCH_STEPS_DIAG 8
+#define SMAA_CORNER_ROUNDING 25
+#elif defined(SMAA_PRESET_ULTRA)
+#define SMAA_THRESHOLD 0.05
+#define SMAA_MAX_SEARCH_STEPS 32
+#define SMAA_MAX_SEARCH_STEPS_DIAG 16
+#define SMAA_CORNER_ROUNDING 25
+#endif
+
+//-----------------------------------------------------------------------------
+// Configurable Defines
+
+/**
+ * SMAA_THRESHOLD specifies the threshold or sensitivity to edges.
+ * Lowering this value you will be able to detect more edges at the expense of
+ * performance. 
+ *
+ * Range: [0, 0.5]
+ *   0.1 is a reasonable value, and allows to catch most visible edges.
+ *   0.05 is a rather overkill value, that allows to catch 'em all.
+ *
+ *   If temporal supersampling is used, 0.2 could be a reasonable value, as low
+ *   contrast edges are properly filtered by just 2x.
+ */
+#ifndef SMAA_THRESHOLD
+#define SMAA_THRESHOLD 0.1
+#endif
+
+/**
+ * SMAA_DEPTH_THRESHOLD specifies the threshold for depth edge detection.
+ * 
+ * Range: depends on the depth range of the scene.
+ */
+#ifndef SMAA_DEPTH_THRESHOLD
+#define SMAA_DEPTH_THRESHOLD (0.1 * SMAA_THRESHOLD)
+#endif
+
+/**
+ * SMAA_MAX_SEARCH_STEPS specifies the maximum steps performed in the
+ * horizontal/vertical pattern searches, at each side of the pixel.
+ *
+ * In number of pixels, it's actually the double. So the maximum line length
+ * perfectly handled by, for example 16, is 64 (by perfectly, we meant that
+ * longer lines won't look as good, but still antialiased).
+ *
+ * Range: [0, 112]
+ */
+#ifndef SMAA_MAX_SEARCH_STEPS
+#define SMAA_MAX_SEARCH_STEPS 16
+#endif
+
+/**
+ * SMAA_MAX_SEARCH_STEPS_DIAG specifies the maximum steps performed in the
+ * diagonal pattern searches, at each side of the pixel. In this case we jump
+ * one pixel at time, instead of two.
+ *
+ * Range: [0, 20]
+ *
+ * On high-end machines it is cheap (between a 0.8x and 0.9x slower for 16 
+ * steps), but it can have a significant impact on older machines.
+ *
+ * Define SMAA_DISABLE_DIAG_DETECTION to disable diagonal processing.
+ */
+#ifndef SMAA_MAX_SEARCH_STEPS_DIAG
+#define SMAA_MAX_SEARCH_STEPS_DIAG 8
+#endif
+
+/**
+ * SMAA_CORNER_ROUNDING specifies how much sharp corners will be rounded.
+ *
+ * Range: [0, 100]
+ *
+ * Define SMAA_DISABLE_CORNER_DETECTION to disable corner processing.
+ */
+#ifndef SMAA_CORNER_ROUNDING
+#define SMAA_CORNER_ROUNDING 25
+#endif
+
+/**
+ * If there is a neighbor edge that has SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR
+ * times bigger contrast than current edge, current edge will be discarded.
+ *
+ * This allows to eliminate spurious crossing edges, and is based on the fact
+ * that, if there is too much contrast in a direction, that will hide
+ * perceptually contrast in the other neighbors.
+ */
+#ifndef SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR
+#define SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR 2.0
+#endif
+
+/**
+ * Predicated thresholding allows to better preserve texture details and to
+ * improve performance, by decreasing the number of detected edges using an
+ * additional buffer like the light accumulation buffer, object ids or even the
+ * depth buffer (the depth buffer usage may be limited to indoor or short range
+ * scenes).
+ *
+ * It locally decreases the luma or color threshold if an edge is found in an
+ * additional buffer (so the global threshold can be higher).
+ *
+ * This method was developed by Playstation EDGE MLAA team, and used in 
+ * Killzone 3, by using the light accumulation buffer. More information here:
+ *     http://iryoku.com/aacourse/downloads/06-MLAA-on-PS3.pptx 
+ */
+#ifndef SMAA_PREDICATION
+#define SMAA_PREDICATION 0
+#endif
+
+/**
+ * Threshold to be used in the additional predication buffer. 
+ *
+ * Range: depends on the input, so you'll have to find the magic number that
+ * works for you.
+ */
+#ifndef SMAA_PREDICATION_THRESHOLD
+#define SMAA_PREDICATION_THRESHOLD 0.01
+#endif
+
+/**
+ * How much to scale the global threshold used for luma or color edge
+ * detection when using predication.
+ *
+ * Range: [1, 5]
+ */
+#ifndef SMAA_PREDICATION_SCALE
+#define SMAA_PREDICATION_SCALE 2.0
+#endif
+
+/**
+ * How much to locally decrease the threshold.
+ *
+ * Range: [0, 1]
+ */
+#ifndef SMAA_PREDICATION_STRENGTH
+#define SMAA_PREDICATION_STRENGTH 0.4
+#endif
+
+/**
+ * Temporal reprojection allows to remove ghosting artifacts when using
+ * temporal supersampling. We use the CryEngine 3 method which also introduces
+ * velocity weighting. This feature is of extreme importance for totally
+ * removing ghosting. More information here:
+ *    http://iryoku.com/aacourse/downloads/13-Anti-Aliasing-Methods-in-CryENGINE-3.pdf
+ *
+ * Note that you'll need to setup a velocity buffer for enabling reprojection.
+ * For static geometry, saving the previous depth buffer is a viable
+ * alternative.
+ */
+#ifndef SMAA_REPROJECTION
+#define SMAA_REPROJECTION 0
+#endif
+
+/**
+ * SMAA_REPROJECTION_WEIGHT_SCALE controls the velocity weighting. It allows to
+ * remove ghosting trails behind the moving object, which are not removed by
+ * just using reprojection. Using low values will exhibit ghosting, while using
+ * high values will disable temporal supersampling under motion.
+ *
+ * Behind the scenes, velocity weighting removes temporal supersampling when
+ * the velocity of the subsamples differs (meaning they are different objects).
+ *
+ * Range: [0, 80]
+ */
+#ifndef SMAA_REPROJECTION_WEIGHT_SCALE
+#define SMAA_REPROJECTION_WEIGHT_SCALE 30.0
+#endif
+
+/**
+ * On some compilers, discard cannot be used in vertex shaders. Thus, they need
+ * to be compiled separately.
+ */
+#ifndef SMAA_INCLUDE_VS
+#define SMAA_INCLUDE_VS 1
+#endif
+#ifndef SMAA_INCLUDE_PS
+#define SMAA_INCLUDE_PS 1
+#endif
+
+//-----------------------------------------------------------------------------
+// Texture Access Defines
+
+#ifndef SMAA_AREATEX_SELECT
+#if defined(SMAA_HLSL_3)
+#define SMAA_AREATEX_SELECT(sample) sample.ra
+#else
+#define SMAA_AREATEX_SELECT(sample) sample.rg
+#endif
+#endif
+
+#ifndef SMAA_SEARCHTEX_SELECT
+#define SMAA_SEARCHTEX_SELECT(sample) sample.r
+#endif
+
+#ifndef SMAA_DECODE_VELOCITY
+#define SMAA_DECODE_VELOCITY(sample) sample.rg
+#endif
+
+//-----------------------------------------------------------------------------
+// Non-Configurable Defines
+
+#define SMAA_AREATEX_MAX_DISTANCE 16
+#define SMAA_AREATEX_MAX_DISTANCE_DIAG 20
+#define SMAA_AREATEX_PIXEL_SIZE (1.0 / float2(160.0, 560.0))
+#define SMAA_AREATEX_SUBTEX_SIZE (1.0 / 7.0)
+#define SMAA_SEARCHTEX_SIZE float2(66.0, 33.0)
+#define SMAA_SEARCHTEX_PACKED_SIZE float2(64.0, 16.0)
+#define SMAA_CORNER_ROUNDING_NORM (float(SMAA_CORNER_ROUNDING) / 100.0)
+
+//-----------------------------------------------------------------------------
+// Porting Functions
+
+#if defined(SMAA_HLSL_3) // HLSL SM3 mapping; this shader defines SMAA_GLSL_4 at its top, so this branch is not selected by the file itself
+#define SMAATexture2D(tex) sampler2D tex
+#define SMAATexturePass2D(tex) tex
+#define SMAASampleLevelZero(tex, coord) tex2Dlod(tex, float4(coord, 0.0, 0.0))
+#define SMAASampleLevelZeroPoint(tex, coord) tex2Dlod(tex, float4(coord, 0.0, 0.0))
+#define SMAASampleLevelZeroOffset(tex, coord, offset) tex2Dlod(tex, float4(coord + offset * SMAA_RT_METRICS.xy, 0.0, 0.0))
+#define SMAASample(tex, coord) tex2D(tex, coord)
+#define SMAASamplePoint(tex, coord) tex2D(tex, coord)
+#define SMAASampleOffset(tex, coord, offset) tex2D(tex, coord + offset * SMAA_RT_METRICS.xy)
+#define SMAA_FLATTEN [flatten]
+#define SMAA_BRANCH [branch]
+#endif
+#if defined(SMAA_HLSL_4) || defined(SMAA_HLSL_4_1) // HLSL SM4 / SM4.1 mapping; likewise not selected by this file's own defines
+SamplerState LinearSampler { Filter = MIN_MAG_LINEAR_MIP_POINT; AddressU = Clamp; AddressV = Clamp; };
+SamplerState PointSampler { Filter = MIN_MAG_MIP_POINT; AddressU = Clamp; AddressV = Clamp; };
+#define SMAATexture2D(tex) Texture2D tex
+#define SMAATexturePass2D(tex) tex
+#define SMAASampleLevelZero(tex, coord) tex.SampleLevel(LinearSampler, coord, 0)
+#define SMAASampleLevelZeroPoint(tex, coord) tex.SampleLevel(PointSampler, coord, 0)
+#define SMAASampleLevelZeroOffset(tex, coord, offset) tex.SampleLevel(LinearSampler, coord, 0, offset)
+#define SMAASample(tex, coord) tex.Sample(LinearSampler, coord)
+#define SMAASamplePoint(tex, coord) tex.Sample(PointSampler, coord)
+#define SMAASampleOffset(tex, coord, offset) tex.Sample(LinearSampler, coord, offset)
+#define SMAA_FLATTEN [flatten]
+#define SMAA_BRANCH [branch]
+#define SMAATexture2DMS2(tex) Texture2DMS<float4, 2> tex // the multisample helpers (SMAATexture2DMS2, SMAALoad) are defined only in this branch
+#define SMAALoad(tex, pos, sample) tex.Load(pos, sample)
+#if defined(SMAA_HLSL_4_1)
+#define SMAAGather(tex, coord) tex.Gather(LinearSampler, coord, 0)
+#endif
+#endif
+#if defined(SMAA_GLSL_3) || defined(SMAA_GLSL_4) // GLSL mapping: expresses the HLSL-style names used by the shader body through GLSL built-ins
+#define SMAATexture2D(tex) sampler2D tex
+#define SMAATexturePass2D(tex) tex
+#define SMAASampleLevelZero(tex, coord) textureLod(tex, coord, 0.0)
+#define SMAASampleLevelZeroPoint(tex, coord) textureLod(tex, coord, 0.0)
+#define SMAASampleLevelZeroOffset(tex, coord, offset) textureLodOffset(tex, coord, 0.0, offset)
+#define SMAASample(tex, coord) texture(tex, coord)
+#define SMAASamplePoint(tex, coord) texture(tex, coord)
+#define SMAASampleOffset(tex, coord, offset) textureOffset(tex, coord, offset) // texture()'s optional third argument is a LOD bias, not a texel offset
+#define SMAA_FLATTEN
+#define SMAA_BRANCH
+#define lerp(a, b, t) mix(a, b, t)
+#define saturate(a) clamp(a, 0.0, 1.0)
+#if defined(SMAA_GLSL_4)
+#define mad(a, b, c) fma(a, b, c)
+#define SMAAGather(tex, coord) textureGather(tex, coord)
+#else
+#define mad(a, b, c) (a * b + c)
+#endif
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define int2 ivec2
+#define int3 ivec3
+#define int4 ivec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#endif
+
+#if !defined(SMAA_HLSL_3) && !defined(SMAA_HLSL_4) && !defined(SMAA_HLSL_4_1) && !defined(SMAA_GLSL_3) && !defined(SMAA_GLSL_4) && !defined(SMAA_CUSTOM_SL)
+#error you must define the shading language: SMAA_HLSL_*, SMAA_GLSL_* or SMAA_CUSTOM_SL
+#endif
+
+//-----------------------------------------------------------------------------
+// Misc functions
+
+/**
+ * Returns (P, Pleft, Ptop): the current pixel and its left and top neighbours.
+ */
+float3 SMAAGatherNeighbours(float2 texcoord,
+                            float4 offset[3],
+                            SMAATexture2D(tex)) {
+    #ifdef SMAAGather
+    float2 gatherCoord = texcoord + SMAA_RT_METRICS.xy * float2(-0.5, -0.5);
+    return SMAAGather(tex, gatherCoord).grb;
+    #else
+    return float3(SMAASamplePoint(tex, texcoord).r,
+                  SMAASamplePoint(tex, offset[0].xy).r,
+                  SMAASamplePoint(tex, offset[0].zw).r);
+    #endif
+}
+
+/**
+ * Adjusts the threshold by means of predication.
+ */
+float2 SMAACalculatePredicatedThreshold(float2 texcoord,
+                                        float4 offset[3],
+                                        SMAATexture2D(predicationTex)) {
+    float3 neighbours = SMAAGatherNeighbours(texcoord, offset, SMAATexturePass2D(predicationTex));
+    float2 delta = abs(neighbours.xx - neighbours.yz);
+    float2 edges = step(SMAA_PREDICATION_THRESHOLD, delta);
+    return SMAA_PREDICATION_SCALE * SMAA_THRESHOLD * (1.0 - SMAA_PREDICATION_STRENGTH * edges);
+}
+
+/**
+ * Conditional move:
+ */
+void SMAAMovc(bool2 cond, inout float2 variable, float2 value) {
+    SMAA_FLATTEN if (cond.x) variable.x = value.x;
+    SMAA_FLATTEN if (cond.y) variable.y = value.y;
+}
+
+void SMAAMovc(bool4 cond, inout float4 variable, float4 value) {
+    SMAAMovc(cond.xy, variable.xy, value.xy);
+    SMAAMovc(cond.zw, variable.zw, value.zw);
+}
+
+
+#if SMAA_INCLUDE_VS
+//-----------------------------------------------------------------------------
+// Vertex Shaders
+
+/**
+ * Edge Detection Vertex Shader
+ */
+void SMAAEdgeDetectionVS(float2 texcoord,
+                         out float4 offset[3]) {
+    offset[0] = mad(SMAA_RT_METRICS.xyxy, float4(-1.0, 0.0, 0.0, -1.0), texcoord.xyxy);
+    offset[1] = mad(SMAA_RT_METRICS.xyxy, float4( 1.0, 0.0, 0.0,  1.0), texcoord.xyxy);
+    offset[2] = mad(SMAA_RT_METRICS.xyxy, float4(-2.0, 0.0, 0.0, -2.0), texcoord.xyxy);
+}
+
+/**
+ * Blend Weight Calculation Vertex Shader
+ */
+void SMAABlendingWeightCalculationVS(float2 texcoord,
+                                     out float2 pixcoord,
+                                     out float4 offset[3]) {
+    pixcoord = texcoord * SMAA_RT_METRICS.zw;
+
+    // We will use these offsets for the searches later on (see @PSEUDO_GATHER4):
+    offset[0] = mad(SMAA_RT_METRICS.xyxy, float4(-0.25, -0.125,  1.25, -0.125), texcoord.xyxy);
+    offset[1] = mad(SMAA_RT_METRICS.xyxy, float4(-0.125, -0.25, -0.125,  1.25), texcoord.xyxy);
+
+    // And these for the searches, they indicate the ends of the loops:
+    offset[2] = mad(SMAA_RT_METRICS.xxyy,
+                    float4(-2.0, 2.0, -2.0, 2.0) * float(SMAA_MAX_SEARCH_STEPS),
+                    float4(offset[0].xz, offset[1].yw));
+}
+
+/**
+ * Neighborhood Blending Vertex Shader
+ */
+void SMAANeighborhoodBlendingVS(float2 texcoord,
+                                out float4 offset) {
+    offset = mad(SMAA_RT_METRICS.xyxy, float4( 1.0, 0.0, 0.0,  1.0), texcoord.xyxy);
+}
+#endif // SMAA_INCLUDE_VS
+
+#if SMAA_INCLUDE_PS
+//-----------------------------------------------------------------------------
+// Edge Detection Pixel Shaders (First Pass)
+
+/**
+ * Luma Edge Detection
+ *
+ * IMPORTANT NOTICE: luma edge detection requires gamma-corrected colors, and
+ * thus 'colorTex' should be a non-sRGB texture.
+ */
+float2 SMAALumaEdgeDetectionPS(float2 texcoord,
+                               float4 offset[3],
+                               SMAATexture2D(colorTex)
+                               #if SMAA_PREDICATION
+                               , SMAATexture2D(predicationTex)
+                               #endif
+                               ) {
+    // Calculate the threshold:
+    #if SMAA_PREDICATION
+    float2 threshold = SMAACalculatePredicatedThreshold(texcoord, offset, SMAATexturePass2D(predicationTex));
+    #else
+    float2 threshold = float2(SMAA_THRESHOLD, SMAA_THRESHOLD);
+    #endif
+
+    // Calculate lumas:
+    float3 weights = float3(0.2126, 0.7152, 0.0722);
+    float L = dot(SMAASamplePoint(colorTex, texcoord).rgb, weights);
+
+    float Lleft = dot(SMAASamplePoint(colorTex, offset[0].xy).rgb, weights);
+    float Ltop  = dot(SMAASamplePoint(colorTex, offset[0].zw).rgb, weights);
+
+    // We do the usual threshold:
+    float4 delta;
+    delta.xy = abs(L - float2(Lleft, Ltop));
+    float2 edges = step(threshold, delta.xy);
+
+    // Then discard if there is no edge:
+    if (dot(edges, float2(1.0, 1.0)) == 0.0)
+        return float2(-2.0, -2.0);
+
+    // Calculate right and bottom deltas:
+    float Lright = dot(SMAASamplePoint(colorTex, offset[1].xy).rgb, weights);
+    float Lbottom  = dot(SMAASamplePoint(colorTex, offset[1].zw).rgb, weights);
+    delta.zw = abs(L - float2(Lright, Lbottom));
+
+    // Calculate the maximum delta in the direct neighborhood:
+    float2 maxDelta = max(delta.xy, delta.zw);
+
+    // Calculate left-left and top-top deltas:
+    float Lleftleft = dot(SMAASamplePoint(colorTex, offset[2].xy).rgb, weights);
+    float Ltoptop = dot(SMAASamplePoint(colorTex, offset[2].zw).rgb, weights);
+    delta.zw = abs(float2(Lleft, Ltop) - float2(Lleftleft, Ltoptop));
+
+    // Calculate the final maximum delta:
+    maxDelta = max(maxDelta.xy, delta.zw);
+    float finalDelta = max(maxDelta.x, maxDelta.y);
+
+    // Local contrast adaptation:
+    edges.xy *= step(finalDelta, SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR * delta.xy);
+
+    return edges;
+}
+
+/**
+ * Color Edge Detection: returns left (x) / top (y) edge flags for 'texcoord'
+ * from max-channel color deltas, or (-2, -2) when neither passes the threshold.
+ * IMPORTANT NOTICE: color edge detection requires gamma-corrected colors, and
+ * thus 'colorTex' should be a non-sRGB texture.
+ */
+float2 SMAAColorEdgeDetectionPS(float2 texcoord,
+                                float4 offset[3],
+                                SMAATexture2D(colorTex)
+                                #if SMAA_PREDICATION
+                                , SMAATexture2D(predicationTex)
+                                #endif
+                                ) {
+    // Calculate the threshold (texture forwarded like the luma variant does):
+    #if SMAA_PREDICATION
+    float2 threshold = SMAACalculatePredicatedThreshold(texcoord, offset, SMAATexturePass2D(predicationTex));
+    #else
+    float2 threshold = float2(SMAA_THRESHOLD, SMAA_THRESHOLD);
+    #endif
+
+    // Calculate color deltas (largest per-channel difference to each neighbor):
+    float4 delta;
+    float3 C = SMAASamplePoint(colorTex, texcoord).rgb;
+
+    float3 Cleft = SMAASamplePoint(colorTex, offset[0].xy).rgb;
+    float3 t = abs(C - Cleft);
+    delta.x = max(max(t.r, t.g), t.b);
+
+    float3 Ctop  = SMAASamplePoint(colorTex, offset[0].zw).rgb;
+    t = abs(C - Ctop);
+    delta.y = max(max(t.r, t.g), t.b);
+
+    // We do the usual threshold:
+    float2 edges = step(threshold, delta.xy);
+
+    // Then bail out with the (-2, -2) sentinel if there is no edge:
+    if (dot(edges, float2(1.0, 1.0)) == 0.0)
+        return float2(-2.0, -2.0);
+
+    // Calculate right and bottom deltas:
+    float3 Cright = SMAASamplePoint(colorTex, offset[1].xy).rgb;
+    t = abs(C - Cright);
+    delta.z = max(max(t.r, t.g), t.b);
+
+    float3 Cbottom  = SMAASamplePoint(colorTex, offset[1].zw).rgb;
+    t = abs(C - Cbottom);
+    delta.w = max(max(t.r, t.g), t.b);
+
+    // Calculate the maximum delta in the direct neighborhood:
+    float2 maxDelta = max(delta.xy, delta.zw);
+
+    // Left-left and top-top deltas, measured from the left/top neighbors (as in the luma variant):
+    float3 Cleftleft  = SMAASamplePoint(colorTex, offset[2].xy).rgb;
+    t = abs(Cleft - Cleftleft);
+    delta.z = max(max(t.r, t.g), t.b);
+
+    float3 Ctoptop = SMAASamplePoint(colorTex, offset[2].zw).rgb;
+    t = abs(Ctop - Ctoptop);
+    delta.w = max(max(t.r, t.g), t.b);
+
+    // Calculate the final maximum delta:
+    maxDelta = max(maxDelta.xy, delta.zw);
+    float finalDelta = max(maxDelta.x, maxDelta.y);
+
+    // Local contrast adaptation: drop an edge that a neighboring delta clearly dominates.
+    edges.xy *= step(finalDelta, SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR * delta.xy);
+
+    return edges;
+}
+
+/**
+ * Depth Edge Detection
+ */
+float2 SMAADepthEdgeDetectionPS(float2 texcoord,
+                                float4 offset[3],
+                                SMAATexture2D(depthTex)) {
+    float3 neighbours = SMAAGatherNeighbours(texcoord, offset, SMAATexturePass2D(depthTex));
+    float2 delta = abs(neighbours.xx - float2(neighbours.y, neighbours.z));
+    float2 edges = step(SMAA_DEPTH_THRESHOLD, delta);
+
+    if (dot(edges, float2(1.0, 1.0)) == 0.0)
+        return float2(-2.0, -2.0);
+
+    return edges;
+}
+
+//-----------------------------------------------------------------------------
+// Diagonal Search Functions
+
+#if !defined(SMAA_DISABLE_DIAG_DETECTION)
+
+/**
+ * Decodes two binary values from a single bilinear-filtered access.
+ */
+float2 SMAADecodeDiagBilinearAccess(float2 e) {
+    // The bilinear access used for fetching 'e' has a 0.25 offset, and we
+    // are interested in the R and G edges:
+    //
+    // +---G---+-------+
+    // |   x o R   x   |
+    // +-------+-------+
+    //
+    // Then, if one of these edges is enabled:
+    //   Red:   (0.75 * X + 0.25 * 1) => 0.25 or 1.0
+    //   Green: (0.75 * 1 + 0.25 * X) => 0.75 or 1.0
+    //
+    // This function will unpack the values (mad + mul + round):
+    // wolframalpha.com: round(x * abs(5 * x - 5 * 0.75)) plot 0 to 1
+    e.r = e.r * abs(5.0 * e.r - 5.0 * 0.75);
+    return round(e);
+}
+
+float4 SMAADecodeDiagBilinearAccess(float4 e) {
+    e.rb = e.rb * abs(5.0 * e.rb - 5.0 * 0.75);
+    return round(e);
+}
+
+/**
+ * These functions perform diagonal pattern searches.
+ */
+float2 SMAASearchDiag1(SMAATexture2D(edgesTex), float2 texcoord, float2 dir, out float2 e) {
+    float4 coord = float4(texcoord, -1.0, 1.0);
+    float3 t = float3(SMAA_RT_METRICS.xy, 1.0);
+    while (coord.z < float(SMAA_MAX_SEARCH_STEPS_DIAG - 1) &&
+           coord.w > 0.9) {
+        coord.xyz = mad(t, float3(dir, 1.0), coord.xyz);
+        e = SMAASampleLevelZero(edgesTex, coord.xy).rg;
+        coord.w = dot(e, float2(0.5, 0.5));
+    }
+    return coord.zw;
+}
+
+float2 SMAASearchDiag2(SMAATexture2D(edgesTex), float2 texcoord, float2 dir, out float2 e) {
+    float4 coord = float4(texcoord, -1.0, 1.0);
+    coord.x += 0.25 * SMAA_RT_METRICS.x; // See @SearchDiag2Optimization
+    float3 t = float3(SMAA_RT_METRICS.xy, 1.0);
+    while (coord.z < float(SMAA_MAX_SEARCH_STEPS_DIAG - 1) &&
+           coord.w > 0.9) {
+        coord.xyz = mad(t, float3(dir, 1.0), coord.xyz);
+
+        // @SearchDiag2Optimization
+        // Fetch both edges at once using bilinear filtering:
+        e = SMAASampleLevelZero(edgesTex, coord.xy).rg;
+        e = SMAADecodeDiagBilinearAccess(e);
+
+        // Non-optimized version:
+        // e.g = SMAASampleLevelZero(edgesTex, coord.xy).g;
+        // e.r = SMAASampleLevelZeroOffset(edgesTex, coord.xy, int2(1, 0)).r;
+
+        coord.w = dot(e, float2(0.5, 0.5));
+    }
+    return coord.zw;
+}
+
+/** 
+ * Similar to SMAAArea, this calculates the area corresponding to a certain
+ * diagonal distance and crossing edges 'e'.
+ */
+float2 SMAAAreaDiag(SMAATexture2D(areaTex), float2 dist, float2 e, float offset) {
+    float2 texcoord = mad(float2(SMAA_AREATEX_MAX_DISTANCE_DIAG, SMAA_AREATEX_MAX_DISTANCE_DIAG), e, dist);
+
+    // We do a scale and bias for mapping to texel space:
+    texcoord = mad(SMAA_AREATEX_PIXEL_SIZE, texcoord, 0.5 * SMAA_AREATEX_PIXEL_SIZE);
+
+    // Diagonal areas are on the second half of the texture:
+    texcoord.x += 0.5;
+
+    // Move to proper place, according to the subpixel offset:
+    texcoord.y += SMAA_AREATEX_SUBTEX_SIZE * offset;
+
+    // Do it!
+    return SMAA_AREATEX_SELECT(SMAASampleLevelZero(areaTex, texcoord));
+}
+
+/**
+ * This searches for diagonal patterns and returns the corresponding weights.
+ */
+float2 SMAACalculateDiagWeights(SMAATexture2D(edgesTex), SMAATexture2D(areaTex), float2 texcoord, float2 e, float4 subsampleIndices) {
+    float2 weights = float2(0.0, 0.0);
+
+    // Search for the line ends along the (-1, 1) / (1, -1) diagonal:
+    float4 d;
+    float2 end;
+    if (e.r > 0.0) {
+        d.xz = SMAASearchDiag1(SMAATexturePass2D(edgesTex), texcoord, float2(-1.0,  1.0), end);
+        d.x += float(end.y > 0.9);
+    } else
+        d.xz = float2(0.0, 0.0);
+    d.yw = SMAASearchDiag1(SMAATexturePass2D(edgesTex), texcoord, float2(1.0, -1.0), end);
+
+    SMAA_BRANCH
+    if (d.x + d.y > 2.0) { // d.x + d.y + 1 > 3
+        // Fetch the crossing edges:
+        float4 coords = mad(float4(-d.x + 0.25, d.x, d.y, -d.y - 0.25), SMAA_RT_METRICS.xyxy, texcoord.xyxy);
+        float4 c;
+        c.xy = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1,  0)).rg;
+        c.zw = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1,  0)).rg;
+        c.yxwz = SMAADecodeDiagBilinearAccess(c.xyzw);
+
+        // Non-optimized version:
+        // float4 coords = mad(float4(-d.x, d.x, d.y, -d.y), SMAA_RT_METRICS.xyxy, texcoord.xyxy);
+        // float4 c;
+        // c.x = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1,  0)).g;
+        // c.y = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2( 0,  0)).r;
+        // c.z = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1,  0)).g;
+        // c.w = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, -1)).r;
+
+        // Merge crossing edges at each side into a single value:
+        float2 cc = mad(float2(2.0, 2.0), c.xz, c.yw);
+
+        // Remove the crossing edge if we didn't find the end of the line:
+        SMAAMovc(bool2(step(0.9, d.zw)), cc, float2(0.0, 0.0));
+
+        // Fetch the areas for this line:
+        weights += SMAAAreaDiag(SMAATexturePass2D(areaTex), d.xy, cc, subsampleIndices.z);
+    }
+
+    // Search for the line ends along the (-1, -1) / (1, 1) diagonal:
+    d.xz = SMAASearchDiag2(SMAATexturePass2D(edgesTex), texcoord, float2(-1.0, -1.0), end);
+    if (SMAASampleLevelZeroOffset(edgesTex, texcoord, int2(1, 0)).r > 0.0) {
+        d.yw = SMAASearchDiag2(SMAATexturePass2D(edgesTex), texcoord, float2(1.0, 1.0), end);
+        d.y += float(end.y > 0.9);
+    } else
+        d.yw = float2(0.0, 0.0);
+
+    SMAA_BRANCH
+    if (d.x + d.y > 2.0) { // d.x + d.y + 1 > 3
+        // Fetch the crossing edges:
+        float4 coords = mad(float4(-d.x, -d.x, d.y, d.y), SMAA_RT_METRICS.xyxy, texcoord.xyxy);
+        float4 c;
+        c.x  = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1,  0)).g;
+        c.y  = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2( 0, -1)).r;
+        c.zw = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1,  0)).gr;
+        float2 cc = mad(float2(2.0, 2.0), c.xz, c.yw);
+
+        // Remove the crossing edge if we didn't find the end of the line:
+        SMAAMovc(bool2(step(0.9, d.zw)), cc, float2(0.0, 0.0));
+
+        // Fetch the areas for this line (channels swapped relative to the first diagonal):
+        weights += SMAAAreaDiag(SMAATexturePass2D(areaTex), d.xy, cc, subsampleIndices.w).gr;
+    }
+
+    return weights;
+}
+#endif
+
+//-----------------------------------------------------------------------------
+// Horizontal/Vertical Search Functions
+
+/**
+ * This determines how much length we should add in the last step of the
+ * searches. It takes the bilinearly interpolated edge (see
+ * @PSEUDO_GATHER4), and adds 0, 1 or 2, depending on which edges and
+ * crossing edges are active.
+ */
+float SMAASearchLength(SMAATexture2D(searchTex), float2 e, float offset) {
+    // The texture is flipped vertically, with left and right cases taking half
+    // of the space horizontally:
+    float2 scale = SMAA_SEARCHTEX_SIZE * float2(0.5, -1.0);
+    float2 bias = SMAA_SEARCHTEX_SIZE * float2(offset, 1.0);
+
+    // Scale and bias to access texel centers:
+    scale += float2(-1.0,  1.0);
+    bias  += float2( 0.5, -0.5);
+
+    // Convert from pixel coordinates to texcoords:
+    // (We use SMAA_SEARCHTEX_PACKED_SIZE because the texture is cropped)
+    scale *= 1.0 / SMAA_SEARCHTEX_PACKED_SIZE;
+    bias *= 1.0 / SMAA_SEARCHTEX_PACKED_SIZE;
+
+    // Lookup the search texture:
+    return SMAA_SEARCHTEX_SELECT(SMAASampleLevelZero(searchTex, mad(scale, e, bias)));
+}
+
+/**
+ * Horizontal/vertical search functions for the 2nd pass.
+ */
+float SMAASearchXLeft(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) {
+    /**
+     * @PSEUDO_GATHER4
+     * This texcoord has been offset by (-0.25, -0.125) in the vertex shader to
+     * sample between edges, thus fetching four edges in a row.
+     * Sampling with different offsets in each direction makes it possible to
+     * disambiguate which of the four fetched edges are active.
+     */
+    float2 e = float2(0.0, 1.0);
+    while (texcoord.x > end && 
+           e.g > 0.8281 && // Is there some edge not activated?
+           e.r == 0.0) { // Or is there a crossing edge that breaks the line?
+        e = SMAASampleLevelZero(edgesTex, texcoord).rg;
+        texcoord = mad(-float2(2.0, 0.0), SMAA_RT_METRICS.xy, texcoord);
+    }
+
+    float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.0), 3.25);
+    return mad(SMAA_RT_METRICS.x, offset, texcoord.x);
+
+    // Non-optimized version (0.25 + 1 + 2 below add up to the 3.25 used above):
+    // We correct the previous (-0.25, -0.125) offset we applied:
+    // texcoord.x += 0.25 * SMAA_RT_METRICS.x;
+
+    // The searches are biased by 1, so adjust the coords accordingly:
+    // texcoord.x += SMAA_RT_METRICS.x;
+
+    // Disambiguate the length added by the last step:
+    // texcoord.x += 2.0 * SMAA_RT_METRICS.x; // Undo last step
+    // texcoord.x -= SMAA_RT_METRICS.x * (255.0 / 127.0) * SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.0);
+    // return texcoord.x;
+}
+
+float SMAASearchXRight(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) {
+    float2 e = float2(0.0, 1.0);
+    while (texcoord.x < end && 
+           e.g > 0.8281 && // Is there some edge not activated?
+           e.r == 0.0) { // Or is there a crossing edge that breaks the line?
+        e = SMAASampleLevelZero(edgesTex, texcoord).rg;
+        texcoord = mad(float2(2.0, 0.0), SMAA_RT_METRICS.xy, texcoord);
+    }
+    float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.5), 3.25);
+    return mad(-SMAA_RT_METRICS.x, offset, texcoord.x);
+}
+
+float SMAASearchYUp(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) {
+    float2 e = float2(1.0, 0.0);
+    while (texcoord.y > end && 
+           e.r > 0.8281 && // Is there some edge not activated?
+           e.g == 0.0) { // Or is there a crossing edge that breaks the line?
+        e = SMAASampleLevelZero(edgesTex, texcoord).rg;
+        texcoord = mad(-float2(0.0, 2.0), SMAA_RT_METRICS.xy, texcoord);
+    }
+    float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e.gr, 0.0), 3.25);
+    return mad(SMAA_RT_METRICS.y, offset, texcoord.y);
+}
+
+float SMAASearchYDown(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) {
+    float2 e = float2(1.0, 0.0);
+    while (texcoord.y < end && 
+           e.r > 0.8281 && // Is there some edge not activated?
+           e.g == 0.0) { // Or is there a crossing edge that breaks the line?
+        e = SMAASampleLevelZero(edgesTex, texcoord).rg;
+        texcoord = mad(float2(0.0, 2.0), SMAA_RT_METRICS.xy, texcoord);
+    }
+    float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e.gr, 0.5), 3.25);
+    return mad(-SMAA_RT_METRICS.y, offset, texcoord.y);
+}
+
+/** 
+ * Ok, we have the distance and both crossing edges. So, what are the areas
+ * at each side of current edge?
+ */
+float2 SMAAArea(SMAATexture2D(areaTex), float2 dist, float e1, float e2, float offset) {
+    // Rounding prevents precision errors of bilinear filtering:
+    float2 texcoord = mad(float2(SMAA_AREATEX_MAX_DISTANCE, SMAA_AREATEX_MAX_DISTANCE), round(4.0 * float2(e1, e2)), dist);
+    
+    // We do a scale and bias for mapping to texel space:
+    texcoord = mad(SMAA_AREATEX_PIXEL_SIZE, texcoord, 0.5 * SMAA_AREATEX_PIXEL_SIZE);
+
+    // Move to proper place, according to the subpixel offset:
+    texcoord.y = mad(SMAA_AREATEX_SUBTEX_SIZE, offset, texcoord.y);
+
+    // Do it!
+    return SMAA_AREATEX_SELECT(SMAASampleLevelZero(areaTex, texcoord));
+}
+
+//-----------------------------------------------------------------------------
+// Corner Detection Functions
+
+void SMAADetectHorizontalCornerPattern(SMAATexture2D(edgesTex), inout float2 weights, float4 texcoord, float2 d) {
+    #if !defined(SMAA_DISABLE_CORNER_DETECTION)
+    float2 leftRight = step(d.xy, d.yx);
+    float2 rounding = (1.0 - SMAA_CORNER_ROUNDING_NORM) * leftRight;
+
+    rounding /= leftRight.x + leftRight.y; // Reduce blending for pixels in the center of a line.
+
+    float2 factor = float2(1.0, 1.0);
+    factor.x -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(0,  1)).r;
+    factor.x -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(1,  1)).r;
+    factor.y -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(0, -2)).r;
+    factor.y -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(1, -2)).r;
+
+    weights *= saturate(factor);
+    #endif
+}
+
+void SMAADetectVerticalCornerPattern(SMAATexture2D(edgesTex), inout float2 weights, float4 texcoord, float2 d) {
+    #if !defined(SMAA_DISABLE_CORNER_DETECTION)
+    float2 leftRight = step(d.xy, d.yx);
+    float2 rounding = (1.0 - SMAA_CORNER_ROUNDING_NORM) * leftRight;
+
+    rounding /= leftRight.x + leftRight.y;
+
+    float2 factor = float2(1.0, 1.0);
+    factor.x -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2( 1, 0)).g;
+    factor.x -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2( 1, 1)).g;
+    factor.y -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(-2, 0)).g;
+    factor.y -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(-2, 1)).g;
+
+    weights *= saturate(factor);
+    #endif
+}
+
+//-----------------------------------------------------------------------------
+// Blending Weight Calculation Pixel Shader (Second Pass)
+
+float4 SMAABlendingWeightCalculationPS(float2 texcoord,
+                                       float2 pixcoord,
+                                       float4 offset[3],
+                                       SMAATexture2D(edgesTex),
+                                       SMAATexture2D(areaTex),
+                                       SMAATexture2D(searchTex),
+                                       float4 subsampleIndices) { // Just pass zero for SMAA 1x, see @SUBSAMPLE_INDICES.
+    float4 weights = float4(0.0, 0.0, 0.0, 0.0);
+
+    float2 e = SMAASample(edgesTex, texcoord).rg;
+
+    SMAA_BRANCH
+    if (e.g > 0.0) { // Edge at north
+        #if !defined(SMAA_DISABLE_DIAG_DETECTION)
+        // Diagonals have both north and west edges, so searching for them in
+        // one of the boundaries is enough.
+        weights.rg = SMAACalculateDiagWeights(SMAATexturePass2D(edgesTex), SMAATexturePass2D(areaTex), texcoord, e, subsampleIndices);
+
+        // We give priority to diagonals, so if we find a diagonal we skip
+        // horizontal/vertical processing.
+        SMAA_BRANCH
+        if (weights.r == -weights.g) { // weights.r + weights.g == 0.0
+        #endif
+
+        float2 d;
+
+        // Find the distance to the left:
+        float3 coords;
+        coords.x = SMAASearchXLeft(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[0].xy, offset[2].x);
+        coords.y = offset[1].y; // offset[1].y = texcoord.y - 0.25 * SMAA_RT_METRICS.y (@CROSSING_OFFSET)
+        d.x = coords.x;
+
+        // Now fetch the left crossing edges, two at a time using bilinear
+        // filtering. Sampling at -0.25 (see @CROSSING_OFFSET) makes it
+        // possible to discern what value each edge has:
+        float e1 = SMAASampleLevelZero(edgesTex, coords.xy).r;
+
+        // Find the distance to the right:
+        coords.z = SMAASearchXRight(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[0].zw, offset[2].y);
+        d.y = coords.z;
+
+        // We want the distances to be in pixel units (doing this here allows
+        // arithmetic and memory accesses to be interleaved better):
+        d = abs(round(mad(SMAA_RT_METRICS.zz, d, -pixcoord.xx)));
+
+        // SMAAArea below needs a sqrt, as the areas texture is compressed
+        // quadratically:
+        float2 sqrt_d = sqrt(d);
+
+        // Fetch the right crossing edges:
+        float e2 = SMAASampleLevelZeroOffset(edgesTex, coords.zy, int2(1, 0)).r;
+
+        // Ok, we know what this pattern looks like, now it is time to get
+        // the actual area:
+        weights.rg = SMAAArea(SMAATexturePass2D(areaTex), sqrt_d, e1, e2, subsampleIndices.y);
+
+        // Fix corners:
+        coords.y = texcoord.y;
+        SMAADetectHorizontalCornerPattern(SMAATexturePass2D(edgesTex), weights.rg, coords.xyzy, d);
+
+        #if !defined(SMAA_DISABLE_DIAG_DETECTION)
+        } else
+            e.r = 0.0; // Skip vertical processing.
+        #endif
+    }
+
+    SMAA_BRANCH
+    if (e.r > 0.0) { // Edge at west
+        float2 d;
+
+        // Find the distance to the top:
+        float3 coords;
+        coords.y = SMAASearchYUp(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[1].xy, offset[2].z);
+        coords.x = offset[0].x; // offset[0].x = texcoord.x - 0.25 * SMAA_RT_METRICS.x;
+        d.x = coords.y;
+
+        // Fetch the top crossing edges:
+        float e1 = SMAASampleLevelZero(edgesTex, coords.xy).g;
+
+        // Find the distance to the bottom:
+        coords.z = SMAASearchYDown(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[1].zw, offset[2].w);
+        d.y = coords.z;
+
+        // We want the distances to be in pixel units:
+        d = abs(round(mad(SMAA_RT_METRICS.ww, d, -pixcoord.yy)));
+
+        // SMAAArea below needs a sqrt, as the areas texture is compressed
+        // quadratically:
+        float2 sqrt_d = sqrt(d);
+
+        // Fetch the bottom crossing edges:
+        float e2 = SMAASampleLevelZeroOffset(edgesTex, coords.xz, int2(0, 1)).g;
+
+        // Get the area for this direction:
+        weights.ba = SMAAArea(SMAATexturePass2D(areaTex), sqrt_d, e1, e2, subsampleIndices.x);
+
+        // Fix corners:
+        coords.x = texcoord.x;
+        SMAADetectVerticalCornerPattern(SMAATexturePass2D(edgesTex), weights.ba, coords.xyxz, d);
+    }
+
+    return weights;
+}
+
+//-----------------------------------------------------------------------------
+// Neighborhood Blending Pixel Shader (Third Pass)
+
+float4 SMAANeighborhoodBlendingPS(float2 texcoord,
+                                  float4 offset,
+                                  SMAATexture2D(colorTex),
+                                  SMAATexture2D(blendTex)
+                                  #if SMAA_REPROJECTION
+                                  , SMAATexture2D(velocityTex)
+                                  #endif
+                                  ) {
+    // Fetch the blending weights for current pixel:
+    float4 a;
+    a.x = SMAASample(blendTex, offset.xy).a; // Right
+    a.y = SMAASample(blendTex, offset.zw).g; // Top
+    a.wz = SMAASample(blendTex, texcoord).xz; // Bottom / Left
+
+    // Is there any blending weight with a value greater than 0.0?
+    SMAA_BRANCH
+    if (dot(a, float4(1.0, 1.0, 1.0, 1.0)) < 1e-5) {
+        float4 color = SMAASampleLevelZero(colorTex, texcoord);
+
+        #if SMAA_REPROJECTION
+        float2 velocity = SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, texcoord));
+
+        // Pack velocity into the alpha channel:
+        color.a = sqrt(5.0 * length(velocity));
+        #endif
+
+        return color;
+    } else {
+        bool h = max(a.x, a.z) > max(a.y, a.w); // max(horizontal) > max(vertical)
+
+        // Calculate the blending offsets:
+        float4 blendingOffset = float4(0.0, a.y, 0.0, a.w);
+        float2 blendingWeight = a.yw;
+        SMAAMovc(bool4(h, h, h, h), blendingOffset, float4(a.x, 0.0, a.z, 0.0));
+        SMAAMovc(bool2(h, h), blendingWeight, a.xz);
+        blendingWeight /= dot(blendingWeight, float2(1.0, 1.0));
+
+        // Calculate the texture coordinates:
+        float4 blendingCoord = mad(blendingOffset, float4(SMAA_RT_METRICS.xy, -SMAA_RT_METRICS.xy), texcoord.xyxy);
+
+        // We exploit bilinear filtering to mix current pixel with the chosen
+        // neighbor:
+        float4 color = blendingWeight.x * SMAASampleLevelZero(colorTex, blendingCoord.xy);
+        color += blendingWeight.y * SMAASampleLevelZero(colorTex, blendingCoord.zw);
+
+        #if SMAA_REPROJECTION
+        // Antialias velocity for proper reprojection in a later stage:
+        float2 velocity = blendingWeight.x * SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, blendingCoord.xy));
+        velocity += blendingWeight.y * SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, blendingCoord.zw));
+
+        // Pack velocity into the alpha channel:
+        color.a = sqrt(5.0 * length(velocity));
+        #endif
+
+        return color;
+    }
+}
+
+//-----------------------------------------------------------------------------
+// Temporal Resolve Pixel Shader (Optional Pass)
+
+float4 SMAAResolvePS(float2 texcoord,
+                     SMAATexture2D(currentColorTex),
+                     SMAATexture2D(previousColorTex)
+                     #if SMAA_REPROJECTION
+                     , SMAATexture2D(velocityTex)
+                     #endif
+                     ) {
+    #if SMAA_REPROJECTION
+    // Velocity is assumed to be calculated for motion blur, so we need to
+    // inverse it for reprojection:
+    float2 velocity = -SMAA_DECODE_VELOCITY(SMAASamplePoint(velocityTex, texcoord).rg);
+
+    // Fetch current pixel:
+    float4 current = SMAASamplePoint(currentColorTex, texcoord);
+
+    // Reproject current coordinates and fetch previous pixel:
+    float4 previous = SMAASamplePoint(previousColorTex, texcoord + velocity);
+
+    // Attenuate the previous pixel if the velocity is different:
+    float delta = abs(current.a * current.a - previous.a * previous.a) / 5.0;
+    float weight = 0.5 * saturate(1.0 - sqrt(delta) * SMAA_REPROJECTION_WEIGHT_SCALE);
+
+    // Blend the pixels according to the calculated weight:
+    return lerp(current, previous, weight);
+    #else
+    // Just blend the pixels:
+    float4 current = SMAASamplePoint(currentColorTex, texcoord);
+    float4 previous = SMAASamplePoint(previousColorTex, texcoord);
+    return lerp(current, previous, 0.5);
+    #endif
+}
+
+//-----------------------------------------------------------------------------
+// Separate Multisamples Pixel Shader (Optional Pass)
+
+#ifdef SMAALoad
+void SMAASeparatePS(float4 position,
+                    float2 texcoord,
+                    out float4 target0,
+                    out float4 target1,
+                    SMAATexture2DMS2(colorTexMS)) {
+    int2 pos = int2(position.xy);
+    target0 = SMAALoad(colorTexMS, pos, 0);
+    target1 = SMAALoad(colorTexMS, pos, 1);
+}
+#endif
+
+//-----------------------------------------------------------------------------
+#endif // SMAA_INCLUDE_PS
+
+layout(rgba8, binding = 0, set = 3) uniform image2D imgOutput; // written by main() through imageStore
+
+layout(binding = 1, set = 2) uniform sampler2D inputImg; // passed to SMAANeighborhoodBlendingPS as colorTex
+layout(binding = 3, set = 2) uniform sampler2D samplerBlend; // passed to SMAANeighborhoodBlendingPS as blendTex
+layout( binding = 2 ) uniform invResolution
+{
+    vec2 invResolution_data; // NOTE(review): main() divides texel coords by this, so despite the name it presumably holds the resolution - confirm against the host code
+};
+
+void main() {
+  // Each invocation shades a 4x4 texel tile; texels that fall outside imgOutput are skipped.
+  ivec2 loc = ivec2(gl_GlobalInvocationID.xy) * 4;
+  ivec2 outSize = imageSize(imgOutput);
+  for(int i = 0; i < 4; i++)
+  {
+      for(int j = 0; j < 4; j++)
+      {
+          ivec2 texelCoord = loc + ivec2(i, j);
+          if (any(greaterThanEqual(texelCoord, outSize))) continue;
+          // Texel centre divided by invResolution_data gives the sampling coordinate.
+          vec2 coord = (vec2(texelCoord) + vec2(0.5)) / invResolution_data;
+          vec4 offset;
+          SMAANeighborhoodBlendingVS(coord, offset);
+          vec4 oColor  = SMAANeighborhoodBlendingPS(coord, offset, inputImg, samplerBlend);
+          imageStore(imgOutput,  texelCoord, oColor);
+      }
+  }
+}
diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaNeighbour.spv b/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaNeighbour.spv
new file mode 100644
index 0000000000000000000000000000000000000000..fa0208f25069dbd07bff6133f52792e1e769f681
GIT binary patch
literal 8328
zcmaKw37D2u6~|wgjWv~3P+3Jl22?~4QD9`onK2NAQ7FstV;Ep;n2&~Gva&>Tt0!%<
z#Wvel)T}hJGRth=7u(DB=dmo?+)C~Dd*>c`c^{wd)A9b#|19_1bMJk>p_<{d8?u^V
z*{EzpcJ{C=A4g=vGvVm0KA&6HZd$u&&%m}tOHMgahi7Fqxt~6V;Zw(4i|nfO^l5ky
zIUHW@h#aeDd9E5_hl6h+xDJnETpcy&w`pBd)4J+l*Sx{4i&qS89};-|$1-<S`?fKm
z*HKHB4OaSlDqDNIhqzj5lIz_oVa}<{g`7*sIV8EgHk-^`%y}ewIp>s|i=3mg>FDM>
zr@Lpz&aKt{oz?2LGkd%Hw)OPwIDgZ;p{kdx$ZI?X-+}H+x~kRwRCjGQ4X(O<`=0Ir
zYaE-+|3CRQ9L%?8@kuL`Jnx)m$a~x&dF!&d=v~#`YJW%fCBQsx9y9Ig+x9e9o2`V~
zUFomv%6)3Hrjp)V($|&r4JCckA$on*iN3C-V`KZ8&du%39c`%8QlC{0@@*~ct!*7S
zUz1(Ttj#mOnmOM?vR%i#GspGWjm-I8===Jo`&pmeRNyyzMtPrWv;FYJecoKsZ!PJ!
zm-IVI`hi3A`s^O`sAo+&aQAQ@d~pvCp!Zh0*x!i!z3k}iZ0)UXt@O6{4OY7<13lHg
zc6(mOdwGJnC*cO(jlJ>i9}ebTb@!%)TxS|wTr~T8EP7Ah;KuGf)!w~kCAOY5<sFCE
zI=58@Dnn}?m+a<T06utEYqLi5Vh<OU^po^o<E0y_b1`~P-|oEw?YnlkrgfbRx2Lje
zcW-xp{%nT$+H6TF*V04ux@?8HcI{ZdcYu5;S2LV%v2(-5mQ5`koom-`W{rldrNpmm
zX>LD{i>vLdlC6!qH&jDI)>^Wix3*(r6UJ70;XTF-8qzCC_Zz7FLFUoO(MTIBMW1J*
zJ6btUIlqDW+vi!i<)A(jJ?!>Qdr{X8oyXh9qFr6PUug5&FYQHLyF<IZ<A3>3w`b~#
zx_uJI`u0XW^0lM5U%!vz@zQT5R_)>m{I{NYGPxR<#qu-IM=;ANlZY6Z^FzL)!0O@a
z8v4o4B+Hm&Glw?CHO4W!hUeoN<C%?<*FJ&SykVaR)-G?}NzATkoHk`9=jeILyN=ux
zvdm;Q7cDqsgnkrznuSF_Ynh$AhPa08SajEooZ6L%_%2|cfjAZ<y>Up*b1ej0w|h1B
z;-p(o_%8(yE9hIm?jh_KfITO5<1R$9Y~QNIN%yX2*+b2tS91ND=J3ydaN!!M#C>c>
z%ptFc!?rK!av_oD3b_6E&Ry0>pFQ8Z3O=6MpV94ywYt~8FuO1L6-oEq$!Y&<!p6j2
z{|@%d?d?CAXCV5U|1oAcdt>~6neDBd@y<^oj?n7}jyrT1Sby`xo@San%T~pnW}(X|
z)7XjM0Qc&8bHJ`G=YG`vE<`<Z6V_k5cf{|4BXr-r!+W_L{7m=Kd{Ogh$zNY{sQayO
z#2#0pOL_kGz;A^kbiWlv-ET!v_gmpGpJ&mA?%#Kf+*;4hZ-<=XU0urgM~$fYc|Qu{
zx78X)Ge3aXYjw}b-^QZeK(5d|7k?u|H{RdJ(0#}LwuP?U-?q@T``cF3{cQ_fe}At+
z*WceO^(LzJ9{vsSZu)l5NPT!H@mJ!r0Q?U`&O3T3KKA-wh<xPvH`qM(De^oFmXAD-
zfaQvL9z~bmz*^qV#}RYOS6RzD^&dojB^|fF>SBBAy?=}|c#7GxaDJG$VeE4_;?T}N
zA#%I3^|^;yusLpHZNF3X=yJZD$MI43f|<+QV%PNyW8D$S=Ipwom`5WH*Bz7W&av)T
zusLGgXMyEnUG<TOxm;IlpZ$g&0k#KmUcRGoaPpCVJlK8NKkXCH<t`_O`b4lX@_nfn
zlfd$^&SbFb#CaNbB%FM#GX?BEVx6gAxmf2Yurczn&e34`IPYV?t}SnGeGhW3;eD76
zmW#VE11#qn-i_H{xmk#{&t;y6_<iz>%%k7@#9y1@jz>3E-gSL<a^b%aET_#kr@ow9
z;`=`VJe^tI_h*d1FDEA57~idW<U0i%`FxY=5w{Gizj3}#_2vBaVa`**^O=p&=6BU^
zslL8-?dI`2c{<`ZQr+I20k(Is&PuQ`UD);2F6TY<&U)9J_4j?O0(-vu=$U#GSZ+1q
z`r71-k9XjmRUgi5KIh1HCfIu!`PQJzMLunE#>ad1?LCnV%T7h&`|CH_F$P&i9sZ3q
zHetVc{+nA~?7y|G!++;{KNd1CLiCR}<l8?X@sHS7u)}YF{E3L1dBo=NEuM^+Kl1oS
zmLxuWPC>UGee9XKKJGDcI!A5Gz}6P;&e~2z<gM*AW{0)OFGu98O&sg409#w+v9{9@
z`S3XdY;F2jo4P*M7CD{E{htcwn1tBp@ywGGp2&P;0Z##2+fmH^CVEyYk@!vWjG9V%
zb3xaCT|w7>LrLG1bnn?X@G8W5*0YN1tVZM%2UjZJuXU_O;(eS6wszzEW~@Qv6i38a
zueOM50lUwLI}0qQI3mt{X<JVI;moawciB43sok8e-IlmGt9Ep2^B$ec{qsD}M)Y%d
ze)`Bo%(KD9c)k&{7Hmw^q>o(m?i_Hjck9vR6l?ZxT-WkUv_%ij1s8j_5nWCx#(8$y
zBCZ2m?A>|ja!N7IGu9TpJ0EQCtiznzqj#GV7kzjRy0vXVq7Tmn>*sJk`p8Ah^T5Vj
zfJDsm!Nx@I^pPvSkLhrZqmlRy9h0!{V;bU{n8EDtk!N`kav{=*1h+MD73K}xnF-8Y
z$VG@YeYLG8gSKsmcEw?x4(I624zRUE%ucYJ;wZ+9NxwloiT9i58ru=$?3;VhUhILj
z%tsu@BGH363EP9?5}wUGw}9t??alGbelPUB7-=ZzR~Ph!;Oh#y{x=qMzZW+ZbiY;m
z3%dR{m-Jf;`Uvpt1>Jmil=K4y-F)|y^!t+T8O#E|0CAu3271BvBfcNIz;cSiGue{P
zzYlD@KE97`B<^Pwti1+_d%PQL{~8g`OI^RH<%Qs4E&b@lTK1sJDXwSksAT|bygubx
z_QGkm_NZkLY%NO>Yf;xPYIzYjY8eH-1o6(qdM^g+BOmKU&t3wjU2$lSbzcg0-Qrum
z6x|qywdf-kF)srfvjvI%z8q|fdoo5Jxj4tmz{cE=-t^^QxqZk1xO(PSAo5or{+?gO
z?C|$o+m*<x5#N9~dUFlfccN}@u0@Kyc_q3&^3fY>^sQ)%8eavLdlE0P>$qO*`+Bf@
z@V*!`lKC}=eE7T;tdG43pVxuq$8b-Lc|BsEJWu1bNBkSW#(NJU{sypo_`DIUk7pJ>
zZvxB58gB;28pdnCGxhl`U~ACFKKPB0i~Mf|muq+%oP7AaJ^2)Ccn6$(e2?D=_6-{A
ztljne25GbQ3C!<8TrckHyAvnx9(-5E`K{7socHBDh_#3Q-je>llK%drTi-bF2M}`@
zr?2+d-v`0gR6Oesq05KQhm%k7tUm%LA3gXe*f{sE&Dp)|XLkMQ!7X6>qU~m8b@8pt
z=7`_+kAbHm#_Oj&a@>|U<IJTVz4<t}*sD(@-G6H~fIo@oBX9iei8Fo__*01e*LFwJ
zW8F`q+p9RgJHc{_!#8K#0cLHnm(PF?M_kLC+GD-XCeAq5RS(<ez;f{|`8-(eE+l;J
z2J7Sgtp5v$eoFj1_lw}W5pA)KT<q&hVEY(<bKDEQ5a}#%Urt<wInMkmV12det1Wu-
zRj_u&F${_PUjvu(e?4)<{NI4nS6ey%H{rA^4)b4@&hT4c?eWe2HrRR`!F>lTXIyaK
z1sm%K?t5T4{bJwW2b<Fz#^@90@dNN2B>uMgAy`gvn8)E9e_Q=1>5G{?3-cT2S>3~|
zJ>KGv!8aj|h-;2y{s|(lK8yLM$lRp+-T4_}o;aJICr;jBE%E#M3$XF}c%Oa8*K@bb
z^Gig#>si}<iHq8Pg>G$2k*Mw0VEL%+esHn2-=G(3`z^Yh;`%j6)b=~@{fIv0+I|nG
q-Su3{djEh}<K?OTKPFD!s9!Gp{{+_G-$k*%i&rr}MTh+V9{vYTn-BW{

literal 0
HcmV?d00001

diff --git a/Ryujinx.Graphics.Vulkan/Effects/SmaaConstants.cs b/Ryujinx.Graphics.Vulkan/Effects/SmaaConstants.cs
new file mode 100644
index 0000000000..a5f060f1bc
--- /dev/null
+++ b/Ryujinx.Graphics.Vulkan/Effects/SmaaConstants.cs
@@ -0,0 +1,15 @@
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Graphics.Vulkan.Effects
+{
+    [StructLayout(LayoutKind.Sequential, Pack = 4)] // Sequential layout with 4-byte packing; do not reorder the fields.
+    internal struct SmaaConstants // Values SmaaPostProcessingEffect passes to its pipeline through Specialize().
+    {
+        public int QualityLow; // Set to 1 when the quality index is 0, otherwise 0.
+        public int QualityMedium; // Set to 1 when the quality index is 1, otherwise 0.
+        public int QualityHigh; // Set to 1 when the quality index is 2, otherwise 0.
+        public int QualityUltra; // Set to 1 when the quality index is 3, otherwise 0.
+        public float Width; // Width of the texture being processed.
+        public float Height; // Height of the texture being processed.
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Vulkan/Effects/SmaaPostProcessingEffect.cs b/Ryujinx.Graphics.Vulkan/Effects/SmaaPostProcessingEffect.cs
new file mode 100644
index 0000000000..4dcdaa646c
--- /dev/null
+++ b/Ryujinx.Graphics.Vulkan/Effects/SmaaPostProcessingEffect.cs
@@ -0,0 +1,366 @@
+using Ryujinx.Common;
+using Ryujinx.Graphics.GAL;
+using Ryujinx.Graphics.Shader;
+using Ryujinx.Graphics.Shader.Translation;
+using Silk.NET.Vulkan;
+using System;
+using Format = Ryujinx.Graphics.GAL.Format;
+
+namespace Ryujinx.Graphics.Vulkan.Effects
+{
+    /// <summary>
+    /// SMAA anti-aliasing effect that runs three compute passes (edge, blend and neighbour)
+    /// over an input texture and returns the processed result.
+    /// </summary>
+    internal partial class SmaaPostProcessingEffect : IPostProcessingEffect
+    {
+        // Dimensions, in texels, used to create the area and search lookup textures.
+        public const int AreaWidth = 160;
+        public const int AreaHeight = 560;
+        public const int SearchWidth = 64;
+        public const int SearchHeight = 16;
+
+        private readonly VulkanRenderer _renderer;
+        private ISampler _samplerLinear;
+        private SmaaConstants _specConstants;
+        private ShaderCollection _edgeProgram;
+        private ShaderCollection _blendProgram;
+        private ShaderCollection _neighbourProgram;
+
+        private PipelineHelperShader _pipeline;
+
+        private TextureView _outputTexture;
+        private TextureView _edgeOutputTexture;
+        private TextureView _blendOutputTexture;
+        private TextureView _areaTexture;
+        private TextureView _searchTexture;
+        private Device _device;
+        private bool _recreatePipelines;
+        private int _quality;
+
+        /// <summary>
+        /// Creates the effect and uploads the embedded area and search lookup textures.
+        /// </summary>
+        /// <param name="renderer">Renderer used to create textures, samplers, programs and buffers.</param>
+        /// <param name="device">Vulkan device passed to the internal <see cref="PipelineHelperShader"/>.</param>
+        /// <param name="quality">Quality preset index: 0 = low, 1 = medium, 2 = high, 3 = ultra.</param>
+        public SmaaPostProcessingEffect(VulkanRenderer renderer, Device device, int quality)
+        {
+            _device = device;
+            _renderer = renderer;
+            _quality = quality;
+
+            Initialize();
+        }
+
+        /// <summary>
+        /// Quality preset index (0 = low, 1 = medium, 2 = high, 3 = ultra).
+        /// Assigning a value schedules a shader rebuild on the next call to <see cref="Run"/>.
+        /// </summary>
+        public int Quality
+        {
+            get => _quality;
+            set
+            {
+                _quality = value;
+
+                _recreatePipelines = true;
+            }
+        }
+
+        /// <summary>
+        /// Disposes the pipeline, the shader programs, the sampler and every texture owned by the effect.
+        /// </summary>
+        public void Dispose()
+        {
+            DeletePipelines();
+            _samplerLinear?.Dispose();
+            _outputTexture?.Dispose();
+            _edgeOutputTexture?.Dispose();
+            _blendOutputTexture?.Dispose();
+            _areaTexture?.Dispose();
+            _searchTexture?.Dispose();
+        }
+
+        /// <summary>
+        /// Disposes the current pipeline and programs, then rebuilds them for the current quality preset.
+        /// </summary>
+        /// <param name="width">Width stored in the specialization constants.</param>
+        /// <param name="height">Height stored in the specialization constants.</param>
+        private unsafe void RecreateShaders(int width, int height)
+        {
+            _recreatePipelines = false;
+
+            DeletePipelines();
+            _pipeline = new PipelineHelperShader(_renderer, _device);
+
+            _pipeline.Initialize();
+
+            var edgeShader = EmbeddedResources.Read("Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaEdge.spv");
+            var blendShader = EmbeddedResources.Read("Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaBlend.spv");
+            var neighbourShader = EmbeddedResources.Read("Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaNeighbour.spv");
+
+            // The binding lists mirror what Run assigns: uniform buffer 2, sampled textures 1/3/4 and image 0.
+            var edgeBindings = new ShaderBindings(
+                new[] { 2 },
+                Array.Empty<int>(),
+                new[] { 1 },
+                new[] { 0 });
+
+            var blendBindings = new ShaderBindings(
+                new[] { 2 },
+                Array.Empty<int>(),
+                new[] { 1, 3, 4 },
+                new[] { 0 });
+
+            var neighbourBindings = new ShaderBindings(
+                new[] { 2 },
+                Array.Empty<int>(),
+                new[] { 1, 3 },
+                new[] { 0 });
+
+            // The sampler settings never change, so create it only once. Assigning a new sampler on
+            // every rebuild would drop the previous one without disposing it.
+            _samplerLinear ??= _renderer.CreateSampler(GAL.SamplerCreateInfo.Create(MinFilter.Linear, MagFilter.Linear));
+
+            // At most one quality flag is 1; all four are 0 when Quality is outside 0-3.
+            _specConstants = new SmaaConstants()
+            {
+                Width = width,
+                Height = height,
+                QualityLow = Quality == 0 ? 1 : 0,
+                QualityMedium = Quality == 1 ? 1 : 0,
+                QualityHigh = Quality == 2 ? 1 : 0,
+                QualityUltra = Quality == 3 ? 1 : 0,
+            };
+
+            // Constant ids 0-5, in the same order as the fields of SmaaConstants.
+            var specInfo = new SpecDescription(
+                (0, SpecConstType.Int32),
+                (1, SpecConstType.Int32),
+                (2, SpecConstType.Int32),
+                (3, SpecConstType.Int32),
+                (4, SpecConstType.Float32),
+                (5, SpecConstType.Float32));
+
+            _edgeProgram = _renderer.CreateProgramWithMinimalLayout(new[]
+            {
+                new ShaderSource(edgeShader, edgeBindings, ShaderStage.Compute, TargetLanguage.Spirv)
+            }, new[] { specInfo });
+
+            _blendProgram = _renderer.CreateProgramWithMinimalLayout(new[]
+            {
+                new ShaderSource(blendShader, blendBindings, ShaderStage.Compute, TargetLanguage.Spirv)
+            }, new[] { specInfo });
+
+            _neighbourProgram = _renderer.CreateProgramWithMinimalLayout(new[]
+            {
+                new ShaderSource(neighbourShader, neighbourBindings, ShaderStage.Compute, TargetLanguage.Spirv)
+            }, new[] { specInfo });
+        }
+
+        /// <summary>
+        /// Disposes the helper pipeline and the three shader programs, if they exist.
+        /// </summary>
+        public void DeletePipelines()
+        {
+            _pipeline?.Dispose();
+            _edgeProgram?.Dispose();
+            _blendProgram?.Dispose();
+            _neighbourProgram?.Dispose();
+        }
+
+        /// <summary>
+        /// Creates the area and search lookup textures and fills them from the embedded resources.
+        /// </summary>
+        private void Initialize()
+        {
+            // NOTE(review): R8G8Unorm is two bytes per texel, yet the eighth argument (passed as
+            // info.BytesPerPixel in Run) is 1 here - confirm the backend does not rely on it.
+            var areaInfo = new TextureCreateInfo(AreaWidth,
+                AreaHeight,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                Format.R8G8Unorm,
+                DepthStencilMode.Depth,
+                Target.Texture2D,
+                SwizzleComponent.Red,
+                SwizzleComponent.Green,
+                SwizzleComponent.Blue,
+                SwizzleComponent.Alpha);
+
+            var searchInfo = new TextureCreateInfo(SearchWidth,
+                SearchHeight,
+                1,
+                1,
+                1,
+                1,
+                1,
+                1,
+                Format.R8Unorm,
+                DepthStencilMode.Depth,
+                Target.Texture2D,
+                SwizzleComponent.Red,
+                SwizzleComponent.Green,
+                SwizzleComponent.Blue,
+                SwizzleComponent.Alpha);
+
+            var areaTexture = EmbeddedResources.Read("Ryujinx.Graphics.Vulkan/Effects/Textures/SmaaAreaTexture.bin");
+            var searchTexture = EmbeddedResources.Read("Ryujinx.Graphics.Vulkan/Effects/Textures/SmaaSearchTexture.bin");
+
+            _areaTexture = _renderer.CreateTexture(areaInfo, 1) as TextureView;
+            _searchTexture = _renderer.CreateTexture(searchInfo, 1) as TextureView;
+
+            _areaTexture.SetData(areaTexture);
+            _searchTexture.SetData(searchTexture);
+        }
+
+        /// <summary>
+        /// Runs the edge, blend and neighbour compute passes on <paramref name="view"/>.
+        /// </summary>
+        /// <param name="view">Input texture; its size and format drive the intermediate and output textures.</param>
+        /// <param name="cbs">Command buffer the three passes are recorded into.</param>
+        /// <param name="width">Not used by this effect; the size of <paramref name="view"/> is used instead.</param>
+        /// <param name="height">Not used by this effect; the size of <paramref name="view"/> is used instead.</param>
+        /// <returns>The output texture owned by this effect; it is disposed and replaced on a rebuild.</returns>
+        public TextureView Run(TextureView view, CommandBufferScoped cbs, int width, int height)
+        {
+            if (_recreatePipelines || _outputTexture == null || _outputTexture.Info.Width != view.Width || _outputTexture.Info.Height != view.Height)
+            {
+                RecreateShaders(view.Width, view.Height);
+                _outputTexture?.Dispose();
+                _edgeOutputTexture?.Dispose();
+                _blendOutputTexture?.Dispose();
+
+                var info = view.Info;
+
+                if (view.Info.Format.IsBgr())
+                {
+                    // Same description as the input, with the red and blue swizzle components exchanged.
+                    info = new TextureCreateInfo(info.Width,
+                        info.Height,
+                        info.Depth,
+                        info.Levels,
+                        info.Samples,
+                        info.BlockWidth,
+                        info.BlockHeight,
+                        info.BytesPerPixel,
+                        info.Format,
+                        info.DepthStencilMode,
+                        info.Target,
+                        info.SwizzleB,
+                        info.SwizzleG,
+                        info.SwizzleR,
+                        info.SwizzleA);
+                }
+
+                _outputTexture = _renderer.CreateTexture(info, view.ScaleFactor) as TextureView;
+                _edgeOutputTexture = _renderer.CreateTexture(info, view.ScaleFactor) as TextureView;
+                _blendOutputTexture = _renderer.CreateTexture(info, view.ScaleFactor) as TextureView;
+            }
+
+            Span<GAL.Viewport> viewports = stackalloc GAL.Viewport[1];
+
+            viewports[0] = new GAL.Viewport(
+                new Rectangle<float>(0, 0, view.Width, view.Height),
+                ViewportSwizzle.PositiveX,
+                ViewportSwizzle.PositiveY,
+                ViewportSwizzle.PositiveZ,
+                ViewportSwizzle.PositiveW,
+                0f,
+                1f);
+
+            Span<Rectangle<int>> scissors = stackalloc Rectangle<int>[1];
+
+            scissors[0] = new Rectangle<int>(0, 0, view.Width, view.Height);
+
+            // Both intermediate textures are cleared to the same colour, so share a single array.
+            // NOTE(review): the clears pass ComponentType.UnsignedInteger while the passes bind these
+            // textures as R8G8B8A8Unorm images - confirm the component type is intended.
+            var clearColor = new float[] { 0, 0, 0, 1 };
+            var allComponents = (uint)(ColorComponentFlags.RBit | ColorComponentFlags.GBit | ColorComponentFlags.BBit | ColorComponentFlags.ABit);
+
+            _renderer.HelperShader.Clear(_renderer,
+                _edgeOutputTexture.GetImageView(),
+                clearColor,
+                allComponents,
+                view.Width,
+                view.Height,
+                _edgeOutputTexture.VkFormat,
+                ComponentType.UnsignedInteger,
+                scissors[0]);
+
+            _renderer.HelperShader.Clear(_renderer,
+                _blendOutputTexture.GetImageView(),
+                clearColor,
+                allComponents,
+                view.Width,
+                view.Height,
+                _blendOutputTexture.VkFormat,
+                ComponentType.UnsignedInteger,
+                scissors[0]);
+
+            _renderer.Pipeline.TextureBarrier();
+
+            var dispatchX = BitUtils.DivRoundUp(view.Width, IPostProcessingEffect.LocalGroupSize);
+            var dispatchY = BitUtils.DivRoundUp(view.Height, IPostProcessingEffect.LocalGroupSize);
+
+            // Edge pass
+            _pipeline.SetCommandBuffer(cbs);
+            _pipeline.SetProgram(_edgeProgram);
+            _pipeline.SetTextureAndSampler(ShaderStage.Compute, 1, view, _samplerLinear);
+            _pipeline.Specialize(_specConstants);
+
+            // Uniform buffer at binding 2 holding the input width and height; shared by all three passes.
+            ReadOnlySpan<float> resolutionBuffer = stackalloc float[] { view.Width, view.Height };
+            int rangeSize = resolutionBuffer.Length * sizeof(float);
+            var bufferHandle = _renderer.BufferManager.CreateWithHandle(_renderer, rangeSize, false);
+
+            _renderer.BufferManager.SetData(bufferHandle, 0, resolutionBuffer);
+            var bufferRanges = new BufferRange(bufferHandle, 0, rangeSize);
+            _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(2, bufferRanges) });
+            _pipeline.SetScissors(scissors);
+            _pipeline.SetViewports(viewports, false);
+            _pipeline.SetImage(0, _edgeOutputTexture, GAL.Format.R8G8B8A8Unorm);
+            _pipeline.DispatchCompute(dispatchX, dispatchY, 1);
+            _pipeline.ComputeBarrier();
+
+            // Blend pass
+            _pipeline.SetCommandBuffer(cbs);
+            _pipeline.SetProgram(_blendProgram);
+            _pipeline.Specialize(_specConstants);
+            _pipeline.SetTextureAndSampler(ShaderStage.Compute, 1, _edgeOutputTexture, _samplerLinear);
+            _pipeline.SetTextureAndSampler(ShaderStage.Compute, 3, _areaTexture, _samplerLinear);
+            _pipeline.SetTextureAndSampler(ShaderStage.Compute, 4, _searchTexture, _samplerLinear);
+            _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(2, bufferRanges) });
+            _pipeline.SetScissors(scissors);
+            _pipeline.SetViewports(viewports, false);
+            _pipeline.SetImage(0, _blendOutputTexture, GAL.Format.R8G8B8A8Unorm);
+            _pipeline.DispatchCompute(dispatchX, dispatchY, 1);
+            _pipeline.ComputeBarrier();
+
+            // Neighbour pass
+            _pipeline.SetCommandBuffer(cbs);
+            _pipeline.SetProgram(_neighbourProgram);
+            _pipeline.Specialize(_specConstants);
+            _pipeline.SetTextureAndSampler(ShaderStage.Compute, 3, _blendOutputTexture, _samplerLinear);
+            _pipeline.SetTextureAndSampler(ShaderStage.Compute, 1, view, _samplerLinear);
+            _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(2, bufferRanges) });
+            _pipeline.SetScissors(scissors);
+            _pipeline.SetViewports(viewports, false);
+            _pipeline.SetImage(0, _outputTexture, GAL.Format.R8G8B8A8Unorm);
+            _pipeline.DispatchCompute(dispatchX, dispatchY, 1);
+            _pipeline.ComputeBarrier();
+
+            _pipeline.Finish();
+
+            _renderer.BufferManager.Delete(bufferHandle);
+
+            return _outputTexture;
+        }
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Vulkan/Effects/Textures/SmaaAreaTexture.bin b/Ryujinx.Graphics.Vulkan/Effects/Textures/SmaaAreaTexture.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f4a7a1b417766c12bbac4e4bdc56796f18538bd6
GIT binary patch
literal 179200
zcmdSChkqN_mHs{GL?MVqfW3FHfnW!V2!g%$UL;B)B~rcj*s?5HvMgIJaxaPFIB^oE
z$4Q**W;dH`Nwy@L&2IMd`(NJY+?hck0nA9^T7EGfTO<ZE!~4Z^&bf2v-g7uk;*~A2
zUt;%T9?tD)?P&G0|NLS9($A5<2QGCi`6oL@`~&_ze{V;RzuVu1Ex@+Uj!wU@sjua=
zx?8JTy{&bv4Xur>P1u^T`PkOdDsFqO$-AyZ^Es}aGe~|W&IcazaIQlpzkNAFWxdq_
zZ=fzv-`U`A>}>Kk`J1u%XzTF#TlkGzGW9t-GduG36t$POm$z58RkeBAYTCTmYTN3t
z)weZZlY8AbP3LzV1VNSLr(%BKF%Or`?}%~6Ig;6**IU$6+U*Wh1gZkn0Z*W&)7x3w
zS?jOstoPUZ8~hF2hITS>z|oT$$n4DN$nPlHQ@p3N-3^iz?Un6S?bYqojOVWLJIO6_
z6`Ud_Ac-B1c{s=0F4<C~y>9>gq;@57+A)$kl-Zxtm)}#^UEE#TRn}Dwl9hp~&T69B
z37Xsy^@4uPJZSHAb*F)0wm;8b&{5b?yr*PO>7KG6Nx9dRXg<fC=MHndT%P3TVtn8+
z50|aKGphaidGn-w%r%@gnAw-zm)A=ai@Hm?z%k$s@FXj_<H}Xtv~e_P(AMYdN$F1S
z$_!+8<_Z)^jXcL))89qz1UJVubLlYvNxc1-hs)-7PPw67G0rAU+QyPcQU=opGW)ap
za(e}eUBz7jNA9F@zh)s}(lnYhWb1eIx_VN1hJhS^kRsH$Yy2*AXSg-4pR4Aa5~!l?
ze$2yV^E<EH)a*-GFwGD_$FOTCl`+if%jpHh?t*U8VsTe7cUo~!y{wxtOeBt2hHL|l
zK4))AcUl+eFe{M5l$Z~T-bH?7g42j1C7eAfqPTBfW)9$y50}mFg5rq!fNt3^pE%73
zjyQ)?2Ga&ehgp5u#1QtJ-(AR^Q5;h3)y^kO8^;BL{Z6LBw5|-sFq=`_HGWsP3*1p|
znH%65xO~nM7gm&=k9j!wg~tssj_}bV)Gze6UQ`@Yt!q~kmW*?W)0PSAm}4}VsgN=3
z1w&qm+*!q@YE84Go7GR6#?8>6b<p0Q+~?{|VU{d1lzZJMO^$r)3U`q^$?fMRIX`ST
zn=?o5mip~650}mFlH!E&kY-J{qF*r0nx{Z;+zx`I!qmaE{)~alKE{wc$89MOs8^uC
z8N;M$EJP4?OqL992!>s|#_uY3i95|5;ufGlALm9KNt9w1zW*@~m(LG@VN<<dyEkFU
zFmIX;5`-PQhEfL7uo1&d@H?tJsNSnt)XnLqjT4DuNh6kFXmG&MpWG{`5Da&b-(~JB
zcZA!^O>hCOjw|L;Ia4^H_&;Ce9_w(~{4R5+lv}EUP~ftD(Ku_G5eYg*V8??@g~Sm2
zjwv@_!OKj6Qy@4RMv$TjRG8YmtNd_H?f`l*gIpVII3JPN7@t~bFY$Yf!ykg*Y2{JX
zhGswcK*PLoHZiC{c)}sFW5y8S>^LIGe)X!BS0KD#h#)+npu%0`cLmox!EGRhjB{OF
z6IagVaLID-im!5yb+~MPR}ff_DG#X+X!ns1G|rpC2-=}RiY8!4ZBZOnkp(Y7fwOv2
z;JA4-X_#3t#S%mlXYa1^yTF~|Ho1M=3>3JBtK~|$Ec9jcA(h4ciTFLn;Sa&@tm3%x
zuxeeiMhXPMInzwylwidph$UbM72=OMq1b{4+^1P#3Y;+z!7(CeB|pe}!v0<5cM*N)
zqp;uwZj9?j6shKl5JzlWLTqm09^&^Hhd&g*6Yzj!!F$OE!iGW641(5i>zHlKK1!Z2
z8T?Kvj*th01uy9qpulP4q$!M`jS-a1FB?7MgXp20M>8*DyMpa%*mi}R<&MLJT|gX;
zwqFDl<KJwF^GofQ+m0*TImJok5!E5}K`nYQ2^2+O!$dG?(mG}xX9P!y-zo4rq&ff#
zUJfd7GLa&&pg|%i_qu4d0R$Iuk8uJjJCE~U3?oJK0^Ah04*i{sx<3Xt@okL%#XY~&
ze!1;{mj%D0%z_a^-~$&8<OOG_FO!7cAWzWF_#Fj5-hxF1GB1cY!V?^_4a&VPYo`h$
zRK19=YoJJ+PNT<iPGET)^l}h)N4fc^`$bSO{$=BTan3KbU+#U>Rq#6new(o11F+x~
z){kK}j7U5Q4e|uR?=*Ta6hYQBt5Dzq`hgThreMPqi3NgkuPZcLff1@^j1o*E%I-&R
z`3QO}ClEExfa5yoCF1Tk`u-T)#J7d<znJHj+Ap^q;D@1sW8?wV2Q>SLpnj3!F!co~
zjtB%9zvGHc<vRK>dl5r;1yU3dy&xkf_qt*`@ozffLObp<CJ`t0B3f)9<{riMjzW7S
z7(tD`KkNk2eheG`i+a4&{@f+ghW3d1gz}{FwBn57tl}KLQL+v4G@<RHe~)X*w3={G
zb69;ubxe6ec}j6gvHb`->XN)o?sX&D>;$I<+~*G9USkeQ+y{!6V9)bx{7->?^UnPd
zC)oK*#Q0y7<E8ey`<xSr3;I>ve$Bf2kZM!41tUGCJgzu_?Iak15-jq}jI*LV-Dm4c
z95PNM%;*+0%bLCFHT412x^hE#NV%!pQXEknRUDH?kGmxIy4zZ8$K6FWdME+(^v2QO
zTR_YWAOG9Af7=PR{~CEXdX3x8FSXy(nK|GXGfx{A5|(v)wQHIK>UH&o>X7O%NFKq)
z(>%V{<V!Cp$*gfUTRY4>rUAo<enK~;oz={17B$Q2RW)d;_A3v{<tO*L+uMr~s%(rB
z)WPR<f#DeVMUMY%-@oGok#B52eEa#O_Io?>x>E=3qe+vd8N+<Sl5R!2SF@(suRe&4
zI1)*yYhBl6tH~@X%BoDMbu?SslLDq5<A7mEKc*W8#~BSHsa}%HPwsUiTg!ek!_yT*
zg-y8E=w`PMk>h`n`=gv7{H-YCe-Xz^?XTNY9LVf-4cbPNCK9KOv-<gjMcuM?6%_Yr
zV4sZRx@u!nGnD9aR%R6zX1mio&U$;Zr7g*y*k$T5^cx2CBRb+ZshyU~PwsWY+Zy~i
z5o0X*xX-PHB}W^;3*R3+DEbf44o5T(o?mKzLwjjwZg*OrbI3Mo8BZh?&M}5d+La*1
z{ptfp)f0veYi)`<yPzPuG`-U0O>VF?TUwL$BzBs*OudZaux>;yKe^Y9-fE0cInnDY
zg$kn&;Dz?bIDz<f^ut5vm)hUhUfz)($m~h&cMRD^EaT=$)3h<j5K1I_h8B;jW)phM
z%??jmDG272WK^VjoV5;~VY}IH3YfZ$(4t&^a<414bH~3%jIm_lJ~!3?p16O<fst>-
zIvgWl?EF&uo7yWoiaK+;(|cV5jNq7gB5}${mOQ6lNLbJ<F@~!rlnc5+Q@gd^S(#p%
zQ;?rilIc#Za@9ENY{al7somUR@|(Kk@{@br$ks;uW<az|XZ)fM;PLw-4%zujobf+)
zywv`twyHfP{=6<Iu+IsC))C7XJmI8i3Mv#C!XutiE^9^%fuv@;H>Dz@I43_pr#Q1L
zts<q`>2=iE8X3d3qz>8qWL|e?i^Ja0V~i!<03Pj^c*Q#$BVabE{mpHjJ*EEqKsFSZ
z(w{tNCxW9%<G9}^6$TleR_xPEB=nfttPRPYRCgvLSd>|oUY=UzgeBXFp)biNm!I70
zN;E%#@rylxCpm&ed}1HYB}naWZmVf`cNBK!c4c;_^}2}QFi#M6Oe!Qxh7Qjt_N%9L
z14e&Rv#r)ym0Ffr1P$gEWtOJ9Q!Amun&dhg7|Q1-@46EFIL7ZW2Jnbayu+pT``WyS
zAjST?Kz3IKY&eV{c|sz{Ecq-Ri_dC@3|)zBmIg<St0D~qbMx{*uq4BsTAo_%WR@(K
zpWN$8@_WJ`pmtQi0~Q2wm;zJC3z7zhVa4PL$&R7IbBcA<yk<<_YuW<^);X(F%F|0l
zg2kEcG^o&3?eshhKRMTZ!gwSv?`x}VuiR5g797YT1(FvO2wG{hi98`xcuujQT+)o|
z`VF0lt(Hcc7dGrpFA)f4!4rZYqKSNd@~$h{fG7NcmbSX~sy$^LMX+E}AfgC)!2vv{
zpyw4L!HIL+Vdb)VN;hce;uYxdI4e`#8O51JdAWJHg&;_t&@Gpr-0MDJJd&5Ec2u{!
z_Z0gJcm*Pgz=kP~gb2bDp651|tLho;2rSrdZh-=8VZ#-O#6?+!B0+e<hv6sZx{?ie
z!XId9s|UaGJtZB5EQUaVybW(7c%D0=+^e3|jKP8frgrjyjv8lms+%Hlh+wgNe)6vS
zgz-pTz6EoUJ%}Ksu;6@HFnk~>kZiagab(aYSTXTiQ_pKAV8Pv{j--~PMv5Y2!)b^k
z0>SJe`TXQvSF!<5_yfeR1`(tTeHgOfZ1RCUf(=s~8R7|^=Z-24s1`Mox?z2<vD4I+
z<g?Wyid4H`!xTq&f&~x5PtJ9pFdoUv6F)CzcDn@&<`tOUOK})pkm3j<c!4{nIH+7w
zPiseD!Cg?Gj}(X~Qk{aZL2)Fz0Dkac_{q7hWCNb?2ljDW?13OXfaV{qh8{y-!Qig=
zsQM}HY3@4r6#k=)eh%Nn_pjrTe*h23_Tr%!J={6Y9+F}B9}gVqk)!x%<vc#MhAYz}
z-Dp>)$K?E#pD-TD%MTi7v@5DLJjB>!4=_%E<|%sM!CHKlJxHQQNl(k<$L(>s%1u69
zmu5&kp_)-HDwh>|6>Exvigm>X9z1S>+7XcL$Qd%uYL_wOPgmYj98-i}nW)mksPmG4
z|C9bePx46Ow0=&zqS?nD2C+v$N<0Ws&?BVd3ZCX^L35(cV(X8aGL@&N7u#wQn~fa_
zJ-R{7h-N}Pt)5lQE0>hZ%2g%N+^^^;>v4`GPU;ufmG|Q@RQQ$Y+WeKDG#{z!cV+fF
zM$MDPISkulz!?ud@JLOyp*&2F)Ue_45hxy4@HQ)h-)e%-R+5=n<gB#Tnww4Sh5#rI
zXooaonsN1%dPY5~o>wg@{ncG~a5Z9{Vpm?qAZ_@SH~9xwxUSUS|D^W^@_JK;f{%}g
zA^-S@Q4DfCqLj_=pr$La##xk=RhZ&-c&znF&89Y^-_WJ+(e>*FwIjq)GpX+I(v=78
zW9-4zT-bvvy7Ic}knjyYX+Bcd?=0%c>~{^>&<^(aXo0n4Rf~s5f+C<sw1$rsGW2&?
zJ!EXQR;CnYXBVWEIxFp7YeQ0VVykJ7p%WB)iK1>;+fheX?sE;ZD?^1C?52l*{FT{*
zWwL|ESlK4k^4;0rS=yD;mo|_*BD7=HFdu9QJ%Zbh2f_zb2UW899Z`)ZbeL-$?z93B
zOfPX&BztVN7J(s@7%+50iyifyrQJEbX@l&_?9pG?l~;pT-knzd-<_q?-Id>)*_Sfp
z071)ydCD|noI^_%NeyJpyb?9C`5jZv=z2`eR*$nZy&xwiKcgho?W}Ut*y=2eNzLXK
zQ@g1H6gwK|%Fuq)D=+Kt0F_-?VtO8zogIIApcFZ&>4Q?|@2m(Ec4zmd54eULqd|hR
zMyL=B;Sc$iXl3)mgC@;@p*^YIUYSywkq?5I#i?bk3TL&=%NY8=u+7xbNLTL4?qgRT
zv5Mave`T5s!CzTw`XASBFvOc4D8)~5`k=)5{Z)bD?mXxqb<jC%AF+;;1{0@EvrL6x
zsH2vw2>ir;i}){3HPo-Z&SCZ>9yBEcOwAT=az$!MCL>ssR+dr@6~dBjbxDo5cj;)N
zD|hAfvMb}ei@5R}=63N{ruN5~p2uNsYP(7mdwQT0KgsEX66g0<2THs0yGaKrgUN`*
z!FIro$*T)3;Yo?y#DAmm+p8Ye^%&cd8f+e?JGEFKm|hAyu5?!0Vac|7;)g2-@_U3U
z4@X>ihSxtB^6if~J&!|oAM1Y=idfSFrT9rsACx%1za~)DRoI;i9i$Do29rbWfG316
zoF}gihO+sc#)GCQZNI^v*lekFRJqF1iZb(abMrEb(%}iA!s=wQ<c?;#GJJ<{<)Mfx
zlm7WDhuR-?dS2N2-x&KtCy4r!<n+Pd1i<L$_t$im2Z~@vqyw_yWXz@KHHcU;P0@t4
zL^i)O7^a!k4C%WtMAKlaaaN?1r59!9F@hPz7%yZhbPzwfayPp&^B!SW#xxS)%HfTU
zGChyO!?WP}->Cb;PY~s2$?1a<=l6Tz$4d|ib4Y>xDIn+w5u8k<c0db8+5FBb)|K;`
zQC*K=4-{But9F*BmIeuC!V{*t5lxUWA?(WNK}B475<Zx}a%7WtOwZ#Y=l@07A9;cu
zzeUdPi*kI#2X^lF*LGII1CkwO^`%1xE?AOdlr(4^PvT<<wL~_*bKHh<k$=!+YJ~#5
zjw*P;k_^&dUT#()2*MMlfM3{^5&t8ujNUwpVlz8dvhA4Ow{3#nj{Q4`MZU4^@a@z8
z==+J7U%37JXhjwL5bUTY8~s4ifeUMi>^#9xJ061HA$riH9fSq9n@NFq(1amE3=n3C
z1mOu8zpBo%F5$}XzF}9!dI*7FWNYcSi0OG8n?4XR|8Lv=ohOKRXXpIBZO8BUK;(X8
z&ZtKAg5s_MQebZe)&P<v5kcE1t?Qt6z!UQP#Q(>MzvuV|O_~v1ufcDkVg6bN*|0mc
zD6=p~Fa!K><xb(sJz0I*u1xBuD@Qa|_$6$59`W0;{x@QOv=fBA7v=Eq>3@>v_t%48
zIXqw?d2*(})WH<kFnMzj6x$(^i`1d`u6SRJ{vWOXjd;)m1>!*yY?vZ(kRbTs%7`HB
z%IL+A_Jdc(y^wI_u%?Fn7&g5x>IA*8{b5H%`!VX_q3M4#KP1Zf;?Vh#wZ(%7QVKgl
z3<>IhdBG8Ts2!2q;^+?#niTsmRbgB=sP8s*Fa=T+sdTwn9N`IOceI3G8F#{>0@;;G
z{XD^FZKn5v)B9pf&<pO5bpql27>CO~KXTsGfL|FhXykY2hIGJem?E*z4pC{59L0CT
z^hXbx_A3|EliFd#U_U)*N~%K?sdnN)6N@8-+3<tl7jb3up9BTcm2p2JTse00h3S1U
zC+NwV{s(!*TK_BW{BrYO?Uk9oN!=<l|CP+VQL*`PG(C@>e-&f>uZ-z`;1_%SublJC
zT?e!!H-FO?n*R#Z`{GW}lbrq+|8QCWD{+3g{AA~E`l9n+Nlnm;Ct&Q8JY3=fa`RBh
z&foL}=f9E~pOe6c@k5FA#S$lwt3TQKo4)A$R}$lM63qv^`0IbAj+e_%cK)U>I{%f#
z_?%SxrPlXK9WR%k?EFn%bp9)e@j0pXORevfI$kb6+4-Bk==@g_<8xB&4;~(CeX+y|
z<gTOV^Ea{FCGqn&ebM=^B*y0?njb!V$UVyXVcGoT=WqIA^Iu7f&q?u%IGo$L{#WXF
zx%?g?f72J6|4L$fPOAOe4(E0b0LDLqe10<XH+|9huO!Cj;<rA^@ZpXD#`tHD%kLrb
zH+`}Buc8d#J?{8l<N)Aq@spXq>5I>Q6?c3te(UM}Ez03s_yA!1GsxH9S!7U@mA~nW
z&VLm(f#2hf|AhtscZ;9w{7t)?|0@2UPXdZDfcLoLf5;m}>x(5`^7Z!+`I~k(|5f}J
z(D=yz(fA)fz!?AXa`{E(Z$dvhcK)VQirvnC6|V`zFXjT?#~uHZ&VVQP{_UaiH|>7@
ztJnl3#^+)m$nSZ~!;$}w6v(SE^zY96#aHnqpGF47EhL0N7F5bYDP#c@a<B;*u;^pR
z;Q4=%|0+6)r08$UeYZ;P2V;Cr?(>g>*M>Iw7JH_;#=3?BgPr}IeVx6XJ^mhlH#VvS
zBGdwzxmGz_JJitE)YIJM3-|&powNnZ6VTViwl}T~&GrrT^mX-gbp-;Qfll`F^REtn
zaEtM0hu+TSG@)M}{-+i0@xB1L`un=`bpF25xr&ME(VF4f!MgtXzJ}h$-o~D$ZrYl=
z*cR~RR~6QkHB>fvn!UbSUtLQBwnm|j3SUWuucdOXc*QeaH{LkXG}t`g>+|)t^tSY1
z>u&9C?P6P?l{QHtUgUFnj9L0`a4V1X1<2+13&l(N^VUt*zKq43+5D-(@shE!5%+NU
zP~~9NKy|;T&(l}a>+SL873P-}RhD_my_L1q^`3^B25)0+V_j2yQ+;y-w#Md0UlX>A
zX&VJAW%HF&o(b<*?MU5F{b0jD<3MA7Qy;e8<{oU_zHXl+zb38{<0g;%`Zjk5YoU~n
ze7xM}AHU_kro64YXgX%yNZyySoW77bn=_p^nLklDRy0~NTsl-X<Q^&?EYHi&D=sW6
zE-$NeS69?jdaG+awKa9#dT)JgLv2G{V|^n?V!Px#l6|0HrDVZ9TQOBN=^3vX^^VjI
z*A3MT)(_P8H}p4R14+q-wIG*R0cVx`T#OI=kb4j7p}dbQVY&MIJ@+a1y6UF(lHrtj
z%ern~bFQQ;rp;%}W=-cz<xb{}7mO8-7L63=<>eI?6c?41xJxVCRpr$co~oK^Z?%_E
z<T=*Y)z>v#PCk*ok+W8?QoL9?S3W}wt0z2THKR2nBFDM`c!?Oz4!zyVH6jyMChl1u
z>Gf0YTigvK!Yg^CqdD2}_usgma$n`{t8Qp7>(3aEnGahJ+Sig-T+69MaVB#*dn%ha
z;#H7eR9I3}3WDYCit?(8>Wb<rPc@@h<E^RnGK!ZSr&71F*7HEHXt89zY?c^SPE<`)
zk9)>EBSevLlx$WTawmGYLM{b&T#xYjIrly8RjhOD<#He4X!-p9$o&%E)LV)dRL^KG
zC!95$Fl{9rvL3XpIaZy^uBFt4w7K+|jOi@!%g=`b3rmUxg6@h6o?$i5(BrM~dc9Zd
zXHt%49LheBx36Fo1Q$x@%4T?mL=g-}JtN>K*{C*-EZB`G;*ggyJMxUb;K+iXLKJD`
zG9q`$`c1C={)ziL?)y;SYs%+U*ELsk=k%wH#}W@GZSV|}SDec!i>dQzj9)=PVPSDm
zDI@51S3rf8#IUlus=8WG;uZT@*YWhttc{!ld3!-{33dz>y2BWfJ$ob@LHruHDkv}=
zUQqgmW7`Qo=YGO{#Jzw?z~lHDQn&4u{#EY%+kbNZfGd5>z0KWM-c~&gg6H+8NrmP^
zV0gf`W?yxHp$jQ>@?pV+1x1C$#k>aHZrCwY7-9&DS8V5;CsL0<fd_N;=L-Z&7RqMJ
zX25W|a<X!QHtFtd8-&xu)o^7nz!Y@mgc<wNE$2V+Irka&3HJcQ5c^@n4<E|q^7}LQ
zJMQP)cc8%6759|4K=7*We8OqNN#jw|CK#>@48e~qm{(wkAXHdZQ69z+{E|<m98KTM
zgaUK-=dbb_q-at$6UI=ILp%62Ba>nU6qt#ebBUZ*@|ds>{F3___g(H??m0FMy#px)
zAFc?HTz>z>{S!Rk54dk|?{KdwUQpg7g4zoSXJE(2O<ReF%^Qqih@YUqk`nmA5_cI?
zSVk%o7+$fRPd=4$Jncy4p{#WfTniDTXcA-?*3{@fwj&d&kE=%}#Zu&mPDAre4@2;m
zP{sGTuW>J780;VtAvfb|kWAVZ_Ls=z_ZRM8xWDCo46T2adlNQ%7X+Uog1WP0$D~4H
zXc76rg5d+14TlkAcFY)FwOvR)<2sRcB>iwEq6oa;zWmkv)k65e;>8j{h0tNxpBnu~
zo?injxE$FOvth$lL}J<Zbid>%f_#H}9le;7+!EIZFPMjiNOE8D`TZUdgtYz<)c!#6
zqVkUNhMEXo)SXQ@tv_izX4*0xHV65U1ydA(4HuQ*g&rX^$VU?p<oTUUJ({+ad5CP7
z;s_(SOa%F80u@TOgZQ<e4_J#BLJG{l-G~KapE3#l3jBVIp2a)dU5w;!AQ^loq?|4L
zu2?R=|K|P`5#$%J;BRpsaBp$1zzg11J;MlI(w!qaW-8?Q@g5KaDT<I63@6B<$yM7$
z$64pelw-UCvkrnF;z&3_M3XtW{F)#e4|*|0Tn=p5f#&NuwIq9d;8z?);P0SKFC$06
zNp6)JfkUl;D~&pJea9)~^ZPgMxA1^J#<fU+Z*ceF1)mEM)PW#W$oTOQL{MN9f>0sj
zhX?`-<`qabOmQTfAXK>H58nO;>5n1^S#SkzBl6Mo6nMczc?5~yPjPScby)Con2m54
ziOu^E&`Z%Bl5<NVpWnYD0{;!7{iobF@ol}$y#^b;i%3iwyrzW)$rI{B{3r(V3WN<8
zGaC+BvHPm^qWx_0=@hbH^kVoZ!pD&af>2U6Zcm$FqpcvtZ|#_NCT_EMe-K+Q^14Uc
zzn%BC-+e}%3!bwc-vC`<$2EF%Nl8$(7ak(&C*<<`AMQU8fq#u?^#ktP__p3f6nPmo
z{5%L==Lu@h!xIMiQ6FZf0)qsvS})npA%ZXqW(s6cM2sW(tA)5T#)jr<K(G?_$+sY~
zm83n&r0wTA@DsIg`~Gdbw*PvJD^@CP2)b3m6?~EBxP~uEqk&f~gMXmEp$XAmyb5T`
z=l4hMpSizh-uAobM}Gj1`#Q7XJBY+Q!OI}HonJ7Dh+dEpEQS7v-&yb@3uX%Bqlg$s
z1cI!+R2iTEDxnrMpasongV>5e{7wr8tH3zg{_ViF{r(;bVdQaoBhHZzMpaN6U4uB$
zRa(Bz5kd55^em?~eWC`js$Td+v)uc)|KW(=KfnY26nEU8a3A8X@h$WOUxF9Bt$a2}
zP~=AuB&a|>ibN2+#`8l2IhhK6=tq+cQxriQ;Uh76geyhC=33#`3K1d1zh7+QAhcJ4
z);VMB51t_U51%4#%;MI?oS-zrZ{{E~JYB=XID*m-;Hi#)U?v50qMVJtq7<IC8XUn|
zF2BEm-@hP&{0e;-vS40;Z@`Ah3qGg3ArKV!Q3Mezm{%Z+BE`j}EE1zfNc_$xpG6;<
zw_sj@f(?st1O&rcorw0)UtcEt^|eeXy|COWw2OngkG4PT1krx{F_dr-g;{bqn^tE+
z&xg)YfN!K6dIlpJ-@1iD8SQwJjr1E4-O~Y_D7v{yZ(d1WVO{}lTJ!17OKd)tO0tdK
zVcR>J&BQTVS8}VXF{Li8CcQenDx(5hd8RwFEUPT56k7>4@j$76SM1%crZg{|CDYB0
zFUc-uTTymVPEk&wutl^v{BJL3Dyf0`TG3K|{4d)6h!gDm<!9(m+(W<54}uPj(V$L*
zv8E!kc|M{>DOXN+CvSl80tlAkYr+795_dRu_L)4cu%V=)uq3}Izc9Z*peT?enqjZc
zxfc|t)T;?o#=*p%B)_%I=Ce0B8j|aswN9_ACdHFdomz!WikrCqn040LZ|_L<IU8KH
zDK)9pX_aXe>E-F=8Sad-jIzv9Y$e#XwOEh4ixkkxW36*TiIL-fJNIup!S-J%9=?r7
z$cqRHIh<2%PSBZP$;2m<R7Sn)uW`5GC8@@3IRZ%%bu86jXyaU5Mq_z(iMyz@u(+V8
z01RnkS|p0$B!3H?KgXR@uB(@|QwgJn0aH(6z}%74W@)whY)!UCdxN7cxy~u_k^1+v
zb<H|u9kd1PdmO&xCTG2?*5wtx1(3v6p6*U}?;Mg#Km(G|N6uypOTaI3{BQgI?ZCGE
z{yF%3h-V<jFn-klf-bd1XE10IaTNzKq~77TxYrOxHc<4Y9a4u*aeJfSaF=c3tj$$5
z?#dE(aS7e5QirX8F$71v!btuBQR7YSCaOqnp|H`SW=1!rA2#$Gdr^F$)4V6CJ*hRR
z1tg`oiTlr44_TM3lhz??kFC?*Zf|ikB{w?jopmlj%_&tWRjHL}+nP(i>fq_@P+<m|
zlF#@>j{imOk8*<Ww`lx|_|W*(AOdR+m!h(2O?sV4WrjCQ7RP7qU}5PR#uwV*6!UR$
zGe1b7^VQT=RF##Nlogj2fnXt2xSb*+`EQ6DA9AnYF8d;$qHn17GKLenQT-rFR}jU3
z$)DJfB=V8^_q_E86u4lWK$71+TbHeaF>D6K27w|tx;!c2Z4Lg+zIiyKG}u_Q@xSo>
z!Gogz!1H+rwsi_N+>M}`!=<Y1T62Qls7X|q`SBTQ`O_HXU*?8Eum;W}17D#LnN?b9
z>#98Em5d;6QyD{^Ae0!U#XlnQeg}QN7rAHPO;2Ki!2uNQS<uXCCs8_Ogr{iiHc4?4
z_g}Ogx2{`PtaFTDzqK1G?69}lTOB?}lcSNR=&W@`Z#BMrBh=x55TcL&h4#ldf%vvK
zesv!Tq%z=q5$05tLz9$XH0Vq;KJz2oalHi_zl7|C3)~Pc=|zCf#usSjTI%XOHI-Eu
z-gIM#1O$tj3Tf<w*P%d>#_xXs1-_1Pit9KNO%hs1M%fi&sF_0XnNeg$9W;o1r2f5V
zJ!w4*1uj`<uu^*1+HdW(1#Eu%9%5+sIiN&FQEcaqe>HGINf1G-@jr3@jsqj#;K#2%
zMvT3QoRRyOg;s)1nlf2q(V6u8_`<ij_t8&$2A?>H-r@i*TFVB!Q@NJ<h8k~Gbwy=3
zLEe%h82%Z3XY!=);c3+!5X8K7<XAkUJcz~S%gD<MhS~`zQHq<m|C05Tb<29tx(XYf
z2Eh^QfVJ1!1%@3F3?o|`@tX#{=0t&CwDCWFf5ag>f8ockKEeIyJt*)9Y<Lg>se%!7
zs;%14_`(}#_f^>N8dTbk%hn+p=W{I$jo#X7Pi0kkMOk@BgSg=*9kLhG;h(vGgpz5T
z;w|nz;`!4ccoq|dwxGiOs#Wz8uS1Q<N9x~8)-%?l)(t3d**Y%}9JKZ!ngnc}%o{Qt
z#ve>qgJ8VzKiV(x;`w|Rqr$Ip&mu;w<5_SoZr#d3FkR_X{|a|ppP>)<F?`p3c>nXr
zKe~z%eto!bJvg^EHrCd8m;%cy+!*2%tQdA2vSiruf8p-!R~V`N7L@!tyx=VmL`_i?
zt3AS0xCU=X45he<`!8G1!UN(dvUM+Pcn*CT(jX(qROp8{Y=<pNG+!<7i#7fyIR;4l
zK11K)BlIrrqL*@<6$0<V?JYeLO;-}XpP}8K;Ew+R6!;ZPFWbO4^@KpMwXvzLzQzj$
zR#)Khgh;RigT_QKXvzP@{T}1@KSd1rI=+e5p!QpMYDxr8!HzLqO}SsW58hDZBlYiP
z>pANQSnxU&xME#IZ+Z&ZK8PSZA)*PXki4M;KaTNxtnokM^8-YPcLjYNh7FG(j?}}$
zWkWL*VSdEg{T7~*JPY4{Tqrf%i;K55HPzR9z121311n&|Zg;60k9_dr2?`AV0)Btb
zp0#`o1>(LJ-_}h=5Op!hj(HU-rMQXviQh?B@FD8~roef`5vqebf*#=j*)gxehk2AE
ziTeNI13bT4C=idgEAWVso>7Pdal0?P3je~!??1;k!H@nuz<2R{kl-0+$D7DSEb@{1
z_p<doBFHi8CR5-l6ganoU^n`O{}*lE&2zRkHP<)P)<J=uYNkN4;V^=%M+1oe&C&S%
zukftkQ#SU3@n?9!+t4#v@dZTolVryVDQ@EaE8uq;9&igWgcL|NJcl?E(jc?rN1j);
zn~dU}tF^hgp|K7Y?5W`u2rmd54kyU$_`kV-M+E*`<|{sd4}1?c{0h8ah~QcD;Ez89
zzYB=KCvX=+F=P$>m?iXNX2S@QC+z$G2S4(FWWi8i6|><8g6LJ37W3oxf5&>`H2(Zm
zj9NT^$9(~JJ~x;KFQPwliWBuE_3src@gono$rQMcDR7>}kqPDpQEVOr<9+vEhJCHg
zzD9~5wG=}{1@c}nq(KlQel&hhPy4=yUd-1KMc#xBzl1pQ9D3$t#TOBcrMQXviQif9
zI|d8JJT543FM5K!7o5U99Yqo%_+?-e?{5G<iooOnJwXNXJ(+MTE}`-Jf8l-u5BM3P
z{dbrG-(fcVB6>x)1cH|yg5O2(I|UC&7L3_5R#G6@FnPfc!FYfBmtkM4&xZ(tDWS}Q
z5kt_AVK3ebx-oTy`ZD16C-h)`3lB)Wp-%+`z6Be88D5YGUS}E<^(FQ173)RxU{1pW
z@)jIYAlWbx9A_H*GBAqwHxR$3`Uc*DDTeSqQ0&PB`_shl4|oFnYxcbPd+5b{13r+R
zV!jH3FM!}pMo@~IxIe^?x8QZ_ekgDiHoSoQIX;dM!FYfBmto)Qu*ZLZW&Rud+n>1q
z<o?Y47q-7}|IN1l;rQ*Z-2aog1#oxrIdu95obiu1<DY|P<j*Od@kR9Oe~)X@^=Oay
z{$DwM`(KXVz6^}w{S6z6JBS3|K~(r0+WHL?_D`Vsd)EGcNBjQBwm-?F_y4#;%_h|N
zA!5wW;4A+Q-_1WF?))Cxzk(>8k<R%i?kVMm7&-r2kp35TE&kpA4EqNE!nnr!+rJF^
zru8S4HxWnQ#~tAJ(4wC~7k`6x{{}51mj8^6c*@mZe&U$o6e9AQVE7$GnxCS_^DA)r
zdyxDsSkf8kd|%N%Kn(gJ_WmvQ{EjjH0~m)0lQsP3mjO}SzhT6BKzCO4jN%?*=?Cy>
z-@#bTk6_ck07q)KNK!7pJX@D~75$i-=v}`9hTlT0`5`F&9Qx+Z_xJ3(xu$<f`7ZYU
z0N45@<4bgTw#1RNN-V$ZuL0MRX-j)vbzSj1TKpCmeuP$imr<lv^Bn(1w*K;xTN9>~
z>li1yf&07Hai8*aoR9xjegbv>Lg0AKa990?;;Y#AefW-_gYB=GMt{dNN(v<(@~>ZZ
zd|Y3~jCH@^xb}kTY2|J7h9BU*;sdn#n`rZ=j3TxA=d$_bx*Cmx>Se_idehIKZ~ZF1
zllO7Hj}fsyh1z}uTA$&3*NnF{uP8~Op9p?~^!e+c-bG9QvIFeNnM__WAJU)HTm->e
zu;W)?$?t;Uhpf%tMeE3-W!rB`U1E=RO0{2c90aeUUvnQSd`DpT36%XkM)8{IIqiMb
z8;bX#cOv*BX5-{Dc!uOHNsC|hS9kts%7S&jaVz1p<`M|rgdM*G)x0S%{03S_%8|=2
zH?1bgpD?OkRBm84=Oqxl4Lg2K_)bWNWX+#)*Aj2(?x|l>z6~4x2CncucuC&ADSi<{
zKH^kiWy>4b&-+la{?FkrY5Nu1e8yw&5udWiM?N-k@2@2BzlmoacVQzR;aeqdwfzh|
zx@)i=ii7;#@N3gw&{a5`K4V`sZy1j0&N70}D(@&>M4$Kp&PqD`8uN&9`Q@coTiXl+
zni<tz#U{oRE@Pbk7Vc2!d~c(rUuCxZE%19b;VvxrO~reH(#g7c+a^mE8FF~YwgMHu
z2kt!M&zL{?h0s==t5|sqXXm5+kKr50+lYIADar2$o@Hzz+P#ieMmv8}QaeWV-)4UL
z$DqicJw&;yWGHjWxn$jM+SH%WoddzAlsA;O6)(W<US*c|9%Cq%Uv7Gp&1dS-jX{A2
z6h|>G$P>H^Rg((eh9wh2@Y6pJ3w{6v^8S|hkNn@rM;T&BFAY|h9zi^M6%@(ZDZ2d-
ziu(!L!<8#-d3dIH5#Q0<pvcEQ+WS++QVQT*&?~{*jpMP-U!>8r<LbBYw|`AIJNa<_
z>_6uMWdk`ADf9NdN$bWV`coiynGw9DxXV=d8Z7x8{M~Z-<z-aZn-T;1Va<$sRk^M>
zhEc66xHG<q(drlBTOY9Vz0X}U-_YODzNmgx`KIz+*ziX}#0o_iKJxSJSK^8F4BGe<
z`sNQ{6<>qR^L_x2DGK$*Ts%?Sz;*86J9-CuKW3E4(@MSG!e2IyNl8ty&hO<iY<SMG
z58u#joc&#VN8e-~jlRPlah>k|ys@;I<Yntx;vvJ)gwxvdn#<~^RL>}HDxXI*xzF1C
zmTZ2x85NEObBCc{H=$lot|<<o)#n6)x1o{yOoeX|Kf`U^J<ZFi*P+1o1usFK@dwNk
zlBJLe#g_sq>4hbqz%$tApk<0lq+2{{#OyzFLJpqzEMgWA?Y+;uE&rW}$H)5jBb@0t
z=qY2JzY&j-^qC;IfU{r2H$=TWzDMy1Jb7n%U;b#uv}?h(H|d~h({L=|Oc=pCiWd|w
z!MnUFn_pf=d2)SHo3U3ns+m=<C=a41b3!2aEWW`P&~{ScHS;ru=X7^9_f@Yk1yZ~h
zHN{&eS;)^Lnw^WOe&d+ua{@l$2E6{uXgiyIm9I40v^n^Z_MQU6oA{>q7#MMaD8Kvw
z%IDYrDllCw%K2$^Y^nycC)@Gdd;=|c8Vp~?H^hH`ot3?X!&#H5bB-13n)#4%3tsSS
zkl?fA31P?gW%J9;bUSM;t)?#hpmq`p+zT5%f;OFJ8hjS&xC=XenfMuRCEV4%q<)1d
z@IBaYC_0P&?Xzty&cIXuUaT-UD8%;XI5e3QlA<#?G&z`oHV73ShS$A{Co^~PcT2s(
z*Z(RoD`?00X?9#bXH3HE$#y*TU4vRLp=V3|GK#SNs@|fZobj|-=OV8_<^|7cFKDj7
zimxl5#oXMxihBy#{x7r4>9zVyov`3BC~#T1UvUUt@U%ei1_<7PI<A?ogWvPo7h%B<
zpul$o8y5RFym$G@j^?Lg23`kdsVqSYC*hO8k1RNcvltCdbq?mGbVDcm;PKDF+HUMP
ze)}8S*Z)R2KYc%#AMC`Ogmzqc84-uRpBre$OPs&Dr(`g93>G}^ScL*NOk1$w)4Fp(
zf;W`6;0fjO%grit)!3TMdknq05zQ28U9Q50H}S2V!BaYlB+nq0-Qlh!T{qs;-_gCm
zEcgxOTZ(rPhd*R-g#5~<=uJ>8+4*up3C(!=KZSV|hdIn&^C7qtb1CXXbAk&5X-e-X
zRJe|7O4o1Qx&C+C`Dt}*6&|D;6QIsY5FCalJjl-d6=>Y==_wt^8_k?bolRbb0uLk}
zh7BK2I1@qercyq?(iD%aF{zy_ST_y@E-BYw!+fpZ5JB)W-PGUKy}&G3P#|oWyx=zx
zJw<{$`B9S1LKsyiW(mv@KXLx4O=q&`QdBvZ;n)s4p218E@RfK)TmQT5{62gog_tX+
zOoGhm+(YOuQT5`}uw=idyR5%pBzuA^*iH&uH*La(PwIGrSD6*xcqo3=_J*WZQven`
z3I#5pA44{L1on0o7DpPqPWm%F3x0Q*1;3_zL-{uPF%*Zt9zigA^KF>CTMg6g<N4A2
z(*iD4X*L+F+Ef)Sw`qZ&oPY|YUgG@H$o0S5&R+}3r*kGXAm#&RJ7%y=L;V|2p}(fv
z-B&o0J)S<3f*4}kM+$@uA4OjfaYTDj!xNORzmn7{M_p2j$qx%2)=n@5?jsw97d#at
zcrEE^)3b=c&uj0&f?q`pAs_gjV8b5=H5jY?REMYnrp4x;HX=rrV9vZ<lW5Z08NmwJ
zco!5Z^%Cb#ZeRZ!cK#MN(JB|RR~s;4lNNKeqrW)`AAErGd%Mbei-vR2he_oXxDN_E
zY&>E(u0I_{@S*r6*I9ju9mZ~0@EB9z3T$|TdBIcg>zANGfgd8sJq^X+2ciO*4S&S$
zlfMlsj{Qo)<lR!5KSUnN%O>Nca!ERqIl-YxQ)c6hMtDN0SM2q_!ue@_a3jQCh}q;O
z1!kOP;H#>|W&5D-1#T(ri1TFf8OJ&MdHV(XMcYN&CAM9*UABg{E7o5sKh!?Zzht;;
zylr|e@mAuqiO-sEV0#AJ^(20KI_c>oFP=gk!*jKBn4fVG+a+w`d9GMP+cm4aEnfSn
zk`o$2D;VlvvPA&uP?`0{q=aNf&<){8z2dF^70%D+S15(PZJbsM8D=9O*D-=)+-Si{
z_QCYc)T1fKU8kI<lFwjG(Ang3j&qLlj3}65yJY`8?kGQ2zN>yi^NRMq?q0%O{qy=e
zhTDecjJL4eG~F~k%eH)bH=drY;hFdmJlj2qC%k9yocA2#c^=QqX(O)k=v|X|{)QE%
z;q!NMp%IHF(P*&foa%IFP<r_=KR-9t`d@Z_>LGbC9WW2;Jy1`L7IEc5-SxP1FV|l-
zQ?Qb=KWii7aQYFVm~zr}5+qLxB#9>R{11$=WA>!ttIBs&Z>V1*irN=-_jGp??k0ew
z{<a=0K{L<Qgjp9;cn-OSr{ae(yWl7|o?s+TgC)@<p1Z>j>JsOl()xXtK|*2HnM``C
zmM18!KZ*6fA$=FZbgi6D1shI99I4<5R!)@67p&y1WgpBs1cqB!4S6*6c*+T^OA9CY
zXN+g^tIt1FypLG<rs{$ERn5zq``VYl@da#mb<cxjUYgh5fq55`c#exZ8qCbt5GWoK
zIFcq$?;1a;@i{4eQtNxAj_>x2x~Gd5KyY8qf$Vjxg5((z#bZ20M)JS#)a-9@NBVvA
z%|23mRq-BU_`2#f^(!Jp?LF;1U0!-svd<R4EXWDW#8|=;abma*d)|Z=k3fw)$KBGO
z)cBlK`=!?RN*&+j9jcrtoh@1f!M(XyJ({(_7;e(4T&6{y;$QGIhgPrB?4eK4L;Fzi
zf%08};RBJP<|Qr0Uz`p0J(xc_41zOI;WB0htzmx4I#Z%Zakuz!6614H?U!2LD|LLJ
zcCc!!e7a;F1XuI-=I)0I57JuP%tIL>L((G6pZpyZ_;bV&n!WKc_jRc7eV(E64HX!w
zUWFF(GTp9PhtJlDxqHKy!!nK87)!8Zo*`NDA=vZguGud<d))CksrE~)@0B_}P~Yzv
zt(YvEDOnH*u3-i9fh@9QVu+~1bSTbWqxHW(Vg$d5d)2QgK7b{^qkId!-#1jRgCg#K
zQ)-f%?CrKL#1Wq09Ok8wC5sG6iMyshiSaq9_Dikrl{&t&zOQDuYP@{9bhdb*XbBoz
zB~M5yWR}b`+)AVQYc&6gpFi|N+~0p2_Xr=sj-f)uyI_czf<ACwR%vRrv)<lf>#%l%
zAS!|*8qY$7i%{Vz<_M83AKX2D6614H?U!2LD|LKlL$7zJdW;lUI#;|HPB3K2P$ED7
z>L2k`>{qbipI~g{Q`{YV3!d=nOoe30#4tCjB&`w_ObYD8N`wJ{;0&S(sgNw0819<>
zB*y2Y+Ap=fSL*oA#@^aN&q(D&`7~^Jo(SecgS;mUv*iE5T$F#s{EB~o0_mPRMDXLF
z3f}|6yzJt%3YRyz(cWV3KpgHxUuFdJ&?XT}<`7L55l@KWuJPl?Ph*bHNwr^UeXrE<
zosB(p17yJ!6YiNZ*l_V;;WmQIlK%>R|BgP)@9<=n?s+~#91%6h2=Xda<YpJAm#27~
z^^RtHo6T?SLL3nZP78Lt1ctlFFYfr9RQtv8mssn2rH=PEb=URR3|Egrfm5*IS<1R1
z5M0e;v4nZUzhb7|f4~F&BWC}Q0)LJtcs~jf{3f$w@`cbJ_*FaWV8N~SJy2lJHiEng
zca0y7pT-%VlWIRd{t|nAuhjA2SKsd)f(7#mWHt<f%gl;}SOS87#XQIVKp*g*5JUbB
zPq#m3HvB{Mr^yd~f>`nmvSThcyC}n*S_KPkaQL7=vf&=Yks-l~r<fhj@1Fi>{4~z^
zoK*Xz*7r&s?{Dg=?;{UbJ;p1LdNNFdB0<(CWc>aJ4@eeF3gm70C%7jL5hOd#1HZD=
zN>>dmm{(vYY?$H*PjCX!nCy5L^%r-1POAO<_(AOTy;8^fo4Xo%$$~ww;7am=WWx~z
zSuFWq%m)1v`p{&-{~#!kZ1~692<GO1Us?q`V4b7UE-J7aHXKQC7x~2<pOb1ozkZe0
z<wjrMD|NiUk1V)qTvQ<SWS9n5cq>MqkmvXB;71nx_ksd{CJ_9dV8!3!a&rnZOTn+2
zS#UFaV7tvPc)@UjyT*?nKdr$~YSi&LsrE~)@0B_p{1Abe2P6xI0@07*domFOi62Ga
zKcEluJ3)cuJ3d2ChT;fM@Y@2v@)USLX2DP(`Y~j~<OK(DM?$@s@!hjuKL0_?@j0pX
zORevfI^ORKPy{9qNWDN&fqYMf2o~@f{9nwG|4&2^@__#sRN&``!;u6@e<kT|@T(yU
zMlVKGU^l#Ah~O^jPaJ=aIzA`WeyR1nQpYoX_5I8Pj#PyyP_$thnSm7(zdz$1jy&K$
zgWqqM4<rTtjP(UY4GR1aK|Ib{#Na0Kf#}EhaX(C6P$al({G>A+$3?($TE+cD{GD>^
ze-v-jjMeuyq9_8@_28>|w1mp6P#Q9}b(o#S`JS@6%0^F<w;2^IeDzep0$Wq7SigcQ
zSZr`L#r2-XIuvAR@-?^ku;JB8Tf{5=Uyr?yS-|5l%ZJF$f1g)VQeWO!)m%eNT8IhN
z^&lqURD7*SExWFtwvM<4GWv3Q^1BPWN&;m8cV~HLWoMPY+V8<u<M;a077Itp`YU@q
z-QKR+Kz*R0v!S!Gv(eu~TSqh7d>y_HNq&!6z~eE?hsftwQdm(^?XIb;t*Z0XhgJ0u
z>w5V3x*iC~ca$e}2TY6BndFI-(e&ZWp`3xd{`|he-lE=;p3<JOZg+Qi7s^+x6wH)N
zxJN67ss=s%HGSUR+TOaJy6*b!`mTnqMr=)irhp{B$1LFSnB_w_x%EF#7p0`gT~b+A
zRqm;*siL|bbv3B!L3KSsRXysP5SriTZmP~|Hx2vEE7k@3jB_ewB5f>VG;26#D0eV#
zprF67uc$ZqWctCJm4f-=>C%bvv5L{E;p!pJV9h{HzqhZpudWvxV=2k+F$;Lm@XceC
z50S4ws!CB>>@GoN6sk#4Db)24>rsFs7ID@$AUuBusb!y0o!1`MA2RMYuUeMubICKV
z>D0-z@r<#|(d?1zk=$X&nbgg!{dp?|3&pdg(`A$8;}v6-BcM3!A&&iCY_$k$lKf}@
z^f3x}Q0MzG%7@73Csw5>1HlS+h#~5Fu!<ByT@M84pFn~46xUT3G$#|b3>&8XNqa5J
zwgtzWa~2e*(k9Z!Ge+&_P`+X#dkq8^i{?sZ7{iH*@ybz99PtoE#!-@A_yj)55AXay
zWct9J`=#EIo4*sO><TEOJ*rYrCUL3;P)U_hDuWd%c#7yEevZL}H@N4OPpd9!PV0{8
zH;wCwYhbuyTXf7P&pKyQW>Tl^=TW}mFcO~bV+2WsGiB56iE?5%RykTVQjHF&XISd*
zM+fS$3V4L3Z#`1^5c&F}x-wK31zBKOE_Sgd1<#NuQk5PrZjyhEo4F4#N%uAgUecV^
zozNdK95Nj=6T?;8l6{esXuse@;#ex6lCu|SP8W+7ilIW>`b-f+bmWO5lqkt>=L9~<
zL&85m{36o_hVPaBS#JJfr1BK2Qcym7KAXK*lOn{h1~)0c!Jxu7Ftzn1#VtngobFV@
zQNyM&#BjyFi1GoaP`=`DCKX@U3xZ2Ui^cOyg&~F$m1B}Eh&F*w@}STMqNNWE?UnxN
zVfh8CQt$+M6^1JGRO2S)ceo{?2_+BU1#j{MwWkw~>9>rVrj5h{(BVGl@M7{Al&?VX
zfGi{|4ibbNGZh9IO13~8K#rWiCwZ9o0si*+k;{j`e(5HH|9|M;(2a%o6B6_C8J$B_
zDL}B4a!gQt9!i<OsvKdM^`GDYzmK87w_(H2V*>S4s>_=5+B3|Kw~U8P>tMLgb}9L6
z%1Kr{U;{S1mPZd=mI`UuXCBdnEP0x2S!`#_fB6CAM=szIKK<ws%7;7@Ki-0o&zV)F
z;Qb&Y2o*AdU<iKyg5l{OVMzJ|M3I*iD088FN=*dM!j5?rf+6v9iRCLOZT7w}f_yY#
z3}bQ=cf_8+Cw?OCzdXNa=>y{)7ThUUe=L<T^MHI!0Hk!Hv>a4pi!><OaRr9Cs8Gmf
zSY$;>@?Yg%RNO{#ZIR&VgyV=NP@(OT<D8Qfpvgc171%HcvcX1PgNs5mnF;=(v3`g*
zfiKoEVegC6w;r*4h;03d{HUq|UZJ`uB0<59!S4@<AisbGQ<(-789?IbTgqn`!3)}R
zm{xsKe^h^j_#uK&@_S0%zRt>5>}3R(nH3AsB&?~?e~dqYFZwZI@ACX&r4J1IV2mFh
zQh(bN$f{C=2ttJz=Hd&5P|Cayp+F>gQlQKQBX~u_6Fj9qZo7>1?PtQuSFi#q6i2X$
z1ktC7@n?tL4h_IRQUMR*_ZaB|A5wqJ1MaL!5hCaYKPnXRD~u!XiP}(P05<$QmeY|2
z89{i$(^ld~<rjnkG?^4dsB{4F3lUtz%=-8XfTb4jhDb;K$>_tfwm1ujbv;;y6K4VN
zL+LLVgM&6KRs;wV1V1ViLKC&Vk0~;wz}I2J_Yg;(RbCeeB9?&P1;#H_zJf2lz(?ZM
z{1x7c(dhUKfTb4jMx_>d)1^}ugJ8@B#HgMUcLDH2^CJu96&P8S0tBf-6cXY55&XnN
zZQp<oq;#Js_M>=CAb3@CQG1@}hX}$7&`=RBA&Q8R7zA-w!Y|;6w*Xjj0dI&^w4cg3
zn-VjdV=f>rqUtGe768k>|78IL_ABrUQy@|-vKJ%B`28~?2tU>GTUZeD9-_!A=*gg5
z68bWvK@bGLi<nrA0zPEHX<JmnhmRus{Rp#S5ER-RdjYWI0^VrULhoVuFlW?bLU@b?
z#F$>lVcvbr1;BFoiFHwkUkn8j#4^TDEEGZ({1H<ir5SsX#Swv^zz-3GFF+$GP$-}x
z#1UvP*wR=FfF&332G0=n2ey2eOE7aW77(-Pg&bB!#8?3A{7~@=JP!IjqGl)o7N0ta
zQeXwsQfc17U4?X7tN)R^&{Zif{nh_QG&<S>UC9N!5og%(3qSqMtkl{-Fv<dAHocHi
z2*g+boPQdLfWC%U`BPXNrK=JXV2SCfgalZLn@EWuoz_ZrqAabv^jH7At-%=>E{<FP
z96tURbph|TGi>{npZ=D@nRFTl2<})wEKDz?C>C`AaM@n#dEIT*>sU4=rUIgLN<@-R
zfF&fXvW=<bq_z5kT>a@EDCMNT`t!Eta#(s1xd3?k_}}*ZJ5CV!MsWEsXE3UrAh>M-
zad3KJj0M1zbFNL(MeR-1eZ^ZyjPWsgz2C?96D1$uliBd;s(zQ;m9VOmPHXi$x%><-
zsozydr@#7RWNX=PkqdNpj{imOk8*<Ww}Q)uxkPn>S?vPBhy}#K<-Z(C_e5C$?3u{e
zXFYDXq`jefLHQcSOemcVr3m=}C{h9fApw@F)i|YIS4gF``n6ns#=Dv~lp)Q>N`Lil
z;cX569KJv|+W24i{@_8;e+Z2~+hD^wi;8M+g)JZsn_fsZykh~dcQj`)dBc2Ce_8X4
z>UrgT1)tc0l8EuiFodKrDUFGP+GW+2B7Rz{&*kzn-PXRMdPgCZ{_6M9Ta6xtxBxi%
z_+My$j1!1&i{sB&KFpbw27}hBO2q=f-~wXWFHA3_sz*B(0M`!Z&!+6NY#GlaT-IDy
z-B#XHh{>BMiP*QHL|RDy<FvY@o`fm&er5c$RzH=?FY!6uef1m4_Z9KeU;VS#&K>`X
z3v^?R|B3r|92ofqzdTh~KFlR*^d_yH5fl~>6F;i6z?T1Vs5!E20dU=5(Ny}fecgP_
za8`Fk^R()g@&%|+Ox{E(F<1hubg#v47}YGQHWcyFT76G8zgr3SG}7s>ejC}^h~M}H
zy0ORqgay12$L#oJ+wx)9aDrLmP*Qy=wtzTd`7ejEQQH;(*Y}r<XD&F`EQd`e^ye5s
zB+gO3sGu}#Z{zM&NX(L6ZEG_P=w{S=m79v#X|293o8PngyRhIlmG8lZ#pHH7(_j5&
zM}x!Ppar_|#{bv?-tYr<{!HU9Vata(v&yJXNYW%Tf>=PjZF(V9-HKQM+|XAxmNT2O
zVn3L)Wjv)nue%BjB83?2_`X6+-UNmjRd!!ukA6(Es6L=Ps)&`=>Kk(TnOXWPHPT-x
zKTz<Vf>IA{Pk;4yJ6jy_P9peG1dg+SH{$a<ej$Eh`ERy-7&dG$>MUxfGMyGm@0eb=
zZ2@p&uX`kKI&Co-OP&*t8crvi*Iv~;rM>}A_ySZ&DSqC<2*7(t%hY5J7>1$1cxkP^
zDx2R8!yVm=npdH~c<HbHW=Dg!zp)GBe^kCYxO|v1D@;0gK`T!%q)%abA%~({VGDp8
zdn<<XCo|?<E4G74uwf9qzz9C0LgE~z!dGF*Z)H>@Hzai!k=9DHpk7lRLVCn#X|3LU
z2!6rzSBN5fk~}`*QzAey{greAy6rr_;NP2iDu)UuvSw449c$JN^Oo_X{)|o}h{QQc
zB*20SA7qw0>n&}jUj3++rL|ISDWavddQ&#PXN=DwebasQYfOP3zzar8fAxzm7{^0j
zuBp3fuxKm?7VKPw0uPyv!V8|&T@*BU3!aed_&%kzvLdaOVNf>-1+uhOh$Gw6TFFi{
zCZ@mAu=H1mBGJ=d{p_I|^aUT#)LlJLGMYP;G3SB;EgR-d({b4FS=}W@@Ve?I%KO5O
zk(SA8^O=KbtyKGA!#mSjy)0LMNZ)iz|2#ZkF#Q#?;qB?KwkHt#g6T+KuDRROUpkyO
znK_fXl)P#?kaQ?<%XmV6I^i4$UPC0gj%a*a`8+M_wKXN}fd!A~rZn@2A_viv*`C(w
z1^N7tz6t5Cm<1#K6`}~)Fr`rpC2#r>;_;XK)!gOjD;p{p&z??OaIV<bEbHdOuwn9o
zA%ZuQ&t)O4m7S%vf(5g*R)``<YXvVDnHWrFqMcCsD_Af~e}yQrGyPSx1Y%z*qRm}3
zz3!pHv7D*&xs+uGDR3ik({#*miU?{&4c?$-(T-qREA13~AktdFhL7NWB$U?b=0oW3
zStOto(qE~B^jEOq$n;mz2{@lHO<$L{w|uB*1RiiEbup+w*f0p5)t%Q~=B+pjX|0m$
zEG>zCW3PS~y%;1`QzNYvY&blv)pgnYSo$lLzUjVNNPmSW!rO2t{nhuLFdoUv`vTsc
z^1<SfyveNTw0Xo3C=gMEw_%DS0zu;Eth4!&I!xWLV5GHz0)@0z@PcAmtEV1@pYb_F
zkbByfm<5XpBpVK;zxq_N0Z;e?Rr!UOTDtQk7D5;Ih;>h-{^bIi!o+-AUUHr*FC{N6
zFCAM3+cNVq^Rj}Q#cR)ZvHcmr_p|fFEhkUho-iKC%hwgU^NWKO<fD<~2}*Gj_v2n~
zQ!%FQN@z7UCe|f+E!DOvdxfJSx!mb?mbuDON>fTxOHxZLE0%6+liizK<*abIQ_50H
z(@N5c)3Id~(Uw`rZ;}mo!XIcXt1K#`>gQBJKBD%ysJw^dheGqP=2hjCdQjVw;5W3H
ze2GowhNOB+t<`I*v3cy(j;iD;%Q4HWrQh0tGV~3O+T<E%wX4!qky4&gp6X65ODjt&
zO)E_=NiTWAcqA|1ROu-#FD@%W{d1~QOAOf-Ui+K~iu#iJ7f+p7p?Pjyv8<X>k7@>V
zJ^Fy5!`NnOHTld<NsUPjNp+Sw%W2D+Wy&&W4OsVpVUxWc6unLlI99nTQ!22Pr?^wy
zk_~vmA87V?%d4o~B@q;>pNA;2+ULYlikr9}v)6<|^W2tVzj9GEqaM=?>-u%Q`YuDK
zVUMXD99t57mUEUvmSxMN1qI}-owjyc3m7&!>K%2USmX4#h$BcoVLXzT_jzk8tKAhu
zaC-&$u<Gae`63^wfAJhXSZE&UK{ph8m5b^b^@L_rJ19^L82rW#%X!NYC~(0tVL<_T
zED!FmwcA_l&GsfogFumSlx)Bg{y<A@eRWL*>Yp=$sQD|_osOt}ULc>}wnFoWCI^(O
z$_4eTdQvl{8PV|+yDb+j$1UrKB6Ex&3dlo+9X7t`G%1m%So?(WNM63BuEFE2tfuPS
z0>P*i<O`&9C+^4kk8OqKxuc48#hP-3F`UwjLy5z>LGU|iISd6ZS!OKb&>)pt4OsoQ
zJpw~gA}A(d5ZH?Zr!AP}*d}c4n2aKBMxRSZxgeX_Cw6{D3cxJRJBY0pxl*H@U)PhA
zL>XfnV_c!7p`ix#&nr;>oUeCT%qrFf>rNx4U@MT%FG`_#%&$A7K%sf%vT7bWoYG8a
z#w?dCrz~5RgO*j;@H7Yt1>~(LARo?<rk>E_#5$n`#4$vvjpL3fi*3q6Ota)a;RDC;
z&q(oW#lzkvu<O8C={pENmUv?JWV=R_flQztc7-O4_KTmQzk}6}_fYj|R=t}vh-#r?
z_47~#c~M_d|6=_|)I#%!ChLeNt4b6}Cx+m6#&XoM0R=8w<^_U$`SSp?WGa5%YWGq5
z705VRfX89ff(Ep<8Ep_-f!T4#xIClY$>pPsRZt3jrcwA*ixfU6uL)-_!I72G&o3kh
z)U%wObcGr`53U!k(2V-=O|^AYyOt`~Qq4=A;I``Lg@sbw#Qj*m5TnpMheGpE;Tq4-
za@leg9`LZ`fMqXicn*CT(x6a49xC)hhhQiIuwn{XA(FU?f4|5GGoN0@?8n>$vsp<~
z2a4HeJd7i?QtCAb?UitPFpP2jumpjWIW`-AMJb*KSA!#cSFC<KYS((G{yAT_maiDf
zYcLp3L_Sjgf?v!+^H9nrQ{g`43izF~oPY(dLxC%nMf9epEMtryUwE2S2yYl_e<B`T
z(_ddE{Pnd=X&52=5J}DZxg4#{q_eQk6o5!M^h9lxdiBC`Q9uMAi*f#K2?C9nf{>1<
z=|s_uYf$}eRR5*Qp-h4Bf@N+jZ^NrNSV0~=Ln&_JezyK2PN8|&F;gMqcM=wSNKoKB
z;>aW;7%U)9DhzLb_}^ZRR(psKtrfe6RO)^%N8>OU6V*0YS{AIi2rVoRKRoKse)zpg
zXb+i+k(oHeCV2iG2?CW!Vr*yE$OA>8{y8j|DX;=@I7CqNgixW#N9tej`&Z0;i&JPG
zcD$+B0KfByAjd45Oo6LV;M@*^-PW$iM(dG4I0dcEV|3h5B4#*|g<V0a`xY)o?MyJ6
zbY>;BFO#Vx^7x&<MXvvia{j0Z0@Yw>LptaTksmA=3an(+yCVtmp0HSoo48+C{}H#)
zJlV133izFd2i!soAs<LKJcl?E(jc?r?JZ8gB&cMLrdJR{+#Y|<Ecg!2IgY@Qqju>m
z21BAI30fkCQZJg{QU^=S#^?3;aFp|t1;2(kyunSN+f)fB>%cr1g%%90T(EvT#Srp=
zV(n?silM<$EY=eFNd3#!f5<2_4}KRAfluHr<S=?MYv{);p(isNMvxVd-`Rd0Y|Rc8
zW}qSY#P2iKzN_4R1k+kBN0p+r>P=9g$^z0-FX9)u{&(B?#ra)P69nprUt=SAK#C!}
z0;3Qtk<U+Np?QHHdB9Dkz<o@C^DK@`Fh9r&$VY2G|IIt0Lg<Tc-#aL4a|$-x4VRds
zO4ZsE%tn2p#;k~TeE56O*8grhKY75I2?B-s@vQziESOgy-Hp(yz>o$@q;x0l$Bc7X
zh2|N*v$zL21`9sK6o@Pdf)|{^Jsrhj5Daf?=w~f_y#;2JLj0(GUqd;ft0<Q;jW|-x
zWvfy(j)WwG(WDC<6aA;?>whE8{|l@p{U&Cy+(F@pO_W?1f)#t=I<gVA&5`xbQN5dd
z;Ew9uB@e?-R-t+DyNDj(DR@A#;C1*wQXttddBGsT=&h!^5(7M;o%qoV=#MeS<R+d3
z?&C(8kIGi2X_9poJ@Wm>e}&H<3Sa*lJU>4_DQ1E|q5e75kLN9zSD<LaVt@K!_{l0X
z&-k4~1UV73;59@MQXttd5gaEC#%{d^{vN#!e*W4gm}z$p3OoWE9)wG);6PCA)JoKE
z5X(12TmKt8zgS=(YJ$MxvuPKcmy$0#F59orW~1$@xLL2V4avUjE3mZF=<}Y3ZZ6{P
z>k@h({B{K_f*bdu@+s|C^w$j6jMq%p64_?HCT>aB#LaR|-WHo5O{Yb~<mX>~7xT7X
zMLDN)C>u3{!GLCT{j!zmm{BkBBK^_)t7z+gh4cRs5$gL;_&e|sH!x#m3#EG|Q9h-e
zCZ^V`6mDc6$vBaEGUc@Etn-}nT=Mzk3&|H87ae@zk4q^2fz5@o9*B~MU~R|YwN4>+
zo&`<fc^;9JHsX2__Vgk5K>3pTuI9G(Io+*<XZ6qOZy0VEo-saSyl%X1qU~wZ)AHz9
zuElS@n&<Z!#%w>rDzUrhWgJJ@s0ozrX{1gG=I}|pcz&_g|FZK_g!&<7MNmeEJKR@L
zHfjT<dnR~-_0wf51qX6AvbHjgrXNqGvK6ON&bZDvg~A`n=dgijT9X}RJz!s$p#Xoi
z1*<y_+ar>v!IEe)p16BMp%w03#T&|3RQJ{QG<UVnYwzf8>z+%vm2gXc6Wg-}Y`en`
zaQFm>KY)jSSJ1~{*zgGANIh=#vL*0AL=x+NB@zTSj#ti=tQ4%}t!E#~+RQkTel+cP
z8mW=CQ?4`6BqN#bP42LvtOxx23K$+lpM$4(OyJ0qJdNnf3a!AN-T}kc7{wPg#1SNQ
zM3PuOw_E(A#^<E?Nv-ddI=*SdGv!_=CW265_Myzp%p)ina5U{$kRmDabVilaXJ=(S
z5FeM&+g(G%T!%exLW@VBMxNsdj9&!{t#I!v-d4P+e4u<)^)j^hlIBIt3)&ZmqxSjT
z;wLpeC)Iwb^}SNZHxGIzDrZX<L2zH*{+xrbWRy_J+{$3ZKSC5UD_jkZJvLU>1NW&5
zu;jf0LsDXpBFn!KEVKd@zNetV0<SAyQ@s+Tcu#Y0*Z4_{&q=jkYJIQN@xFoDv8t)^
zxzfd=)q=hG`%yv#mW;KyjG<7x0$R*+r_?6<?434N)&u>Ed8lwjU`W<{h^NT%M?@4_
z;XXiVMxNn=5XD{NCow)J)qbh<y;8^f`sznKlNB>%^CcmI6i<jDtQj7W6#&U9ORaH+
zmGuC@IrP=Yl6f5-gf)XA8$Z}qXay>KUtstKtQmz?RIltBKZ)@<srE~)@0B{frMF?I
zh7>qkx=_3X4X);sCp?gIFgt=_R%u$btKLCH6@;=L=;6*ng^N((D()S~AHpLB@87l+
zT7e4RQ@o>Kg;rqA4|a{8#Q2<4`=!?RN*xb=gWl0<QXp(tAh?ea6fK!qb2cmML4{>t
z!$MgP5S$TINR~_t_p|lqQ46gIDijK>>>59b@j1!%WAqB;ePgcgl{~(uaiDg@Gf_EJ
z&TN<nBKH9A2}y^73Q<;}BE{=$WC{$H^}uKy5nO-@mzg)*2Y!FT9E2E!R-i(%WH8({
zelZJp@S`YF?U!2LD|tNl)eXafCn{##vt^X{eyK2$;6d~Z4~3WYuv0Q;p{xf8P78Lt
z1cs~N7q`#~Q{j7xcXy4S#Q2<K`|<pZmG_OYzE|@2?xy~_p_<WZUV+q;5ebT(kW{#y
zU7S&#TJ5S!ZgQ})9_YzL5)@R($tbi!D%?eWamVMR+Ap=fSL*oI?xw!_LGOqM#RH%~
zc)=1PxXd)TnlF0711L+u%6d3iSr1m4Cs@`48st6Uyk!CWWE5KAzPgM2;*QTrwVy^0
z;;rwMI=;25skfdypr}CJ3x){p&7(fy0alixG9|362W&W4)&t|6Q@kC6pR7VFyT(sq
zd`_zU#4q0ZUa8|-yPA6&`r!d7hD=mWQxPQeWWqJb$C9E<R@MU++(?CGNP+0d1j~Ae
z1Q|aWg;sWzALZJ5!~z~t>w9JJ>uH1sgasppR8Ara^F5hxg7Ac)vL3KtR#X8|gfDI+
zl=a{Vaxx38>>|Io<8xB&ms;N|b$n}}nfQ?hBn##hDB3XYiUmKoO8ih5N+|0gD6kVY
z99Gr?BgT^gKgmKXyUH)l_?%SxrPlXK9p4)8Q3NIr$XhV4K)xqK8eE~0g!#k|Wj#<B
z3S~WD!9fMWhQrEwSjHJY8HHAMmEX_tgy%c3VC4M881yBKp&r0<o-vF=w@bC3#{c;B
z=h4^qN*&Mm^`QU=c|fvYQGt9<1_X;3LE@L;POD7uxN2d+vC4XkfuF2GE4#+eCDp?C
z`-|i6mfS11{>SvSp0&VC=XmF^f6(9W@ALQid;H!0F17{yv~|)pc&&J-s@of=M?H!r
zKh>e2Dipqsmf*(M33=gK&rqPdGtfy#`TZTSHpvE5agxs{<DFxK&3uXXiTL~E=3l#!
zzFIt4F<d=>@(3u5zzR$RnmU_0oBg!;#7*DnY|m~hY%6UmZ>wsnZmVhYw$--Pwbf&5
zz}CpNrnWiHNaH|rudk=2yQQnOt2NNtiA~Iw9Qrr@Uyr?mxP4Gcek$e%u3@MjPdOg>
zC70i8n$t;ZuKCQ#+_8e;;=$5>cV9(cWp7olr^nM>1HpL%wVkz{b^1<gTS`l2OI}N1
zOL0qCOL=QWYgKD?n+F@Q<Z0HmEtE}oMrsG^2O9gEdYe$z#)l0oyBW(6P05B$<5^K2
zrdU4W>mH^pG(N@`Ah-U<yNai@#}fD3mYlO`lbI7aqj|#xgGB?y{iS_neeS;U-ilt(
z?5WZRlG+?CsV$kl9ACbV7?!k@`O2|X_^Mi}TB=)#V@=y)$#liIXT&>PH&EZ-(BIhC
z)YH_>ZzvHIVkwDb4g#vjRG~+Fy~aI*P*e8EUw~ZweaPKbo!1>P9k8t07o0PxQ|S{~
z<Jlv5L-~UR1BKvN(pQSjt?x?QV{LV|r1~;_*}mLnF!U9HVJRrO8O2IprLW4jR6J8Y
zSp|YM!?lBTDD=T7LWv?pfu$st1#Sw@+T57T@kp;X@X+%Z0(IUa9nB&CZilS=?cYEG
z&abG?By1TEnD<(jP(p!9c}!+ZWR0O5%23{5{s3_-0>2JRo1@j$lIF|sWkH2`&H2qd
zLq@TMr?^x!S2kTSQ8fmFBi^C90b*FkD>0m+WV4nqd$%7ZSbzyJkMMd64?QpAac&Fc
z>3oEjTz=og6u4)V7f}*r69h3;1qD%Nozp2`m_Cs?o`te9tUQmt+tiWNZfi|$VFa7A
zm<scn3z`d?i+p6wP$IVFqWRJpDpXNDN|oolf(myiQL;%(99i%lM3F4yWqIh=+nBcS
z45k!}VB!{LBR%v>F27Ikc>j*#n(CbPM8aX?K@>z;wW2VOV}=-}f?>u)<`^g%x=ns_
zyA^ARoGmG&K}Il#ESVTK6*U(#pUC*Rrz(O3Yes5=D&#4$$RgRGWw@^it_KP%#e_D=
z`9j;z_cjs)-Npj?WrU&fM{ojUy`j1b{BNP&L+JN!WBSKE#SP^p^%?E4ghNEo%*yiE
zXOnq`Oo<tW9;4s9$08_@CrF+U1eqlZ41LQ*3sk6rFO|Yd%`g>;cFbEdeB$=@?)ZHf
zvkj)W0o-ZSz=mBk6#p=<cQA451x!3TgfF25Gg@sAb1*(Dm*00#)aX9<obsCLyyled
zC`v;eH0?L<g9;aIa}I$atQq_|6Jf!mz+|6G&|qeBHrX*_SO6UsG%cfC%52$;dkQx^
z6BI}IO^hZyKm1>^ImB-U<`;kh%Q>1`OVe;=zutjey~JJTPC|kG2;_MW=|<)9<0m-X
zQCx=t&+-HfAegw1ibJ7H*(?}N!<HF8Kk>7nALE1<Ol4M_*+c|Mg}Ke-4T;|z75QKa
zWN}0w7#2<B@uLVr_quItR%0e4^biE!#ZB|e+%4_`CIwA#oizCp^9tp^<nsFzQ#@aX
z(r-e67huCDbX)ojMv&}yg;dBG^88@IZK47Ng1(HVtS}W)i2y_pC~%S#2o170!ZQqu
zCXy{#M(~`2@zFvUDu!18FKERyaG9@nF?s)0Osl$zUd#$NjMWU~h{Uv-LEcL~zpq2-
zFLJk`z{{$0n$z0jJVE1r69`f?fePpBh8{y8(QocZB7RU{a<fa+V2~iGFvyR3F+4#&
z5^q!C|JUA|xVLqk2i_Mu0fGPtZUA=@;J&XS36PSwXr*?E+C^%$Bx<o1%aUcwmb}Q0
zcgI_tCeG3(Q)iMk>3nUzX{XO*rk$oUNxvlRG-;dmZ~1=jxws%eg1C?<#}Xsl=aEQW
z+yk87ec$t*d(OL<_F@Kp2N0dD_&$%Y6yHMFLJ$$k@~K$(eS}=Ri1X?x27>i?lQ-Ri
zA<CAYIM|5;;Tx7$thcPshz>(=PGm6mBm|F%H9-o6pLa|j0U^jdrbLZ`N{fY{(&Or=
zlm}22WL^S0b{cjhNeZ*zCl+`bzuk?55)Gk%jm%{5OwNv=@cS6Kc!eOQ^GM)!8tEl2
zCYYrm%AVhM3Dx_6JN2sN4ib3X@s#tj>tfDXgJ2#6kwW-Q!cT1QNL&J=rO5<`841y2
z;ioK!jDdy(b|mbGA%*7gzcUQ{2N9f|Xz&CQ*pC86P{AUU&}m8^r$Mp6SVlDXG7>10
z@ewAIb=(4v`Q=&L-yf1*?i)zDXz<G-fl7xjyDlUV6g?Jxxr(1?FcE-AprONY85Dv;
zIzO=>#w<o31$iDxIhc`jLS~vV?jV|a7+S_LgSgBv^_}A1V;rK%i_e=dXe+WbBe}a(
z`-c{{&|P2)GcxQ1>;Wu&>L(o0X0nw(aR62Yp|m&R5_kh0zKR_=pCBkBVS=B`VbmDN
zJSJ8W9|h+L>im=ifuNd4$L&ZeLAvjP;3PAYJ@i2+9*XxvXEjAA>jrghZ`$)F{Pt=k
zb~Ju1Q3<=$T{@8Z2{TNIFv&5*a!2AD$afHu9NF{xV_XNs*1x5_W_d|TAUez_h$YTB
z6W8JsUihtJ1~}#$^D7OGOJKA#m0(<d<0D{v9-YRHbgS-5g^mI2>n14fq8B<T>FGhF
zZUX_4eLTTUq&;uK?+%RxEK15pF&<l>Bpef;@G>8O6L20zWr*p`ChXKsq*dOrpW{Hb
z&u`*j{B6dH_X!htow9hh83S*i!;FGri4z13elh~0!6Jb&lNKG0kAk`kQrd&@FTqcr
zN9z)Z9hslY#Jg`{g{qYq@*4WM8H&=&J*pSS_`i?AzIuuCCj8#Tp?+NE%s$+Rx{>8E
z?#I;pC&=o;79nWGD7SDIN703i{An8;`3~9h`wYkWud|Bqj?!R}z?-(~N{6u{SYllU
z6+dP$ir;FnAxNOiV?>9O2<rR{3zC>eCkPs&U{d#sunV0qT#ckjAGcyGcQWqngK9Mk
zc<Ik4HIVMd4+*<D3YL7$>VZ%rPl3ZqmN#&p_=JymI+ozrYFVY0oZPM4ttk2~6nRHe
ztYmG$xPQVonYYf%k-J_$;E-}$s2FZ;?5-LJK>-h#)dgz`*5t4CkNL;_<Gyj<L>`Bi
zBlW7Ke~b0h4F+RHv?Hw~wJ_;bNA4uYy4-asea>fQ(o7Erk%BQs^^MAQ?1iVk{+I52
zN&}03`3m}U6>;tWjy;EfrW&No_=HJqgFM~cz;r}G$74+nnqG&6my{Or9VWX5Yled(
zMWX?|AB<ah;~YYRe&~ImAM3wl+wWf!?5S>vM4D>aDmu%%gT3s;4HT~`8Y~<N40A*a
zV)?P9iKU*|RNviN-&NIHHc&c<S79^|D;UWi&0p<b?O)@QW36wkNb!Om`;ZYyg&^O5
z1Syzc3{PJFTYP>&0}FrsDkIYK+6it(GX$M3^jQA{VHnfaNcpfA39OSv?rK@yvGKNY
z^&NVcYRCEp>xZhMp;*a?E>%c_$b)dyk2uYbv~$j#dE<e>U{^&;O=CFRP}5Y^Qqfl4
z5$a@5v!|rDq>r89{!~g!|F%66ZR}`iXs_+8>?!LF_H!o&*%jqJ6mTE76GC&;pU8iZ
z=(iUMs9`kkK#zps!u7v}=MxvE|KSf9k?v}jFb5;dn)5Z6gbo&is(~0V2+7lltZ#{#
zl3MfX*;w9D#oI|~F)-9JR5w&PTox@IDIOu?#b`j+DWSMlILg0yV_(;9SdY54=T78D
zi~53{<*ijsHTBe_u5GMts%)ufEpH1!GSsy+vD7oW!(;XRjqS~iZFL>hofX|>y`jER
zp(ygKB)UM6Ow(%#v#&$~T6oszSF`8T^}p2fX&OlW)7N<Vc!Ot;6AX5%)apx)gOM<R
z6xMN<+Ku(CISA{k3k5O~x<DP`J~j*vwGY(~)eKig%c4PD3ZWRlZZa~h5n0rw@msP=
zYd_%Hnmg$q4Gb3d1Ut*xDx0etY9mxuu5GAptWq3PDJ}imba;IvTHn*yMoslddu>Nm
zS4DSuPpCI&P>gd-?rY*_x9VRlqkS9v7O(#$pHEzr{s&pVdPlp>>cK&DnBZZ+xa~GK
zf>w%*MdbW@^!@65XeS@egN+SkCSAhNaOY6dP#A(0(X!zn7P5G>NHVXC1V&``p(w)f
zJFHPWWj*fL>)Mh#=^G_=Uw=tYu%oQ4qPeoMx}h#oR~xRauW4ACSn8R5;Z5Pyk%5M;
z##SnVN80MzYlLA}c~4nSP$)7Qh%~0}HAA^1LZE+&-{SSZ`SVLOVEp-OYEJtK8hR3A
zu5SYfL2QZeDU<cB_j$V0Lw`48M+oi2%%}w(K4o~gd$?t&ez=ZNPzYjQS+z@Is3_*I
z`A^)XXDxHq8OI*il!r|=e=I=azLM@>N2s;DrLw82p++)Urczq^xBcNQ;c+Ceub~ry
zO)aExL<*}rDurQpSx*RxFkJfiR}vMJ@SH2_SLxROjPnMlCH@yaA27l_$Lhgh%y|@%
zu0#esc880Rut3%qK19P`V4wH|Q_Xdl6+K%5DG;c}FdXd}ZX0SG4iD7|!I&Z_QaEBT
zEF39-;b+{1JKAN-N&7*^F4txj3D){Y3Wfp$q?_v!hGngl&6N#ROA|{yb0EAeye={t
zSykWD(9zgR<v0ko)ru6RFkGVB@@KJ?MT%eg^*{Z5h6d!6&gV@u_$m@O4WCt5;tC;X
zbGj@ZeSPa~0#|Pke13!}=Qx&F0uzf7P@5Kw_D0)>nw13BBnSp|DTE<U4`NkDK4YA{
zt6fI|kJ|S+cjRnV8%3l3;lQc@nrtv+|0BEOKzK)ZLwGIQA^r8;5G3PFOG9g<wXVIk
zz1m>dRhBscj--0XwEidOO<p>mcNCv<iq9B^y^$%3U(2;Qow7djF>}CI)%wgy9@(bQ
za#?1S@cle38tWVGM1vdHAFE|$sZwMx8XAV6NTJA~Jb{h=XXN~Ko@Fjsp0FKZ6pRzh
zTkVe`h5e$*ieYy2JnIjJcQOKwM@Axp=x|qKyH2nLf+B@5tW<Jn(tRiVGOhoau3+hW
zjP<LN3<aB+7<V%{DN_XX^{p?{_d5h<8==!9%w?J(SPH-X;jU=gP!oIHb?ERsK}JGd
z3S~50{eAAkTS)meB=ETXkYn1l9T{Bj9rrR4>QY#QCYNN-Z&!FKJQ-fY7&uVhD+DF$
zQ47gfTf<0UjjqY&`6nO9`k%pPj@z*d9hRz&Qc=pU)(1afgnSbTy~#ZB6n1($?})-!
zQb@!`28O$%ZNtq-U?f^QRAUf~g~XN^QW*O_tEcZODZglW!a8f;@7V3y7AMF^h!hS6
z`UCyN+4I{S-WpyX9*e|~z<x%-&QyX*3Tw>ynF#3TeOmbS;M>zG5~%AiE9@1b#fl($
ztfWx(FW*<Az*WmR>q*-Y`#$W5BA7ejUF#e5N0k(2%kNNl4;Ex&cp|bI30#E^%P80;
zGRS<UUPeMCg`ao-m)m}9bQKn)6Mnh`*2yTS<}!+)J`y5@vVW<E|39sX1Rh0)_d0gE
zwu%f2K}NzDQkXrz>F{<&K+)h>WC#gFhm#315~9a#%kAIieZAs0+=&LaFpEI~bsbJ2
zD0*z{U%te%&SjB6>x}(?V_FEhH$jm33?t!)tlwvK912fkfv1!PkLnVL4yO{7k?`~G
z|8m=pjSgagyU^eko<n2|tV>H!53qQLIm|8Q>K84it;g&~7zOt@cjyFVM#!8nJ9?h=
zhr)ZYz+0HZOvEK{puUfp45Q$@7Nf_@?ce8p{fOe%9qovT28#q@MU)OJORSHBjD&x}
z{-wCLzsgMfI<uHLN-v<pjDmYy+Y<!GWc@y?<4|~Cco+OOqQPsK#YB}A>17m55LDLq
z^X~t0+lSvsKNh4z*I<!ArNc4`CKJ^6f5pzf$Sm+FlIotZ9=FZ#G%_ugI8Ja~*8C1D
ze%lxU*D(uRt;Rr+Kp6#-2`;yPpZE16qeG+cV+3rEi3T@{cTYlx=L!BP`<JqM|0X-l
zx0p#Hfh03MMiGz$l(N7Q#|dU9eV+BgZ#NcXDz3pJeC(ji1Z5@z!8TosANu_FiXYFw
zU82EaLktPjby&9}XtC`7eu@Qtn-TB@o&~Qc2|U3}@Q{px5ZtN=-VeV?W`Vi{t`a*e
zqo7z~Mey^MpXIjyJhQs5v$OVH_Sd9P==UkO`UAr8{)CiBe@faXIex_P7g-*)+67iK
zKBCQU#mk_4rds{JYFXOVTRz9C+Bf-BIV+#1f0lgGk4Tne9RFpx-TU0H-*11?@;p}I
z9iCC7VCbjxTZ#~UhrWH6J@Y^2_>3c4ex8b9+kT!CZ?NY0I<3CX)3+)B$^6@BS=yfU
z-LSk0L%rOve4>2TXX+jrcTu=L?60l<S=V{n(=dFQ{=Q9*-bZ>qf#lce@i$eU|B#)_
ztob!J`qwyjT93j|FEV<UcHhEsOS>O)kA&vewb`PJuA8=(EN^h;L;gyMP4#Ji2ub74
z$$fj+UmHuO^NzXZ>{qNeEVnE#B8k%5cNE7D>90=m)2#WqyQ_jddE>6__5;@AmN|TK
zu4Aunb9aoE-^GXFLv1!V=e_ED&i119HOreY{2=MGzKI5Wn|mf&@UXwuSMCTL^d8SS
z>v+=kwDqR-mgNo>_Dig|3PtJb`}FOj?D-9Zs#^ksxf8Cfj=k0y%SjksV#a*~tNj8x
zeOGaOQ=2V6?SIn!wBtG39qV0-A))#mGxS}g|6zYk)^07C&Oe-cGUtr*qW!Avx?*_S
zBJ!eBd|OFlw)*QHE{#NjUHQ@6N!J#~9{WM-QBsc2u|IQ_S<f?y;)~p&*~%vZ=e<{Q
zZa8j1@D-%+4Mq=LHa|k2KZ)!4!_IFaGF7&#aKCRx5j^j>V!vj)VSN@kyloL_c@2Gj
zgIQnJ{GvroH5DyIeg07hPC0hj_t}o1$&@ODp_JMaeZB?1Q$=&WOB4-%27<S(FI!)=
zypAU83_n1_Wvu+<VaKP9H*TujR<gI?pzmnzshl&e^NvgQr_khQ6vG!RFIrwf5?{-n
z-$-FwMNMO{Gcf2I^-Q`pJ9jwt+7D4Q;AEVk6nuPIo2@!oayI{>_X<VBZwf)%OV+zc
z;p>*S;tb!1;YX~bZ^J*}ZDnJ=idFjqEREP7<7g||%gV5BgXFxNO+IgS1K!Y%)BbDd
z*oR2`;`X}>W*EEQQ#RFT-}r3#f4q4^&6d#aqJ8;?@{Z-6a-VUXb3W;~iWFYg8CqYk
z+(DbOmcP{no#o+hd24Y`!BE~B&t%SKitSC?_alWzVR#x{zQBk$TYaMROu_lQOP;4F
zc@06k5PStG{F3F3B!=QTwvE~POX$FR%F5`vEj<-lmgW-sUS#?%jD@&ZyssvaFynU|
zSE^a==Zk1ry7q118rGzrWh@gLXN)Wc=dWqwt?TQi%6F7Z2lo4q<Q-Q8=bRTER~$$o
znylo|`n)A;erpQ4gAFxRO~KB>et*on*0T<R+a0?p`f<oQW2NLH1q9}_*_so<(}i<>
zk-%#dY<^Y|d{OjRN#UC?6#FGFJ9~x)(f((l_$4(L`m*YaXgEGZjRDIzK2%qcqq|Ue
z2b(AQug{ju_&pE32)-G!1pTLNf1Orc4dc^Tz!%l0%M8zG{S$4xeO+X8)wa-{qJ8M_
zjQ51+l>4;ntn-5NvMz-%yooMn&o95Xw5h75zO1dJCm;n!#}vVB6rI^?KVTOrJZ?E<
znXNe<dZOrT{)N0txmVHQXCU}IGAIP^!ce5}O<3tKEBjp=*q=Se_<4(d7;~ZbG<+G?
zYNh$?Deho@_7WEAwwgb@sjS|Y&G4bQPt&p=SCA#z--#DdUNx>(2k;`2Pk)JP3|ses
zHr6rOu(^6m+0K&bLK6D<jv54A=be`vSCtfsCTGoWt-r6Nr6L@zYz}r5^%V^HMj^N!
z8QezcBlOsQ$R<(<zp_)sXA92zp7dT39fsg@34-XclEOFGqSjv)+)-j^@)07fF00v)
zSn=01HWat8+2hN{p=cdGDkqTBYwU*0j8kl%+3Owd#w_&2#WijF;@#DxS*zJG+J+|2
zA?G)liRts%H?^_OiN;NJQx)5Tdy4i34)_n}9n%TsoK+(sdMsn1Eo**b{(<7Q^7`89
z1|+aMFyJ4~TkRcJ1h+zP4|;sie%Lx&d%XNq$(g`8zb=8I!#Cpu?^tCd#F`+5$^EZj
z*Km^ku_I{HMXZp_hEVKXY&eFko_ux`NALnV$UgXadMPu`JIR-){aJhqo<#N}ggtHh
z2N3Kbt<26|+W_y|iyzD!<FVfQ%i37?MDxb*RMpn9T}a@*f&=~;MnR>;5M(42Jw^(%
z<~QzNRn!q`tgekzAc4ib=y24##vq6_-h&iUz8ZcfOBn(4FX$484kr;*wnX$ewbMbo
zE5wz150Y>SN|!afho<rA?WTu0`Rv-RLJD`H=_k?k%jl-rD>?qxf=f-B_9d*VMay?X
zu$LdVbB9jAP+Gq!{8~0d(BSeN!9B%$1Ef)76g-}LQpQ0=@B(^##gQ#P&$w@}urt_H
zSyxxt7-}u)3iKBYi5<zsHPyX|RW57^QV72a_!XTkIOn_Qy(C5J(P6P8x)$Go;LAwi
z!u}WGm^VN#wjv3_5LBYUQ6@TtPU`p6;X}0sAAxD4^d!ABdo7IrP1C+;usBcVc=@Lb
z_u<XlKWY6O<Ly|_MC<zc&9zgN+d{ibrm-R<hGi5q2%d4xsgdv=_zeZROIs=;XmEK;
zumda7?;pmF%o7w_GFyAB;$-lN;<LI2d#_+c&|zgq41!`y7WY2@qb@XcJ(93b_=$#%
zDc3Wf%bAY{S|>Qysk1YSFJJJN#qqz3+TYL38pUa-RI^Dnx{i43`x!S+X=A<PZR=zN
ztlCn(QzTI7@Dbm!6oTiRS@WC78xHi8w3ati*F`FuL;{QYM2E2>YjY>#GPun#TX(GT
zWaw1!nZh}2h>}3hHPK;3@D`Tj1=}60@q+HEU&W2P4NYB(U?2!G0!qgF7F;fJa~$FX
zEoaA&lU?a9Px+hSADJBgyQux047+{YHr(_J`0RG(Q{%|^Zax}*?UPL#!)S2D_Rt=Y
zz`%Z)$zVs~GRT~I&UH`xdP~|uja7BERSjjWNMJWQJm`;NN7i}_g2L}a=v2w+LNr)x
zh%SLjhi}SkFitR~vx#5)jP*^7MkB&cMnG}uk@)FitH*86#{;dI4v!**X0P<|zbWk>
z1k^@^t_`Q6Tsz*A;)FlI2W~{h$NI)Qupp7mHB*(?5F}7p5oR*8dB?pc^l=b^+4J*8
z3;IerLQR$7T1h&lB(PWMa11+Q5LEoiPU$0{vLTFto~NYby0Rk%!KM12hp<W2LB((0
zf3jK&*xW*}f&sA`p)`9f75|&k{xl=3gcWjs#6h|cf-T&i5$+HCI^icHAR3Gf5eY2Z
zj~!N{V1nR1@Qda57k7qQD(Y+M!j-xNVnsxUWfYY8G+GS5suRMG5l}Q(B=E9~frbvp
zWpL^Kd$E>v!f)PxvIpaW+mt=W1Hp2g@6rvZGvdFNKK?hUeYroA%wpOY8uB$;4sM4+
zuot1+)-v6+zwtoBA$1&X;HWo_BlR2+{dmuJqv&FAuKaZ66P2f`PFA0&IZ<=G=6J1s
z9IG>ql5mVY$?fc|O|!$cKYW1WpgInPjpK0mu%vX$>d4f6sRyX|iT~tg2E0LrraH!X
zufyeGtd~FpvzPAwmp=YCsr{WO>nI~cJx*B!F?f`Velrrjp?h1$uC_fbdz<$)?-!Dd
zdh$t$u1Y?>X?fLk(|gtbWZ-<!T*=weGr`lLC(2HhpDI6D!BKg#@<io{suNY7p1S((
zNHjc#XT~P>$G737v6G$h-S`VlbL@qx9NE#E4QK8@5vKG0laovhdl(k05RY7^D;I+1
zX^Dj2((%8vZ^XCD{aJ+sstgMjAWALACT+5BbN9B+o$b5Z_K><$9SM@j{JyKbXM4%{
zocn3-72l=&iv{Nk=Zel2pDj64db;#<@QKi=(5W&xJcGg3`o@m>ei*I}kFz_z0ghAn
zs%=9mb(TW&!SI9by#M3|hV)fPpo-wPc$`ilX!g?mBMtw_H1WTw_otf?q8uaaw6QK!
z3c-nijlEkSxP$b&$YJZW&M=9jE|K`}THdg|=y)#YhUc31O5P>^#rz8e=L6?-j>Tt6
z;0Q@itf;H1rJ=FCz9%w(SH@^~Efgn_#*K>O7A2G09~eKg^*J+sX7Rmd*N+cw=-=G4
zwR>CFc0G2lZFed|MRH%W<bQeB`kL*I<Cg0g_tT!IgrV<A{{<+X6N-grMIMXYs|$L|
z+G{Df*VfQg-xpbhU%^Ot4f;HxC~k;zd{FXd3uj)RGwZ)ue6QK{<3sBPHua?tL=tzm
zG9soNKVWz9L(7}iyNck=oa^puxmR_DzKeb&vEUpW=L+3x@&`&gE8FTI*xJxp--9L(
zB8Rca>I6m6=?BG+DMZHgIkWzo#rK+BKQ=VEYD52~-l=XPMv+0%XzPcJh)D#+|L-g0
z5PRMFitV=Jmh-0ThDf0>^j<;|VJP}s;9lz|IH|0ox;3H*cGP#{6*LfMI0{42=ZOc$
z&t!ehtp8^5y=K>s4NnfPmux>W3d+%$AQ&GJbvgV~c4NQBOzACFm0nf^pLIPWQV7E<
zafVO&F8JMJzM;b2U}r^pZA+xN2^s7_k9+ZJSrr*layX(S@qx*o$@-jG|IOli&8{Dd
zPN2aX`{O#S2x3c;4>cl6{)11k^zT~Uu)b=0$u2SoL6Jhy<OD<axGzdTNOxIBRa;$4
zJ=I5B8rvb*gA^t+d|>=c*5}OnZx-KccD>>!8Vo_D!$Q!oCG!;9e~5R=H`$vZejgpa
zs|Y^td`|RONukd0ve!M47b_qrDb!WbUeg**5bSK|MhbOJ9>S(PFn$s}m|=a+tp8^5
zy=K>s#jqfQ>xAFD4yTTUisFy(MfnzU%@2{lH__o&Y%d~%ae}%O%4nEN_Mwsd!J@v>
z?(&YR_F4!wP|8|lQ1n=rLKzJo7(YFJddc-Uv;LdK_nKX=_zkUFg$8pdJFJg`sUu;>
zkMN9={0ASiyCo9%Dx;u5Q0cKqA)_JLheq>Na3&JiQQcPC8i^BZQv~BuXrwhawIgQV
zU#*{JSf4Yy-dMj{D!$k3dUE|qrcvF7B#(j%M#3NAr}syUfS)J{e9Q8sd4i7T^pVhY
zov{#ps|f@_gM(e=9hK;CT}v`SU5|So96wqA%dkFY)_=43UbE}rH#{i>#fGHnaLP#d
zXYl(jBj8urR}~3-10BAL4Binf7J@PoG8W>755EFJTT6S&kU-I4?1&+Q=y9jg;|Hd{
z!Y{-6oLT?P;(N`m7k*e^G+5b?B?<l+{-fVz4k+^e0b}4>N{3&uz7!{jHHP3bF8HnS
z5zM8+tCa+b4r51}8`U`2rU-UENcofX(<RsE%=$0u2btn~&8~-^%wXm<c%cr9En!ah
zXW9>#!F)Te!6JcgT1AInu_Y6HR_7OkpJ;HmE`jLqB7zT4{tmO1wB-7nS^xF*tEJ<6
z&8~-^Sm0z0Hl9WdJEBHH;rGW_kZ+*DA2W*)3Dk91C-?$dEHX&;ur<C|{*VNxm-d8o
z36xO~J2EeWZK}vuHVn<ae|&vu$@Mw2>-G56rQ>_eu214OuffSPnK(t^_Zb%Cn@WQ}
zWDI=A@+La0kAg3v#X3Rwt@VxMs}Ya{2}%O7BIvN#k$HmIu#@+geSfp{IkW4{;(N`m
zho5eNl>{maB4ePwqL3tk{{>&}&+w=H7IT<SupvmGCK9N0SmrWug2FFv6n-)S_QoX;
zD-zdXnNO>6@PY9&S)ViOzwpZx-)nZg;-_2S#4J!zoPQco6oudSlm+=zY48Vldl?eQ
zC@7X#5hVNCSRTQwY6R><0=r}kL;@28Wj@{1^uYKnvw+7k%ZF&$*8k|aI=FXuTWoV|
z<H-6EDUx86oRG2;NrflYy34y82iu9s?v3@w21Ztm3~~&O49Ab?h#avIsvdL?55|UL
z@=I)F$(Pyv$8wnFILOq-GRuc#o&SEMX|QXsZ)jj>aCm5VC^{UC#yF(!qsB2Z%AwtI
z?DKCe-B`K4b}};2INm(oI^I6sG1fWO&CxT~%hAWty0v?K-{io=AXQeR&WcfEMX#<h
z|8J(gt8>dN;IYi|AzAZVTQXcTR8NRz`%ot-V|t>!!+pd3!vn(u(N)n^vB4N(Mkw8o
z_GQbH&e^=_z|NAbWmA=#YBtoZkF0B$Y@BGGXqjl8XcL;_o#Rc@9oxDo#?rrGKow&d
zn&6-u3oKPRme^QK4K1epmRZ1KndL*YZ0moF7mNg>RixReC&;j6sBNg7ki)KMcT`9U
zOQG2x?PoZC-ge$~%zH3@x^QR7_RyA!sjAJ@o9i}(H%2zpuWww}G}#Qx#{F%(I=6Li
z>Dk=3seeQNdML7tp(aw3$t+FzEwg~fGRuc#Eq{7hl;KLGka85mjYuLChgDe=!v7UX
z!Z>@GV!vv6+CG<aJom6~f5G0O-6cCh+scIERQ2Xso#V#(jg1FLdb$&WTYEP5iorq>
zDGOvU)Jd)znl$CN%mN<EEFY3Jze(Skz(@&6#)M#<V%RX;q$suu#h6|er3(}Nj&{p>
z#c?+0WbRDf!Tf!J>7qR)JA>QHww7<HR1|CA*l>`fr@K3Lkn(wJ_f+pDq)?}rEQwGw
z<(KThXZq&FZ%9oaxcI!;U$V{Lxh`*Q0Vx7YAxK#i62+*pD8mNBmLbZb#GuGS$_LuZ
zmK(Mwoo66;H1Cjqf5E;&VYn-ZB$jWfP!t;uHSbg9E4Hho*}5JZ43$3X+HCgymHWjq
z3V19`A87h{3*L}5KQBcAB$;!NRDnXUMi@rPzXZcZ#zfKQmf=<=iC-bX>@Dll_KOfa
zm3u7ji0@$j{=nYC>Eb;lyMsGJN)j6mHIe*T6%Wt}Dk)Sln4l<Q;)1Wc_}9e_e2d?c
z;V-)T+kKZ0L4U=}>Hp6EuS{Rk-^WIzEDD8<3en+Uv@8a};c5s<fg^*VP;6nD>kkOU
zpiG19s^h%tjQdpXaqo=pkpDozzQEohVYnxVBsLtD@(c0u6-K(<WGReO%+PiHl(7Js
z?7(Mw(fk|Ir4O7xYyMNV{3weeg^ff4ONT>>AVoby3Ww_`w3T4UGS{a(n!JhxUbkP?
z33^U=kLDfrAIKL?jx&Vc0k%i=@)cr7^uw?wIz9e1?Lm46KGVyLH!QP!NY?TfFN+c&
z!KWfPEHWqrl@yXNfjuFP29_DV#ltb_wvfPUj!VvSB7+b-mN$bQAHbf-XgFPexba|8
z`HCch-CGjYBpZH9Iq(^PO#V-Hf9u5`=pM_5+!Mbszi6<OMIoJdTn3{;P^6IZJi-uu
z-)4E?E$tN~@P_Rv$CJ)E*J&ZB^w^NXeelDAhy=!q2Z$v$4)j>fX|m(Dlmnj$IsLTZ
zUUi@4L+*i}D(jIyYDi!(suL7jq6iKN!7$4VQps4#43MH6D{|Fw(RnsO@TgC$37RbY
z#DXN1uh8vCT#t=8jVXjo4t%B;C%eDhSNV|a`NhkktX2}JM#11Pso2qCouF<_YT@@C
zEC_`{n8iq$1}PMB!F4uAWbh<1D0+-F5q_$GPjc}9!;a_-l{GO%klBIH^zwQ4xBDs|
zau57eSr5@*T>_O3&l6N5A$m+A4M`O9J{tV0lECZe@MY)u6oRupTwLKN7KDR&wC?aQ
zKME$K&=i2F17GGAmYF{AUiqoQMluE}36xRLkU`y+R4abpU`qxK)?QK)C^{^5WL^f3
zdym#15%*THARP1MD;5&mW*Pu&7QmYspH|ey?n#CK;(~5GG&2MMXFY#mVVN}A6gEPG
zbqN$Zyoey_Vx&-rqzrvSyK8xYG4KXDd{x;IgCK=M;{3#dhy>0%+$7r(q>%1s4gfX_
z;5FgbY_XR>aLEATuuaS8V3<8WDeFNha?xPMK%=w~iBgkfFj@saDHQSv8zgU1DC9-G
zPzXAFg;2UV*BLbqp2|HDQSzr-5Mv%4*WzS5BK$H30GkHzW(G+3ux1J1L9;{vG46#@
zKV``P;B24Y^s*?5AEn78CGTpI%PI*JOKfOyh2r;ZEQl%;!YoFYz^Cmp3SJ-(hcXvf
zl2bZAV+2f*z+D|l1mnG5Isn)-fHyNg6+SGLDV7KzR_=vDaESomd*Nr4^)MumnRJ4n
zl*toYO!_6gP{;?2fO@GP3WZp&D;>Us9f=c+^J5i8Ka4VPN(Oh%7lTXm-a=$Tx&U3%
z0Nz9kX@7`^4-3Jx0mO0lLLrzo0J!{R$A^}$v$30;0Lw^HHJ|?KM->-r*`&4lVYZ19
zeD7JZNq?1^04rU9u4w>oT5O3w#>0m-j}?Mx0*K@8g))Yu4FGPO^<Qz_wtdMW=@BH2
z(tMgKBaM<ufYm(g-|swUHA`#t-E8?4-Eh5Xd(UE){_0N>9o2tW9H5)L{+Bj@cR>pa
zeocfAYdLlZE(#z{xEE^V+)on#+`hZ?c-}?Vb^8m}S4bWr39$53N|KN&DFIe@V{lF0
zPS;WUIqOrFXR-L{(^_Sn=qY&7d(-)n?G5X@mQ3ldzPF&e8nH2{0l=mKyo*{$`9o6p
zu!OcP3Ls8$FVqN{Srh==wYBm<;Dq;l&Qp$OY|j&LO9~d|zIp;IN!KAsPrj~wI#5;J
z6By6k?mFN&Zk@ASwxmyM^-;F+S3c*z>VD4gqU|;7n@aZQQ*9-uzxr-UXZ2rF19TU!
z|D~Q!(?If{Qo@I&{>*{^V&R)`FVsjhu^<4rcXRFTlEe8YbI;{mc0O&t3BwmjNJaV!
zMbSur_3>~h(h?lZpUB<n+UuCHpM>Eh%i^?FZ)eM|@^s)y@6$QYIqul*T3@%m1;hFD
zEy=Q<+*#x2<N)1t>wn4Tjcb<pe?j=L=E#9yQULJ+_rgU1z<nF*ca-fbob{c|oy)o8
ze9Cb>&hQlrscf*Il2Qq2jFvV<D!PiI{>j`eu04)}_M^5ZV0h7zn%3%W*8H9*I`6yU
zx#7A6LDFAY->|$5LrKkMB&m@!LZ2?t?fC%R^y`1~=a*=}`16ABVeE+A4Z%bJ@dEck
zu_Gw~zys?Ww^r^hIaqKMf@gCsx~@2`Id0fUfMva7eHn(L&rlpGYOASf3HAj>eUqLk
z*DmKi#}WH+>l4;<Fq}_o^+LA%s!o;6<zMn%b3X&Y+qRc&uOfwS#u?71zxt}t&qe<m
z0lG`A{~6~OU6}fZ)b&%shc&lN5sU{A>wJ^k3#s{$900s(UGr4U&d}bX!}-VZPUW6)
zUvOPP3ZJ$=6K6<TDxtW#u(KlESl(GYm_M2~>DipK13f-upS7KgGt|>s-N=?-^~ut+
z1sC(Kc%II=>4c#DCG_|;E9tM|42>i|sohQfO%KqO^_PqRyvdg={yBO5G#);TCANEz
zK@~tO=NGsaN)@w20Px^s>!!Nx6?;nd2WI@o;{-1`uOfxlZ92o-l8%M6Oyv#Xs@7m{
z;gElgS0^~_*iRB|(pp(hqstd8mq?6~WuP_bo6Z!T_h0fp<-YDR2uj+VFQLiv3_n=h
z<D|bxfNrMsKNY~6^!_wI8tbPi;lrBCVe?=s2q4zaFK{nR3IHCQY~L8!TD2=QU39Qu
z)_2@{%5%nj&h?~Im%<xyhA*rs=m|E})HIcKmh=Z={<Yq9$l!L@ZpS{PaK=trD=TTO
zEElrp7dl-umyZPIUUT1YJ*#B!MVm^0r8N01i=@9IK~u(;NAb%Tz?<><rNW1q%h+-u
z7)s`w2>;bcwi6Ek9-8Rb*f3SSvusbvzQV)#M|~$Ccv=X$F2|+thK&SRYxDa`TdKnK
zm2JVEqE-1LzA;5`n~Sto4$@j-PmT-2toa=;BY8=|1>Ys_Rdo0n2tJP%3qh6s3MnLo
z7!1u`4`lsM$}gvc4{IKaQxROky-@0JsQ}=i@y-p6o9nh!><&&B?GGIBAIsAT&S6W0
zpp1oRGHIDg+A8Y9)y-ucC4GS*|7hNrcRez=%|%)(^jMd|6WQ`p>6^|5&ikK40!4>O
zf0ZDp(qAEkl0xhue+`d!Z)hgazG@c|xVI1;7J|7VgJ*LtxGpI@W-KHvQ*lRmV{L6i
zMO(1DcpxzBC#{ttxYf1Oxd%Nat(ATDUig)sDLhB&4Us_6VbWhIf}|c|Mx)YSS<MrG
zeZhQ($GX?IY>Z4*?<hwCi}wW%p~I56L??J2f=UX<{ewlFp(aU+(;Vt3A+1$G6gx7W
ziwwHAx^^%poJNliWzSEgZ<<4cNx+5#KAoco;)Z_PZV)61+ZT+-^y7!ede*f{`jTyx
zyGWn2w`hOC5k>H1oS;4uUKuAWwL92S8L6voEN=~VVMR!5g&moI;3f!)E!pGPe=q!k
zr&ao<iz<1}HFQ|)2+5n`1SS2|tEL_Jg5MAw>z!=d$OyQlYNsxNg$MIz{33%o!81AM
zDDQiD!be)B($<OwG`O+_3G6BAM~6pz^903~?8%m&O5cP9iEA+Fuapjp9We-sEqUb&
z#$)>N@SAAg&_s%i>g^S}1d0yN`i_x)C{EBl=ei(itw_sME{W-?nn=skg%uGU9?e_J
zY*3d$2;LJv5&)9G?!4cSK+$0mQ8Jf#-thuj3_;Tle8F#sj`dA+Y)}cD(BQH?G6s@<
zNM<rA1Sj&M0hN}iDqL67P}zzEb|ZnLwZe|f6Wn?a{8aiONnodJh%SMm!=%4T5WMpR
z<1zhs_%VY~37oc6i4D;cI33JCqI8(Kj2Z{eOn74j{UxMjs;sZ8C9Rc`K+;;F!!hhg
zf?)Ris*Lna!tY5n29o|tje<!8O*`-fzacu-Ki;{%bz{Tkx~b}I6$uFx9Zn_)Khj!t
zg%|;AYQt3x6(WIM#iX@Dhexm@GM`3^?~R{W5T(H~iy{4$q_7hmW)zIe;1`U?^y4So
z<L(LfB*!|AtV$PUERmwsRd|{`-R=SRusiA=aj$l-;aKY)<8#J6<DQB55iaTR40>W7
zIj^6caO>wL$#G{K>qwyVaKBcUP~_LYCT}ffV2tUUen?OBKCJV|kv&)Kf^FE}7_6!e
zMM|5B+6p@hx(j;p`}_m`RsKQWkZ(9|I4|mrdBd^#I*d?JZ^1zRpnu30^~Lf=@<zR@
zIo8OLyOv`t_u;0eX`|J#V5~?L`Sq`svhDG*=VQ<u<NFKG365+D;<d5gxhAhC&=QPP
z)YMdj%Nv4Cr7b0G#T`YRh24RkKyRS0pg%vnDbn3o-%{0H)>+z9+*{Zm7$_LbAIcvl
z*C#AFMtq}rqsxjd`dzRet&df*cPr)NDSW;<f3+WJ^sSxesPp7OExUu&<8#)XuJPQ#
z{H~&wU}Jf>s-~tYTv1=%7-|Z(l(v?%m2{MJ7L!aYTHn#w&|KSA*-=Ib`jXz_zM}rZ
z0S44T4oGrD%@ctAE+A$2jM3&;U9=Ma0V!5X(QX(PNa1^-C>$vuuLx#GZ@sK1-O!HO
zx4R~CqyD}?XK`z=sjR*-TwPmRT~`^YXb_H}mJlh}#v=Xo?TwAib*<Iy6&+<=q3+Ti
zgJR*TIL8cM{h>HD+7_#i)<Ccff+ew{SmCH*h?kH~G0rh-eyY&C^?+lmYchA#H<;gB
z*jdsRY%Xi4h*Z`I#oDTHWqk#y0N3L--P6zp!AMJOTNMn;yFy*To;XD%jfD>dqzqqT
zv?JDt|F@J}En_4sjTS40g(8PYVuE7U{8XWN%W>;o#}?P5XEblf-yi5H>JWyZ#`1>B
zNL5`;U2P316jcHF`mTmnMX;{5roE~|7?yPfyLE~seZ?8R`a^MQv@6yUtH*n}YB)|1
z?`9a{<5##^*Ji~qYksQGyf$Z<vF~wAL2!*X=3kZH7w9hTC}|D0gqq44D$!?Bd#VER
zk-qv)2sSm<w}e}3+p60u;|xPO!%}j@N;d5Mq}e#kaU^lfuy#Cy|6a3YBMCU<{Gl}O
zFoS%K)@R7ceoU=IFK)k}J8YAT=KDx9y|DcwTr=7;(iUq#0&9{8mJW+&PzuAW`6UX?
zTMycIIXCA_de(YJd_(>L3~v{vqczwXLYqm8N&)%ss1%T|@0cejQaI1BD=1~+J4xQU
zjWkXB;*|7$=zZ1p+DjTqj=O~vs6zRG;yC{fGk#OJ1#M7m?@ix62?D7yO{VD`iEm)s
zDY>7cy(8_hrf5A;9My4xVoOS+B86g6@bnWoya#@&&^!`&)V9yD!?ihQooCEDnm6oU
zRUn!iX9&M|0eP&5Dj<&xHjpBrPDx>cq4ZyBBMjg`wh4;6=!M=>A$3s0lVq<YAW}L?
zM)EE6QaH}P!;Iey^vcOayhfWy-@Zx^m`7y85Ke-d@I}~(w96;#S9fc)Z=^HY5~F;4
zv=)MuN{e-Zx)j0?O)kuuUs9oY>k<3j1i{tbs9(lHG+7vul2;Xw532(5jDo6wyg?8t
z6ir5#_5NGP1lvk_-8C@U3`L=|NA-d|rt5Sp-=~Gih8u<Q1~hX19cKKF(N3S1i(AGL
z?Z*=YTG@<i;VzD%3mdsl+u$hQVYGjg^6_yAtcg-EG)@p(f_J1Yg;?dR`6UX?BZ0@Q
zhwRgi?a1JI4~6C#33Vw%li`;rARm#kq4NY~B*dDuDNUa5e-VjcJ7KsQNfAa{+4I?{
z;jeHP-_Zdr-(xMqm$ZkR*&<6D;4JSjdmTe-*Few0mTubilM)1qTSzOkql_0qQ9faG
zV3hLlv1TMNf)#<_P^GfPA+g5Fzf|<N_#XJB6q>i~x9@gr(+TFvNEr1GB8C0%TU0>a
zkill1U@cNuliYuCOlgMDAW|?!;?9jw#KY-rihb<B`!PSqUWl`DOX3}qP&9kZpotSu
zDdLT3+fPXlm_yFpI+7UoqRsLhCUt%kpN_?gcca7ALlV<L`FIW)36&Jy1HVL}dF`C#
zBsxr?`8dG|2#)4aXx_hyWZLlp@?i?dV@0I+bO*CR3dlD#LQqCRq%gJrK9v(&|NbLL
z!Gwl?*!vVdIswjRE#Kud-XWA@NS4_vHU8JQTKFW||9}!TXHi(mk>|B3$AkzBdq|cX
z(?(Z~_QWV3j|PhbQoI`-u3AK}G;4l~3eBU#6q;u~y_w}5A-Ebnj>2zI0eN&dl^}Y&
zu>W2ppoY=B13i*H;=Lgnd<iic!TT|vyxA%@cms)`rH5v(h4H^>+7}JJ##ChkVpq+c
zI-_MVnotkJZsFGpKhfaEAyqzJbU1~e%&N2Jmr`h6Ng##h83lJbx4Je#Q06mhyd#QV
zynsBj7+nH;gdl}~4H-m_7xzCGU8`iQYM}~9zlO)`$M`org9L5`7d0jvIUZ{OIjw_o
z`{K(N{AF?c@1pj<f+ycCO0n$4PIW=BK(mXRV2JzJpp6bPgQ0vpv*<=5q2m&W4$FMn
z&|(N?Eq_M*TT-EUba+3rX$s9J2(E?Sf&%hb5mi8*Q7}%BId%Pl?y6ticuAFMwOETb
z;r9vtb+@&POlH<0ay44M-R&mxcquYsp8sC_tM*_CR&!~DYh=M}QTy*vbn7Zp?Crd*
z1A+m`Kknr|mJ7c=EJz0$tV^I+5t+$gN0JDJ?tx!Qp?Rz@h30Ll&^(sdAUFoUv<2i<
z@o9q~Gs1NIq=yAmUr>C$#t8UjN^xDIpw3PPyiWcM!5q8vuPpuLNq<fCUsK8L6~RRN
z84?7_w}4;&s45?ykU*IUiVmj`%$i@CLi5aHC^U}_izU_xuA>Bl@QW9a$A-j6Z!8ih
zI^3C%!6oaT=P(eJQ00U59!UZFB}$f^=W#$DC)!Z&e7ncx$fbW~FYyoBhxy)sJxz}P
zO=>?wf<UJx<>TR}%Ex0v8YMTbZbgtleJ-Q4_#XHzC^WAm@R*fC^XRZxVntB#TcUtG
zPsE*AVx`4P_ur-YWY!`6Yf{$!HX3%4DfK!=!Dhhb+dM9($FlVG`WeH&YH|E;Qu|W&
z{bknW=6Lwt&L}v*;9HFm@oKi7{VfNZ4mBQbJlt@^IHc@H{fu!$W;kYhV)dIM=4Cx7
z5$KyOnhnmD&sK6&%~sFWsH1jPKk8=n!!&X1tp1Kn{dY;~E8YL^9o4^cs()jc=0=7l
zzb)74%yhk+oA<9;9RC|{Uo6y;2?7VVb?@q!Zrj_kzxhD(fu{7uKZukZtZVIVjMk4u
zGMDw(r@d#n;kxLZ^PdhpQFN;0Wa)|EiO}(~<K-L`$19Fis-x=IgWxBwwD6Pu83ia#
zqKunS#crk-RHDI$&*YUk{x{zKR~TW*NI=22bK0!RwmO2cHzR|k!<+lIb?@xl-44Te
z0g$vLW5KTamX5~$`e<ZzWIVD|S&!YsUcPF(>AdQG(tF-Fmw&e4OyTLGCyGy%oGLw8
zdNO!2bTV|JjN`%Z1CH+hXY_9lf!c)*4`W9nOfgK?|4rk6rTq*E0;3xSrh0`S5v1*V
z5)36OZJ(0KgSt$vF6ymnZ)k3B>}eQ);b>%SBwbmL?Zkn-XL-r?oa5=7E1pZ<i+Sh$
zLh)=sIG!%nNuDY_^`Q8ft<RZ=pJo={Yj%BX{ov;Qtv%bicXa77xO>{BjY2dq)DK9m
zDHtg0tZi#-Zfopn=&N5<Pg#%1n#forwXDY!(SyuqtuI3GhU=PJXL!MP-hVEC4vJ?B
zb&jVW6hGvxCe!+yS^v%Ad(EyNSvRz4U~``k+#V;ms{?Ib$Z&1`U};ZfM_p@Ub8BN~
z17$tx2a!X{dI&{T97S|`1Nn0(v|@S9ddGIl@r>)~oTuDZaxZzG%)8*b;5#Q2l{B7x
zVEjzh=gj(V7T;@j{mA6dhE<#TMTdppeDMl3A|@no@0fq6xVNmcx;@e&1UnkKk-~vE
z!%-NDJ|m0dFQL$i_NL{oB6!nz9Vxu(Q4G;$T@s=A!1$T0&zbe#EWX$5`jN@u^@Eb_
zXNw{j9|_};;7KDQWhn~#gWVM!wQUg|Pa%jNtFj*TLrM-Q>miaz{tZ=VMWfJ)^)@ni
z(|JQlq0Uf7M3Kbv4~(D5`kYz+&Ek8_t{<6*t{d7oK<NOb!y<({#Fiu-YDApy#R3B*
zy=7fh?RBm75Tv961bdJ|%6il%N~4fJB3@_(9j4HV{g&fd=Q9))yyg~77CBVf{J{8`
ztk0SC-z>h@?0UsdG#DLbCZiwoBcZ`@4`nF|28;VbNMKESSSQ%j&Pdpe6zZBRk1jDS
zQD{Yb!}2QW-Qxt66zZC+7(Ot53krBFT%R-Rzgc{*+4UpiG0F8P{N{BybtF^_CrIi%
zRMcPEQ{GjD4u@MCL<S*9Sr4p<NTG~|L-0!}w4w+~fdSEDkwTr}<p;(uO#zSO^*OWt
zo5lB<T~GRU;ipTWK9ez|P|XPqJ*F&0{%|20tjc;&RDq%tieQ@}Xh`8ImH!}KXay@m
zp%vybx9tW&84E>|uRbt-M*Qj$>vLxPH;eBzyPhO}!cVs$$)n(ckx-ST2&^jZ3nGD)
z=&&TTPY_gk+#_Y8;FnTp1s#6bAgD&dxD-A>exLC4w&ePpS^v%Ad(Ezg-{i0m6dR&O
z!MGjPrO=pF@0#?kk)-0H!SS*lZDL0nnnjB#N&!L9<6ihB6<Q(1yDGGjOi=0Z(+`ZF
z$@-jG|IOli&8~;vgyN@baOx;14CA&0em+%}qJ**(B7xOX+z5h=Nd&Pb+M+@$=rDy=
zm<=i|Mh0ahyzv0}Wn7;#>%Uoiui5p&j~UFo1~1fMWlK6m{@^!Ms5CfU)&m`0L=b){
zg;tP2DVC%Pttf&rpSkIL=7I9dus&zje_8*_6yIxhJ^XYFJg>pV(}-b5<o})VvJ~;M
z9&rh*L5C?yk&r=b3H%lnT2VTjO7PhS$S>pioLT?%_0y%}d(Ey-;%C^9#X1Z@;it+{
z6j7EUsK!9CB9!&e39@S|<6xWOmr`g&%3LTN)(O6VEk*_(D8CHrb7uWF)}K?#`zFQr
znq3b+-2#gaZ;IQH#X7uAm8Hm6WhqMZ!ZM73l=X<~uso3{f{NdQLMzN;UMDtRMnTGi
z*c8DB$}hwEoLT?p*Po^Qa-zI%BEHw`dc{w-z{V^vZif?3BTA1IKYx4#OqM{=VHpKw
zE(1Yri9#!J9ZnE@VEkNWJ<NRGe1|>r^^g2nKkjYIcwJ;O_xJjh#TtDkK0jjP-E@;F
zQv;9sxr(^cz<u`PrzCsawmcEO&rg0xE~{dnEkH(=`+WUM`+srYisAqFd-|$kue^k<
z^!rr8f7bpN8}O&tuKPVDSdF(v5DD~Zj{A|If6;!<g#Qkkx{dd9`J;bLHU2_n_)_>a
z54pYW`Sq`wXz)i^k@a|V{T{xAUpY1;fHOzVy%YR5?Vq&&!Ir~II2Mc`%D;Cn;a7pD
zK`}0kUV=qz_eS8i+OM_0W7Fo#xF7AoF(`Czmp}UN7Jijza0wFVLx-LBM)0@VZ?vCj
zKi0m11fIrqrVCO2y?RyS#Z!p}2XSUAK!@G;M)2RYf6@Lw?XTIikwB8e$`}8k54wAh
zH`w48LIMl1B3?#8NA?rqf7kw1`=$0j+4lMaT;R@2L=1k@4}I9@gCi*XWCScj@k9cB
zxG>)X!GG8OP5Vbiz`q~`_B(Mm^dVfg%sp2hTo=C3mge~du_1+6kvv907nV4ybp_$~
zf3^Ru{T16@UqJ$&#GiZv4wyLdf1%y_oqf2@Pewq|U|j;eY81?h;D2bp)&7||^nXW#
zzd@kFO{~ZsB8wzY!PI&1ch<>ee2|N7gwW3_4q>(M7*!!@J$Q{r7M+v#n!Kj$1#U|L
zx?j#`RH{!&`JAJcu+8R1u7iva@($xY#=A_~e+W)nAy^v!_Chh0WUC4UX=Tu_UDUTM
z_}$fpEPj{O%j~C=`y>=oD1C~9VYOzrFeVf*J_yD6_Xx{J<CRPQN)q2*q`;5^@fDX$
zuN9m%dY63Oq<?=!@K=u(a^$-4mG<!|#fqa)Qk?#l6x1`kqeK=cY*6&`o+9;KLVT)u
zfQiZH|ISY5r0X3Nd(hu9I+wS^d6O^ra~$q2u<uc254b#T@)k?hSe;NQ_apQxEX$h^
zl;FA|hE-k;{>#I>$h#h;siKp)o%F3Z>F$>^?lmEMl4mFVc@uvBr2T|vw;Ak3ovqaA
zhoQq`%~hXNz^GUJYwhnTNOYTbOt8jP4YFK$$hIhoUhdhW^m3p-#+@%g&yWwSMlJvA
z>woFaoAmED+W*02`+00`mlm;>JBpmX9Jkx)L7Q{=EdRf1zu@WaE6faMh&LWkH>(i5
zPFXLs-rt&;$yfS3PurOr%Uu`Q5bY^JB$C(v7N0lyHuCiGV;qhz5x&`_HCU_crOtpW
zPcgK6EV(i}{GSkhTZOPru~IF85#@ZmEce{wVui_f`JJ4$GdC6~^v4;>(?|09-@@}I
zUrN@menQE=yV?_Ehv?E8tu?kXq%hy*g`vwM>ob3^{W(_TRqZk|xRnsaK0;?|5&gnP
zOnu`hY$%UgNnt(=g<tCWU+Q@ie!}OcIE{V8%z2i;n=Y-%QfsR~3JYES9B+=SFZ=`R
z3*SRV@9=aoqXOUt31e;Gqe-vwD87VHjvAkQir>QZzvS~K{nOX4{wq(vuOnXvl$B^^
z&Rk^=If|VHE`5FBZ*U9v7SG4Gu+=k!$*x0>dk7*7o3#GWp0+V^=b*<@Q77H{-~4%#
zE{J^nf~U9d6ZUhP0Ly8f-8!{qOT=1jFL#tW<@xxZ(BQvB0>4U_>`g-8Bt&cjq0|FN
z;UoJJ+vh@$<ryR0`k!&$q=O<~zta9z`+tzHw+K-+g0NdG^^AlScK!MIXAu4oGWr35
z+Sj2x!+z5y@>dL*wEj?^wlWcOCipF0|I^Q#bYWh;B=GEIba<Bi^igE6#nND{wF$pp
zv%>l}Sn2N(tSIH?u91#oh8fQkp}>#yO9+YlEw%n9=S^Po@+IppZ!^N3X9a7D=Z#LS
z6@qg9H_UN<s{J=sX#Plo`?MF>D>(zfY4Q*~;xB6wzjW(=rfYz@eEkD+yg%nT<6~wi
z*IB{Zk9NvTuhsG&nv9VD$SnIOtg(I%#_#i<=O`4T6MV$^NnJ(Vf-GbG?>AVGzr#ZO
z06On7OSwWalxZwP6oSI<U$G!RXN>$yBv6uLNU-}&X56Qd!N;%11RfK3OyDtr#{?b|
zcue3ifyV?M6L?JEF@eVf9us&>;4y*61RfK3OyG+mu!_i*AF+D=E4&>4g_X>IV?F)1
ztZC}Ue`r(oyLkBgoE6UBEI9igtbP86>DT@ayBS|4Wr(zquKlByPgyzr6?-81XB(fZ
zw{G6P=@<Ci-w=S{w}=t?U-a--aQZc*^qvaK0{4*RE<RL0VTJvdDQAWAZ%x15HPS{u
z#oqY$*kMc8e%LmH@4;WOgZT?QIewkQSig&=AMg;o!O4!;^Z3XA0E+*EmG+;pANEW3
zB>pclX({l#df3(eGCnFlhT=~-`wPYSpA~7dum4r(wZiauq%d9kHg`3v^l!2^@?-W$
z{vKZhxl4N6au*+smzCX|Q|wH9#17b>(v!cYFMo&E_s=!0)ang6y6NNdNYQuD_5X?_
z{VhHH8GZeQY4`p!R~^KYW$jY!7uqd7w0;{Otk2kE`fGMFrEUEV{*!6@56c@Y?B(o6
zlV3v8zCpCh57`a-D@ft{63BH|*}K?}y@VIgN9=NcpWX1k<eV@zdx_uu^U$*`)qYU(
zl(3sQg9qE&X!7^)<dC-i21)-va{nHgmjy@N_3Un6h2gts^B>ZeA8PncmfCUyt~y&6
zzEMxGFZMdSkl#X9eu%XErD^ZP^Zu(U{@0qm{W7h<iSOwI1h27k`5|q84=MjK`@ug!
z&L6E;Et0T@eZ%YQAije}eM7@{Bc%B}?owyO+QqKzEFJ{6@x=I;bN`7E&FqyN|7*c3
zZBhFbcp2J?kj{+|yv!K#4!rey_ZL~*AG6Q@U7xSSYiBcYYUk0Xm*J$m*8+Br-&O8t
zfZ#fOsOIovc^PdJhGwtS_+JaRV?q1ko#_!bn*qEMX7FIUg%{BWcyj!aF8^j9@X(zP
z;iET!6rP|jH-(=_NUq0S<gBtcLU4k;?~_Q%Ewt@j({H~x{&zw965B~2O%a~EF-C|P
zZ2e8Vh~80m=R2nDKa6iEWyUc|Uk;(kieIVbcX)g`rOp~#lcf_6w4Lbjd9?hF>DNye
z|GS`lER^QR<u-NVd%7Kh=V<vw{9xWklON4jG14%Ehs7T8-;fb7V9j;A1FkYht*se?
zYnU+#-7BWuOCSH6)V|0!feUu#I1xN|$GJB%j1brGB6<}md{ke0Z^m?fW%SVN^yK9f
zIV<d8MR1hSYCk$|_F5|bH_^WEE5YB|;$^DTOjW9Ncu&sY!FCldqL<B@e~8Z)5F^<o
z{KWscL~1R#-TAIkM!|@+1sNQ{8c&;k|E1%9<L!%gYmjIehZ6~`#p5)F-0z0q8Kj%>
z_Yr<cJ-ax+Qas6XogQyap)2I5w%03y(o?fnrug4@`)Yj9+!ll|gx7cvp1YeEIfURP
zv*sW2^B6|mdoqY{<k>xLw?C)YS?;K@HCS7b!Kmr?&m8}&+7}DV_@G&H_>gM6#s_#_
zk(#hGV!s~sm(EY}Z}=^_E+nu39j>s~#tE8rPxn8~5dSOfOMHn>bJ+MNh|hfowrd>U
zQz7`Mzm~f`w*umOmvsx5yFRx<;(PC#;d0mKR!n^F{SsW}`rHbN@4Y{Q%UqvZQSrU^
zLvWevb1N*q_udIEbA4{b#rNJT!DX(`t-$!+dn35a^|=)p-+M0vW%Xbg>vJnKzBhY<
z%UqwEkH1`D@x9p+T;}@ReEj8#i|@^j;4;_e^!Upa8sD1@!T9?9{jSdmzZD$cYfe!3
zEn|Id#mD!W5nSf_-0yvS?=lN`EVF!w+1-8k&o8rp$1=-@Jp6pjzTq+pcr3Ggh}pM$
z_|Gr1fX6b+hdlgz%)a3=3wSKEe2Ce%d-%^Uvw(;G_;TOnLmqxUX5TQ`fzRx{Y0k^^
zaT(=9(zN)9{VCOf@1DQBF@3zx@*(&9c@NzMiyipx=_?Aq)bxS(borzI=5!8x_s}ch
zw=jL+JzW0izcsxBU)KE+e)0f)pXEcc{+x&QjY~Q3W%-4dN*|cz<&XZ`mUiIFs&59r
z#pwgHy8O|9Qzi$#EP9tIeP9-sKl*RU?7(N*zj=P?(g&Je{^-BK)PXN^7j=I3T|Ojp
z;}64GvjAYT0A7<m*ry1Z%Md_pIZDvmN)7-v3*a^3=Ps}eL2$_cVp|v|)0G?mY!<+4
z!mq&M^5YV|WB{?-hRgFx4gfX{;LQxMRP$OLAqs#j5kOq1S?us#xdFhY0lb;{iSvg$
z*Iq>^N!kEn+zT}xPF8LJuxS8qW`1h@+2#t^Bp4@605R@`QkiAt1^_3o|D_G!&CE}Q
z4{L6Zv(y@a;GzIx<z6V^mn$^@ICcFmO#p9Ze!BaX$L{hw%B>9$To6F4+zU08XMbe|
z054wuTM)o&#!rP0iv$+gtE`O>ObH;4yBDt90N`}%e<=aH=KKgB);x|JpR?3fV`)ND
zlLCn2?u8oFMpkA3aQgMXqyXN`{0#S_Tr1Uj0**3Ut)*E7O2-3;<L-qTl?Ya50Ps@l
ze~AEIbAGz}k;j>nhYnZR!ir!#fH>h^s1ZT95(9vjUjK^+@S5=x4JCY7%e5ha`OZ>C
z!HA^=8B_tp3HL&o#jL~t;7sd(Du6c|eky!eLjt|7LPyA6ZLL=X2_QD~o63VLGXOaA
z`k!eDT7CaWg%4{UyW5@PcNRO!Z8g>gODi%MO>!^PsJOK90)Usj{%7pJp}`i93kfVh
zhbwHgIzizZ5C4_=p(`%{IJ0^_KKq!!V*-x}JSOm%z+(cB2|On7n80HKj|n^`@R-13
z0*?tiCh(ZRV*(E`0T(+HA+`>yIK<a+;bEV07Zg}Z*~6&7k6zxD`aZL+jgkww4+o((
zvg<BvTYk43wT0cbLSmH4)F&99VSI{N`(ONdD+Ej9-(Kfqe0S-ecGoiG@_VfT-V;(3
zh2wlL&AxvtIbj=N*u?+ySIv{e_isk4w%9%R!4xZsa;NmqkoP|tuUx#ON)q2*_!xb~
zcl55-;|RHPb8SA}RjfFcK~nFhS^xT==T~<pn7;j_1c5euvHW~Sk@_wnt|~KY{-HUa
z!`8!MI+YqFB>n{N$8LL-E8zBc?74{^mck=s`u%&5y)w2Y^OkCVQG!4R&M<i-G!~A<
zrtLpGZ*a1KEqA{->F$?n6odH*B|&DiE?b?m)a8evBiAbLEMT+Dynk(IZUsDKdpB+S
zX%hrmG@<B)`J?o5pg+c)FF{Y0eEo(pCZACPWslZnjX27kML9k<&Oo`gT;6NCeTmhm
zr5!!~H*NbF5(HYI_-MWCY{AN17ugc+DbXlF^h=7Fd`K$mtq^RmS0ROgoIGKOHe1a4
zmm2?@ru`L`AkeP4loaavS)@_WOf>jASdl02e(bU~+G-qS&LUTSju(ce=}C<JT_gT?
zQTr<{L7<J1N*{aj#kW@a_frb$d_*$(S-c;+EKRmrdj(QhnB&j!n#<q9_}>NXufPO>
z(!(4a^2BDU^i@Bj=++-n_~<glXC|~xOS3I(uX2Q(#V!D6e(C-jlD<5jk->$Ffw&nY
zwZ9@01lkz6bI@bNOXTAh6oL9a1z~Pem}466M+jOY_G+w2X=ZwQ`uxT5zbWm@2=Vun
zI$BW)0>$>Z&|~<C2lU@kvg})wL%Kofo}H9W>9n+1>uojm3Uhv9LCUegMT`b{wCGea
zm}p=3ulX5wXoV#RwBi!#6n=($e1-(Rj1JFId}dS;Y_Qd4%WrY~Z@his_iM`K{S8+B
zdz6iOhtfUQD4#OJjA@E80*~-ZdPs@T-%?Id<l~2w+j^U_QRgY$GllnKr==B-dXtwt
z>52zd8S_bzKp*334zg|Kx<vacHbEf0)aBzJC^q=#6ny)bqD0pz%dwxiOiT!x;8SZZ
zI=^)Bzp8y1VgHd?`cEi%_&x5?`+UZ86rVYR44N0XG5Z2v^m*a;8!X7*F~a|V-n~oF
ztt%8hnx^<n6c0#~mo9%9;(u3og23gj&#i#?UK0s1IlavFxfK!LYl0}_>1D3Zt&sTM
zjEFLxT;}@Riiz*d2r0wKWv<Vyp!nVlkTRTF=K9<Ui|@?<YKc?JT%TKU@x4o6TJq1!
zT%TKk@x4o;N_%3N>vJnIzBetXCH}a~^|=)q-@62?G=H97f4Q&qIeGS_yzk13?@fa$
z?Vsk?U+!ytPWY+vzAG=jH!ZG3f7I7s?t6W1MaTCpf|ceE`ufW+*+cqotN{KID_Cz(
xTKj5zb!khQD^mWn!sB~W`kMBaWft&Q@$tQBp(Xusnd@`L{x9WyS7dze{|`Wm#zg=C

literal 0
HcmV?d00001

diff --git a/Ryujinx.Graphics.Vulkan/Effects/Textures/SmaaSearchTexture.bin b/Ryujinx.Graphics.Vulkan/Effects/Textures/SmaaSearchTexture.bin
new file mode 100644
index 0000000000000000000000000000000000000000..db5bf73f7d5a0b5e436d336849c90bfbc24d76dc
GIT binary patch
literal 1024
zcmezOkD<Pvf#Dy7Vt@dk2uKi2{R1+90K~@zpc={6kIhU{#3;3&QvIa3l@@A|qY7?5
evLH0#aK#_8QgZae^^nP+)P73!lj-bXqYVIqI9W{q

literal 0
HcmV?d00001

diff --git a/Ryujinx.Graphics.Vulkan/NativeArray.cs b/Ryujinx.Graphics.Vulkan/NativeArray.cs
index f74074390a..3a8512874b 100644
--- a/Ryujinx.Graphics.Vulkan/NativeArray.cs
+++ b/Ryujinx.Graphics.Vulkan/NativeArray.cs
@@ -38,8 +38,11 @@ namespace Ryujinx.Graphics.Vulkan
 
         public void Dispose()
         {
-            Marshal.FreeHGlobal((IntPtr)Pointer);
-            Pointer = null;
+            if (Pointer != null) // Skip the free when the pointer has already been released, so a repeated Dispose call does nothing.
+            {
+                Marshal.FreeHGlobal((IntPtr)Pointer);
+                Pointer = null; // Cleared so the guard above treats the memory as released.
+            }
         }
     }
 }
diff --git a/Ryujinx.Graphics.Vulkan/PipelineBase.cs b/Ryujinx.Graphics.Vulkan/PipelineBase.cs
index f779305dbf..583bb9539a 100644
--- a/Ryujinx.Graphics.Vulkan/PipelineBase.cs
+++ b/Ryujinx.Graphics.Vulkan/PipelineBase.cs
@@ -150,6 +150,28 @@ namespace Ryujinx.Graphics.Vulkan
                 null);
         }
 
+        public void ComputeBarrier() // Records a global memory barrier into CommandBuffer, with the compute shader stage as the source.
+        {
+            MemoryBarrier memoryBarrier = new MemoryBarrier()
+            {
+                SType = StructureType.MemoryBarrier,
+                SrcAccessMask = AccessFlags.MemoryReadBit | AccessFlags.MemoryWriteBit, // Source scope: any memory read or write.
+                DstAccessMask = AccessFlags.MemoryReadBit | AccessFlags.MemoryWriteBit // Destination scope: any memory read or write.
+            };
+
+            Gd.Api.CmdPipelineBarrier(
+                CommandBuffer,
+                PipelineStageFlags.ComputeShaderBit, // Source stage mask.
+                PipelineStageFlags.AllCommandsBit, // Destination stage mask.
+                0, // Dependency flags: none.
+                1, // Memory barrier count: the single barrier built above.
+                new ReadOnlySpan<MemoryBarrier>(memoryBarrier),
+                0, // Buffer memory barrier count: none.
+                ReadOnlySpan<BufferMemoryBarrier>.Empty,
+                0, // Image memory barrier count: none.
+                ReadOnlySpan<ImageMemoryBarrier>.Empty);
+        }
+
         public void BeginTransformFeedback(GAL.PrimitiveTopology topology)
         {
             _tfEnabled = true;
@@ -803,6 +825,11 @@ namespace Ryujinx.Graphics.Vulkan
             _descriptorSetUpdater.SetImage(binding, image, imageFormat);
         }
 
+        public void SetImage(int binding, Auto<DisposableImageView> image) // Overload that takes an image view directly, without an image format argument.
+        {
+            _descriptorSetUpdater.SetImage(binding, image); // Forwarded unchanged to the descriptor set updater.
+        }
+
         public void SetIndexBuffer(BufferRange buffer, GAL.IndexType type)
         {
             if (buffer.Handle != BufferHandle.Null)
diff --git a/Ryujinx.Graphics.Vulkan/Ryujinx.Graphics.Vulkan.csproj b/Ryujinx.Graphics.Vulkan/Ryujinx.Graphics.Vulkan.csproj
index 87f14a6ab8..57e2240a73 100644
--- a/Ryujinx.Graphics.Vulkan/Ryujinx.Graphics.Vulkan.csproj
+++ b/Ryujinx.Graphics.Vulkan/Ryujinx.Graphics.Vulkan.csproj
@@ -12,6 +12,17 @@
     <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
   </PropertyGroup>
 
+  <ItemGroup>
+    <EmbeddedResource Include="Effects\Textures\SmaaAreaTexture.bin" />
+    <EmbeddedResource Include="Effects\Textures\SmaaSearchTexture.bin" />
+    <EmbeddedResource Include="Effects\Shaders\FsrScaling.spv" />
+    <EmbeddedResource Include="Effects\Shaders\FsrSharpening.spv" />
+    <EmbeddedResource Include="Effects\Shaders\Fxaa.spv" />
+    <EmbeddedResource Include="Effects\Shaders\SmaaBlend.spv" />
+    <EmbeddedResource Include="Effects\Shaders\SmaaEdge.spv" />
+    <EmbeddedResource Include="Effects\Shaders\SmaaNeighbour.spv" />
+  </ItemGroup>
+
   <ItemGroup>
     <PackageReference Include="OpenTK.Windowing.GraphicsLibraryFramework" />
     <PackageReference Include="shaderc.net" />
diff --git a/Ryujinx.Graphics.Vulkan/Window.cs b/Ryujinx.Graphics.Vulkan/Window.cs
index a90a824dfe..5d6def3a98 100644
--- a/Ryujinx.Graphics.Vulkan/Window.cs
+++ b/Ryujinx.Graphics.Vulkan/Window.cs
@@ -1,4 +1,5 @@
 using Ryujinx.Graphics.GAL;
+using Ryujinx.Graphics.Vulkan.Effects;
 using Silk.NET.Vulkan;
 using Silk.NET.Vulkan.Extensions.KHR;
 using System;
@@ -29,6 +30,14 @@ namespace Ryujinx.Graphics.Vulkan
         private bool _vsyncEnabled;
         private bool _vsyncModeChanged;
         private VkFormat _format;
+        private AntiAliasing _currentAntiAliasing;
+        private bool _updateEffect;
+        private IPostProcessingEffect _effect;
+        private IScalingFilter _scalingFilter;
+        private bool _isLinear;
+        private float _scalingFilterLevel;
+        private bool _updateScalingFilter;
+        private ScalingFilter _currentScalingFilter;
 
         public unsafe Window(VulkanRenderer gd, SurfaceKHR surface, PhysicalDevice physicalDevice, Device device)
         {
@@ -116,7 +125,7 @@ namespace Ryujinx.Graphics.Vulkan
                 ImageFormat = surfaceFormat.Format,
                 ImageColorSpace = surfaceFormat.ColorSpace,
                 ImageExtent = extent,
-                ImageUsage = ImageUsageFlags.ColorAttachmentBit | ImageUsageFlags.TransferDstBit,
+                ImageUsage = ImageUsageFlags.ColorAttachmentBit | ImageUsageFlags.TransferDstBit | ImageUsageFlags.StorageBit,
                 ImageSharingMode = SharingMode.Exclusive,
                 ImageArrayLayers = 1,
                 PreTransform = capabilities.CurrentTransform,
@@ -280,6 +289,13 @@ namespace Ryujinx.Graphics.Vulkan
 
             var view = (TextureView)texture;
 
+            UpdateEffect();
+
+            if (_effect != null)
+            {
+                view = _effect.Run(view, cbs, _width, _height);
+            }
+
             int srcX0, srcX1, srcY0, srcY1;
             float scale = view.ScaleFactor;
 
@@ -315,6 +331,18 @@ namespace Ryujinx.Graphics.Vulkan
 
             if (ScreenCaptureRequested)
             {
+                if (_effect != null)
+                {
+                    _gd.CommandBufferPool.Return(
+                        cbs,
+                        null,
+                        stackalloc[] { PipelineStageFlags.ColorAttachmentOutputBit },
+                        null);
+                    _gd.FlushAllCommands();
+                    cbs.GetFence().Wait();
+                    cbs = _gd.CommandBufferPool.Rent();
+                }
+
                 CaptureFrame(view, srcX0, srcY0, srcX1 - srcX0, srcY1 - srcY0, view.Info.Format.IsBgr(), crop.FlipX, crop.FlipY);
 
                 ScreenCaptureRequested = false;
@@ -335,20 +363,36 @@ namespace Ryujinx.Graphics.Vulkan
             int dstY0 = crop.FlipY ? dstPaddingY : _height - dstPaddingY;
             int dstY1 = crop.FlipY ? _height - dstPaddingY : dstPaddingY;
 
-            _gd.HelperShader.BlitColor(
-                _gd,
-                cbs,
-                view,
-                _swapchainImageViews[nextImage],
-                _width,
-                _height,
-                1,
-                _format,
-                false,
-                new Extents2D(srcX0, srcY0, srcX1, srcY1),
-                new Extents2D(dstX0, dstY1, dstX1, dstY0),
-                true,
-                true);
+            if (_scalingFilter != null)
+            {
+                _scalingFilter.Run(
+                    view,
+                    cbs,
+                    _swapchainImageViews[nextImage],
+                    _format,
+                    _width,
+                    _height,
+                    new Extents2D(srcX0, srcY0, srcX1, srcY1),
+                    new Extents2D(dstX0, dstY0, dstX1, dstY1)
+                    );
+            }
+            else
+            {
+                _gd.HelperShader.BlitColor(
+                    _gd,
+                    cbs,
+                    view,
+                    _swapchainImageViews[nextImage],
+                    _width,
+                    _height,
+                    1,
+                    _format,
+                    false,
+                    new Extents2D(srcX0, srcY0, srcX1, srcY1),
+                    new Extents2D(dstX0, dstY1, dstX1, dstY0),
+                    _isLinear,
+                    true);
+            }
 
             Transition(
                 cbs.CommandBuffer,
@@ -387,6 +431,95 @@ namespace Ryujinx.Graphics.Vulkan
             }
         }
 
+        public override void SetAntiAliasing(AntiAliasing effect) // Only records the requested mode; the effect object itself is (re)built in UpdateEffect.
+        {
+            if (_currentAntiAliasing == effect && _effect != null) // Same mode and an effect instance already exists: nothing to change.
+            {
+                return;
+            }
+
+            _currentAntiAliasing = effect;
+
+            _updateEffect = true; // Read and cleared by UpdateEffect.
+        }
+
+        public override void SetScalingFilter(ScalingFilter type) // Only records the requested filter; the filter object itself is (re)built in UpdateEffect.
+        {
+            if (_currentScalingFilter == type && _scalingFilter != null) // Skip only when a filter instance of this type already exists; the anti-aliasing _effect is unrelated to this decision.
+            {
+                return;
+            }
+
+            _currentScalingFilter = type;
+
+            _updateScalingFilter = true; // Read and cleared by UpdateEffect.
+        }
+
+        private void UpdateEffect() // Applies pending anti-aliasing and scaling filter changes flagged by the Set* methods.
+        {
+            if (_updateEffect)
+            {
+                _updateEffect = false;
+
+                switch (_currentAntiAliasing)
+                {
+                    case AntiAliasing.Fxaa:
+                        _effect?.Dispose(); // Release whatever effect was active before replacing it.
+                        _effect = new FxaaPostProcessingEffect(_gd, _device);
+                        break;
+                    case AntiAliasing.None:
+                        _effect?.Dispose();
+                        _effect = null; // A null effect means no post-processing pass is run.
+                        break;
+                    case AntiAliasing.SmaaLow:
+                    case AntiAliasing.SmaaMedium:
+                    case AntiAliasing.SmaaHigh:
+                    case AntiAliasing.SmaaUltra:
+                        var quality = _currentAntiAliasing - AntiAliasing.SmaaLow; // Offset from SmaaLow; assumes the Smaa* enum members are consecutive - TODO confirm.
+                        if (_effect is SmaaPostProcessingEffect smaa)
+                        {
+                            smaa.Quality = quality; // Reuse the existing SMAA effect and only change its quality.
+                        }
+                        else
+                        {
+                            _effect?.Dispose();
+                            _effect = new SmaaPostProcessingEffect(_gd, _device, quality);
+                        }
+                        break;
+                }
+            }
+
+            if (_updateScalingFilter)
+            {
+                _updateScalingFilter = false;
+
+                switch (_currentScalingFilter)
+                {
+                    case ScalingFilter.Bilinear:
+                    case ScalingFilter.Nearest:
+                        _scalingFilter?.Dispose();
+                        _scalingFilter = null; // No filter object for these modes; only the _isLinear flag below distinguishes them.
+                        _isLinear = _currentScalingFilter == ScalingFilter.Bilinear;
+                        break;
+                    case ScalingFilter.Fsr:
+                        if (_scalingFilter is not FsrScalingFilter) // Create the FSR filter only if one is not already in place.
+                        {
+                            _scalingFilter?.Dispose();
+                            _scalingFilter = new FsrScalingFilter(_gd, _device);
+                        }
+
+                        _scalingFilter.Level = _scalingFilterLevel; // Always refreshed, so a level-only change takes effect here.
+                        break;
+                }
+            }
+        }
+
+        public override void SetScalingFilterLevel(float level) // Stores the level; UpdateEffect copies it onto the FSR filter.
+        {
+            _scalingFilterLevel = level;
+            _updateScalingFilter = true; // Makes UpdateEffect re-run its scaling filter switch on its next call.
+        }
+
         private unsafe void Transition(
             CommandBuffer commandBuffer,
             Image image,
@@ -456,8 +589,10 @@ namespace Ryujinx.Graphics.Vulkan
                     }
 
                     _gd.SwapchainApi.DestroySwapchain(_device, _swapchain, null);
-
                 }
+
+                _effect?.Dispose();
+                _scalingFilter?.Dispose();
             }
         }
 
diff --git a/Ryujinx.Graphics.Vulkan/WindowBase.cs b/Ryujinx.Graphics.Vulkan/WindowBase.cs
index 651fe7c162..0a365e8fb6 100644
--- a/Ryujinx.Graphics.Vulkan/WindowBase.cs
+++ b/Ryujinx.Graphics.Vulkan/WindowBase.cs
@@ -11,5 +11,8 @@ namespace Ryujinx.Graphics.Vulkan
         public abstract void Present(ITexture texture, ImageCrop crop, Action swapBuffersCallback);
         public abstract void SetSize(int width, int height);
         public abstract void ChangeVSyncMode(bool vsyncEnabled);
+        public abstract void SetAntiAliasing(AntiAliasing effect);
+        public abstract void SetScalingFilter(ScalingFilter scalerType);
+        public abstract void SetScalingFilterLevel(float scale);
     }
 }
\ No newline at end of file
diff --git a/Ryujinx.Ui.Common/Configuration/ConfigurationFileFormat.cs b/Ryujinx.Ui.Common/Configuration/ConfigurationFileFormat.cs
index 226b5933b5..e9aec04b2f 100644
--- a/Ryujinx.Ui.Common/Configuration/ConfigurationFileFormat.cs
+++ b/Ryujinx.Ui.Common/Configuration/ConfigurationFileFormat.cs
@@ -14,7 +14,7 @@ namespace Ryujinx.Ui.Common.Configuration
         /// <summary>
         /// The current version of the file format
         /// </summary>
-        public const int CurrentVersion = 43;
+        public const int CurrentVersion = 44;
 
         /// <summary>
         /// Version of the configuration file format
@@ -51,6 +51,21 @@ namespace Ryujinx.Ui.Common.Configuration
         /// </summary>
         public AspectRatio AspectRatio { get; set; }
 
+        /// <summary>
+        /// Applies anti-aliasing to the renderer.
+        /// </summary>
+        public AntiAliasing AntiAliasing { get; set; }
+
+        /// <summary>
+        /// Sets the framebuffer upscaling type.
+        /// </summary>
+        public ScalingFilter ScalingFilter { get; set; }
+
+        /// <summary>
+        /// Sets the framebuffer upscaling level.
+        /// </summary>
+        public int ScalingFilterLevel { get; set; }
+
         /// <summary>
         /// Dumps shaders in this local directory
         /// </summary>
diff --git a/Ryujinx.Ui.Common/Configuration/ConfigurationState.cs b/Ryujinx.Ui.Common/Configuration/ConfigurationState.cs
index f193b1570f..bcdd2e70a6 100644
--- a/Ryujinx.Ui.Common/Configuration/ConfigurationState.cs
+++ b/Ryujinx.Ui.Common/Configuration/ConfigurationState.cs
@@ -433,6 +433,21 @@ namespace Ryujinx.Ui.Common.Configuration
             /// </summary>
             public ReactiveObject<GraphicsBackend> GraphicsBackend { get; private set; }
 
+            /// <summary>
+            /// Applies anti-aliasing to the renderer.
+            /// </summary>
+            public ReactiveObject<AntiAliasing> AntiAliasing { get; private set; }
+
+            /// <summary>
+            /// Sets the framebuffer upscaling type.
+            /// </summary>
+            public ReactiveObject<ScalingFilter> ScalingFilter { get; private set; }
+
+            /// <summary>
+            /// Sets the framebuffer upscaling level.
+            /// </summary>
+            public ReactiveObject<int> ScalingFilterLevel { get; private set; }
+
             /// <summary>
             /// Preferred GPU
             /// </summary>
@@ -463,6 +478,12 @@ namespace Ryujinx.Ui.Common.Configuration
                 PreferredGpu.Event               += static (sender, e) => LogValueChange(sender, e, nameof(PreferredGpu));
                 EnableMacroHLE                   = new ReactiveObject<bool>();
                 EnableMacroHLE.Event             += static (sender, e) => LogValueChange(sender, e, nameof(EnableMacroHLE));
+                AntiAliasing                     = new ReactiveObject<AntiAliasing>();
+                AntiAliasing.Event               += static (sender, e) => LogValueChange(sender, e, nameof(AntiAliasing));
+                ScalingFilter                    = new ReactiveObject<ScalingFilter>();
+                ScalingFilter.Event              += static (sender, e) => LogValueChange(sender, e, nameof(ScalingFilter));
+                ScalingFilterLevel               = new ReactiveObject<int>();
+                ScalingFilterLevel.Event         += static (sender, e) => LogValueChange(sender, e, nameof(ScalingFilterLevel));
             }
         }
 
@@ -540,6 +561,9 @@ namespace Ryujinx.Ui.Common.Configuration
                 ResScaleCustom             = Graphics.ResScaleCustom,
                 MaxAnisotropy              = Graphics.MaxAnisotropy,
                 AspectRatio                = Graphics.AspectRatio,
+                AntiAliasing               = Graphics.AntiAliasing,
+                ScalingFilter              = Graphics.ScalingFilter,
+                ScalingFilterLevel         = Graphics.ScalingFilterLevel,
                 GraphicsShadersDumpPath    = Graphics.ShadersDumpPath,
                 LoggingEnableDebug         = Logger.EnableDebug,
                 LoggingEnableStub          = Logger.EnableStub,
@@ -651,6 +675,9 @@ namespace Ryujinx.Ui.Common.Configuration
             Graphics.EnableShaderCache.Value          = true;
             Graphics.EnableTextureRecompression.Value = false;
             Graphics.EnableMacroHLE.Value             = true;
+            Graphics.AntiAliasing.Value               = AntiAliasing.None;
+            Graphics.ScalingFilter.Value              = ScalingFilter.Bilinear;
+            Graphics.ScalingFilterLevel.Value         = 80;
             System.EnablePtc.Value                    = true;
             System.EnableInternetAccess.Value         = false;
             System.EnableFsIntegrityChecks.Value      = true;
@@ -1208,6 +1235,17 @@ namespace Ryujinx.Ui.Common.Configuration
                 configurationFileFormat.UseHypervisor = true;
             }
 
+            if (configurationFileFormat.Version < 44) // Files older than version 44 lack the anti-aliasing and scaling filter settings.
+            {
+                Ryujinx.Common.Logging.Logger.Warning?.Print(LogClass.Application, $"Outdated configuration version {configurationFileFormat.Version}, migrating to version 44.");
+
+                configurationFileFormat.AntiAliasing = AntiAliasing.None;
+                configurationFileFormat.ScalingFilter = ScalingFilter.Bilinear;
+                configurationFileFormat.ScalingFilterLevel = 80;
+
+                configurationFileUpdated = true;
+            }
+
             Logger.EnableFileLog.Value                = configurationFileFormat.EnableFileLog;
             Graphics.ResScale.Value                   = configurationFileFormat.ResScale;
             Graphics.ResScaleCustom.Value             = configurationFileFormat.ResScaleCustom;
@@ -1217,6 +1255,9 @@ namespace Ryujinx.Ui.Common.Configuration
             Graphics.BackendThreading.Value           = configurationFileFormat.BackendThreading;
             Graphics.GraphicsBackend.Value            = configurationFileFormat.GraphicsBackend;
             Graphics.PreferredGpu.Value               = configurationFileFormat.PreferredGpu;
+            Graphics.AntiAliasing.Value               = configurationFileFormat.AntiAliasing;
+            Graphics.ScalingFilter.Value              = configurationFileFormat.ScalingFilter;
+            Graphics.ScalingFilterLevel.Value         = configurationFileFormat.ScalingFilterLevel;
             Logger.EnableDebug.Value                  = configurationFileFormat.LoggingEnableDebug;
             Logger.EnableStub.Value                   = configurationFileFormat.LoggingEnableStub;
             Logger.EnableInfo.Value                   = configurationFileFormat.LoggingEnableInfo;
diff --git a/Ryujinx/Ui/RendererWidgetBase.cs b/Ryujinx/Ui/RendererWidgetBase.cs
index 4bf2a70ff3..957bbcd55a 100644
--- a/Ryujinx/Ui/RendererWidgetBase.cs
+++ b/Ryujinx/Ui/RendererWidgetBase.cs
@@ -27,6 +27,7 @@ namespace Ryujinx.Ui
     using Image = SixLabors.ImageSharp.Image;
     using Key = Input.Key;
     using Switch = HLE.Switch;
+    using ScalingFilter = Graphics.GAL.ScalingFilter;
 
     public abstract class RendererWidgetBase : DrawingArea
     {
@@ -116,6 +117,21 @@ namespace Ryujinx.Ui
             _lastCursorMoveTime = Stopwatch.GetTimestamp();
 
             ConfigurationState.Instance.HideCursorOnIdle.Event += HideCursorStateChanged;
+            ConfigurationState.Instance.Graphics.AntiAliasing.Event += UpdateAnriAliasing;
+            ConfigurationState.Instance.Graphics.ScalingFilter.Event += UpdateScalingFilter;
+            ConfigurationState.Instance.Graphics.ScalingFilterLevel.Event += UpdateScalingFilterLevel;
+        }
+
+        private void UpdateScalingFilterLevel(object sender, ReactiveEventArgs<int> e) // Configuration event handler: pushes the current filter and level to the renderer window.
+        {
+            Renderer?.Window.SetScalingFilter((ScalingFilter)ConfigurationState.Instance.Graphics.ScalingFilter.Value); // Null-conditional: the event can fire while no renderer is set.
+            Renderer?.Window.SetScalingFilterLevel(ConfigurationState.Instance.Graphics.ScalingFilterLevel.Value);
+        }
+
+        private void UpdateScalingFilter(object sender, ReactiveEventArgs<Ryujinx.Common.Configuration.ScalingFilter> e) // Configuration event handler: pushes the current filter and level to the renderer window.
+        {
+            Renderer?.Window.SetScalingFilter((ScalingFilter)ConfigurationState.Instance.Graphics.ScalingFilter.Value); // Null-conditional: the event can fire while no renderer is set.
+            Renderer?.Window.SetScalingFilterLevel(ConfigurationState.Instance.Graphics.ScalingFilterLevel.Value);
+        }
 
         public abstract void InitializeRenderer();
@@ -149,11 +165,19 @@ namespace Ryujinx.Ui
         private void Renderer_Destroyed(object sender, EventArgs e)
         {
             ConfigurationState.Instance.HideCursorOnIdle.Event -= HideCursorStateChanged;
+            ConfigurationState.Instance.Graphics.AntiAliasing.Event -= UpdateAnriAliasing; // Detach the anti-aliasing and scaling handlers before NpadManager and this widget are disposed below.
+            ConfigurationState.Instance.Graphics.ScalingFilter.Event -= UpdateScalingFilter;
+            ConfigurationState.Instance.Graphics.ScalingFilterLevel.Event -= UpdateScalingFilterLevel;
 
             NpadManager.Dispose();
             Dispose();
         }
 
+        private void UpdateAnriAliasing(object sender, ReactiveEventArgs<Ryujinx.Common.Configuration.AntiAliasing> e) // NOTE(review): name is misspelled ("Anri" for "Anti"); a rename must also update the += and -= sites.
+        {
+            Renderer?.Window.SetAntiAliasing((Graphics.GAL.AntiAliasing)e.NewValue); // Casts the configuration enum to the GAL enum; nothing is called when Renderer is null.
+        }
+
         protected override bool OnMotionNotifyEvent(EventMotion evnt)
         {
             if (_hideCursorOnIdle)
@@ -394,6 +418,10 @@ namespace Ryujinx.Ui
 
             Device.Gpu.Renderer.Initialize(_glLogLevel);
 
+            Renderer.Window.SetAntiAliasing((Graphics.GAL.AntiAliasing)ConfigurationState.Instance.Graphics.AntiAliasing.Value);
+            Renderer.Window.SetScalingFilter((Graphics.GAL.ScalingFilter)ConfigurationState.Instance.Graphics.ScalingFilter.Value);
+            Renderer.Window.SetScalingFilterLevel(ConfigurationState.Instance.Graphics.ScalingFilterLevel.Value);
+
             _gpuBackendName = GetGpuBackendName();
             _gpuVendorName = GetGpuVendorName();
 
diff --git a/Ryujinx/Ui/Windows/SettingsWindow.cs b/Ryujinx/Ui/Windows/SettingsWindow.cs
index 220bb82aef..61af7d3977 100644
--- a/Ryujinx/Ui/Windows/SettingsWindow.cs
+++ b/Ryujinx/Ui/Windows/SettingsWindow.cs
@@ -95,10 +95,14 @@ namespace Ryujinx.Ui.Windows
         [GUI] Entry           _graphicsShadersDumpPath;
         [GUI] ComboBoxText    _anisotropy;
         [GUI] ComboBoxText    _aspectRatio;
+        [GUI] ComboBoxText    _antiAliasing;
+        [GUI] ComboBoxText    _scalingFilter;
         [GUI] ComboBoxText    _graphicsBackend;
         [GUI] ComboBoxText    _preferredGpu;
         [GUI] ComboBoxText    _resScaleCombo;
         [GUI] Entry           _resScaleText;
+        [GUI] Adjustment      _scalingFilterLevel;
+        [GUI] Scale           _scalingFilterSlider;
         [GUI] ToggleButton    _configureController1;
         [GUI] ToggleButton    _configureController2;
         [GUI] ToggleButton    _configureController3;
@@ -139,6 +143,7 @@ namespace Ryujinx.Ui.Windows
             _systemTimeZoneEntry.FocusOutEvent += TimeZoneEntry_FocusOut;
 
             _resScaleCombo.Changed += (sender, args) => _resScaleText.Visible = _resScaleCombo.ActiveId == "-1";
+            _scalingFilter.Changed += (sender, args) => _scalingFilterSlider.Visible = _scalingFilter.ActiveId == "2";
             _galThreading.Changed += (sender, args) =>
             {
                 if (_galThreading.ActiveId != ConfigurationState.Instance.Graphics.BackendThreading.Value.ToString())
@@ -338,6 +343,8 @@ namespace Ryujinx.Ui.Windows
             _anisotropy.SetActiveId(ConfigurationState.Instance.Graphics.MaxAnisotropy.Value.ToString());
             _aspectRatio.SetActiveId(((int)ConfigurationState.Instance.Graphics.AspectRatio.Value).ToString());
             _graphicsBackend.SetActiveId(((int)ConfigurationState.Instance.Graphics.GraphicsBackend.Value).ToString());
+            _antiAliasing.SetActiveId(((int)ConfigurationState.Instance.Graphics.AntiAliasing.Value).ToString());
+            _scalingFilter.SetActiveId(((int)ConfigurationState.Instance.Graphics.ScalingFilter.Value).ToString());
 
             UpdatePreferredGpuComboBox();
 
@@ -345,7 +352,9 @@ namespace Ryujinx.Ui.Windows
 
             _custThemePath.Buffer.Text           = ConfigurationState.Instance.Ui.CustomThemePath;
             _resScaleText.Buffer.Text            = ConfigurationState.Instance.Graphics.ResScaleCustom.Value.ToString();
+            _scalingFilterLevel.Value            = ConfigurationState.Instance.Graphics.ScalingFilterLevel.Value;
             _resScaleText.Visible                = _resScaleCombo.ActiveId == "-1";
+            _scalingFilterSlider.Visible         = _scalingFilter.ActiveId == "2";
             _graphicsShadersDumpPath.Buffer.Text = ConfigurationState.Instance.Graphics.ShadersDumpPath;
             _fsLogSpinAdjustment.Value           = ConfigurationState.Instance.System.FsGlobalAccessLogMode;
             _systemTimeOffset                    = ConfigurationState.Instance.System.SystemTimeOffset;
@@ -605,6 +614,9 @@ namespace Ryujinx.Ui.Windows
             ConfigurationState.Instance.Graphics.ResScale.Value                   = int.Parse(_resScaleCombo.ActiveId);
             ConfigurationState.Instance.Graphics.ResScaleCustom.Value             = resScaleCustom;
             ConfigurationState.Instance.System.AudioVolume.Value                  = (float)_audioVolumeSlider.Value / 100.0f;
+            ConfigurationState.Instance.Graphics.AntiAliasing.Value               = Enum.Parse<AntiAliasing>(_antiAliasing.ActiveId);
+            ConfigurationState.Instance.Graphics.ScalingFilter.Value              = Enum.Parse<ScalingFilter>(_scalingFilter.ActiveId);
+            ConfigurationState.Instance.Graphics.ScalingFilterLevel.Value         = (int)_scalingFilterLevel.Value;
 
             _previousVolumeLevel = ConfigurationState.Instance.System.AudioVolume.Value;
 
diff --git a/Ryujinx/Ui/Windows/SettingsWindow.glade b/Ryujinx/Ui/Windows/SettingsWindow.glade
index e39be81a94..c19c1db9fa 100644
--- a/Ryujinx/Ui/Windows/SettingsWindow.glade
+++ b/Ryujinx/Ui/Windows/SettingsWindow.glade
@@ -40,6 +40,13 @@
     <property name="inline-completion">True</property>
     <property name="inline-selection">True</property>
   </object>
+  <object class="GtkAdjustment" id="_scalingFilterLevel">
+    <property name="lower">0</property>
+    <property name="upper">101</property> <!-- GTK clamps the value to upper minus page-size, so the effective maximum is 100 -->
+    <property name="step-increment">1</property>
+    <property name="page-increment">5</property>
+    <property name="page-size">1</property>
+  </object>
   <object class="GtkWindow" id="_settingsWin">
     <property name="can-focus">False</property>
     <property name="title" translatable="yes">Ryujinx - Settings</property>
@@ -2152,6 +2159,118 @@
                                     <property name="position">3</property>
                                   </packing>
                                 </child>
+                                <child>
+                                  <object class="GtkBox">
+                                    <property name="visible">True</property>
+                                    <property name="can-focus">False</property>
+                                    <property name="margin-top">5</property>
+                                    <property name="margin-bottom">5</property>
+                                    <child>
+                                      <object class="GtkLabel">
+                                        <property name="visible">True</property>
+                                        <property name="can-focus">False</property>
+                                        <property name="tooltip-text" translatable="yes">Applies a final effect to the game render</property>
+                                        <property name="label" translatable="yes">Post Processing Effect:</property>
+                                      </object>
+                                      <packing>
+                                        <property name="expand">False</property>
+                                        <property name="fill">True</property>
+                                        <property name="padding">5</property>
+                                        <property name="position">0</property>
+                                      </packing>
+                                    </child>
+                                    <child>
+                                      <object class="GtkComboBoxText" id="_antiAliasing">
+                                        <property name="visible">True</property>
+                                        <property name="can-focus">False</property>
+                                        <property name="tooltip-text" translatable="yes">Applies anti-aliasing to the game render</property>
+                                        <property name="active-id">1</property> <!-- initial selection only; SettingsWindow.cs sets the active id from the stored AntiAliasing value -->
+                                        <items> <!-- item ids are parsed as AntiAliasing enum values when the settings are saved -->
+                                          <item id="0" translatable="yes">None</item>
+                                          <item id="1" translatable="yes">FXAA</item>
+                                          <item id="2" translatable="yes">SMAA Low</item>
+                                          <item id="3" translatable="yes">SMAA Medium</item>
+                                          <item id="4" translatable="yes">SMAA High</item>
+                                          <item id="5" translatable="yes">SMAA Ultra</item>
+                                        </items>
+                                      </object>
+                                      <packing>
+                                        <property name="expand">False</property>
+                                        <property name="fill">True</property>
+                                        <property name="position">1</property>
+                                      </packing>
+                                    </child>
+                                  </object>
+                                  <packing>
+                                    <property name="expand">False</property>
+                                    <property name="fill">True</property>
+                                    <property name="padding">5</property>
+                                    <property name="position">4</property>
+                                  </packing>
+                                </child>
+                                <child>
+                                  <object class="GtkBox">
+                                    <property name="width-request">100</property>
+                                    <property name="visible">True</property>
+                                    <property name="can-focus">False</property>
+                                    <property name="margin-top">5</property>
+                                    <property name="margin-bottom">5</property>
+                                    <child>
+                                      <object class="GtkLabel">
+                                        <property name="visible">True</property>
+                                        <property name="can-focus">False</property>
+                                        <property name="tooltip-text" translatable="yes">Enables Framebuffer Upscaling</property>
+                                        <property name="label" translatable="yes">Upscale: </property>
+                                      </object>
+                                      <packing>
+                                        <property name="expand">False</property>
+                                        <property name="fill">True</property>
+                                        <property name="padding">5</property>
+                                        <property name="position">0</property>
+                                      </packing>
+                                    </child>
+                                    <child>
+                                      <object class="GtkComboBoxText" id="_scalingFilter">
+                                        <property name="visible">True</property>
+                                        <property name="can-focus">False</property>
+                                        <property name="tooltip-text" translatable="yes">Enables Framebuffer Upscaling</property>
+                                        <property name="active-id">1</property> <!-- initial selection only; SettingsWindow.cs sets the active id from the stored ScalingFilter value -->
+                                        <items> <!-- item ids are parsed as ScalingFilter enum values on save; SettingsWindow.cs shows the level slider only for id 2 -->
+                                          <item id="0" translatable="yes">Bilinear</item>
+                                          <item id="1" translatable="yes">Nearest</item>
+                                          <item id="2" translatable="yes">FSR</item>
+                                        </items>
+                                      </object>
+                                      <packing>
+                                        <property name="expand">False</property>
+                                        <property name="fill">True</property>
+                                        <property name="position">1</property>
+                                      </packing>
+                                    </child>
+                                    <child>
+                                      <object class="GtkScale" id="_scalingFilterSlider">
+                                        <property name="width-request">200</property>
+                                        <property name="visible">True</property>
+                                        <property name="can-focus">True</property>
+                                        <property name="margin-start">5</property>
+                                        <property name="adjustment">_scalingFilterLevel</property>
+                                        <property name="round-digits">0</property> <!-- snap to whole numbers; SettingsWindow.cs stores the level as an int -->
+                                        <property name="value-pos">right</property>
+                                      </object>
+                                      <packing>
+                                        <property name="expand">False</property>
+                                        <property name="fill">True</property>
+                                        <property name="position">2</property> <!-- third child of this box, after the label (0) and the combo box (1) -->
+                                      </packing>
+                                    </child>
+                                  </object>
+                                  <packing>
+                                    <property name="expand">False</property>
+                                    <property name="fill">True</property>
+                                    <property name="padding">5</property>
+                                    <property name="position">5</property>
+                                  </packing>
+                                </child>
                                 <child>
                                   <object class="GtkBox">
                                     <property name="visible">True</property>
@@ -2197,7 +2316,7 @@
                                     <property name="expand">False</property>
                                     <property name="fill">True</property>
                                     <property name="padding">5</property>
-                                    <property name="position">4</property>
+                                    <property name="position">6</property>
                                   </packing>
                                 </child>
                                 <child>
@@ -2246,7 +2365,7 @@
                                     <property name="expand">False</property>
                                     <property name="fill">True</property>
                                     <property name="padding">5</property>
-                                    <property name="position">5</property>
+                                    <property name="position">7</property>
                                   </packing>
                                 </child>
                               </object>