diff --git a/Ryujinx.Graphics.GAL/Capabilities.cs b/Ryujinx.Graphics.GAL/Capabilities.cs
index 9640447be8..246722f812 100644
--- a/Ryujinx.Graphics.GAL/Capabilities.cs
+++ b/Ryujinx.Graphics.GAL/Capabilities.cs
@@ -4,9 +4,14 @@ namespace Ryujinx.Graphics.GAL
     {
         public bool SupportsAstcCompression { get; }
 
-        public Capabilities(bool supportsAstcCompression)
+        public int StorageBufferOffsetAlignment { get; }
+
+        public Capabilities(
+            bool supportsAstcCompression,
+            int  storageBufferOffsetAlignment)
         {
-            SupportsAstcCompression = supportsAstcCompression;
+            SupportsAstcCompression      = supportsAstcCompression;
+            StorageBufferOffsetAlignment = storageBufferOffsetAlignment;
         }
     }
 }
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Gpu/GpuContext.cs b/Ryujinx.Graphics.Gpu/GpuContext.cs
index 51961522a0..0906d10ee2 100644
--- a/Ryujinx.Graphics.Gpu/GpuContext.cs
+++ b/Ryujinx.Graphics.Gpu/GpuContext.cs
@@ -45,7 +45,7 @@ namespace Ryujinx.Graphics.Gpu
 
             Window = new Window(this);
 
-            _caps = new Lazy<Capabilities>(GetCapabilities);
+            _caps = new Lazy<Capabilities>(Renderer.GetCapabilities);
         }
 
         internal void AdvanceSequence()
@@ -53,11 +53,6 @@ namespace Ryujinx.Graphics.Gpu
             SequenceNumber++;
         }
 
-        private Capabilities GetCapabilities()
-        {
-            return Renderer.GetCapabilities();
-        }
-
         public void SetVmm(IPhysicalMemory mm)
         {
             PhysicalMemory = mm;
diff --git a/Ryujinx.Graphics.Gpu/Memory/BufferManager.cs b/Ryujinx.Graphics.Gpu/Memory/BufferManager.cs
index a066585ca7..83ca5db531 100644
--- a/Ryujinx.Graphics.Gpu/Memory/BufferManager.cs
+++ b/Ryujinx.Graphics.Gpu/Memory/BufferManager.cs
@@ -1,3 +1,4 @@
+using Ryujinx.Common;
 using Ryujinx.Graphics.GAL;
 using Ryujinx.Graphics.GAL.InputAssembler;
 using Ryujinx.Graphics.Gpu.State;
@@ -113,10 +114,9 @@ namespace Ryujinx.Graphics.Gpu.Memory
 
         public void SetComputeStorageBuffer(int index, ulong gpuVa, ulong size)
         {
-            // TODO: Improve
-            size += gpuVa & 0x3fUL;
+            size += gpuVa & ((ulong)_context.Capabilities.StorageBufferOffsetAlignment - 1);
 
-            gpuVa &= ~0x3fUL;
+            gpuVa = BitUtils.AlignDown(gpuVa, _context.Capabilities.StorageBufferOffsetAlignment);
 
             ulong address = TranslateAndCreateBuffer(gpuVa, size);
 
@@ -125,10 +125,9 @@ namespace Ryujinx.Graphics.Gpu.Memory
 
         public void SetGraphicsStorageBuffer(int stage, int index, ulong gpuVa, ulong size)
         {
-            // TODO: Improve
-            size += gpuVa & 0x3fUL;
+            size += gpuVa & ((ulong)_context.Capabilities.StorageBufferOffsetAlignment - 1);
 
-            gpuVa &= ~0x3fUL;
+            gpuVa = BitUtils.AlignDown(gpuVa, _context.Capabilities.StorageBufferOffsetAlignment);
 
             ulong address = TranslateAndCreateBuffer(gpuVa, size);
 
diff --git a/Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs b/Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs
index a781de42ad..8e39662d1e 100644
--- a/Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs
+++ b/Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs
@@ -199,7 +199,7 @@ namespace Ryujinx.Graphics.Gpu.Shader
 
             Span<byte> code = _context.MemoryAccessor.Read(gpuVa, MaxProgramSize);
 
-            program = Translator.Translate(code, flags);
+            program = Translator.Translate(code, GetShaderCapabilities(), flags);
 
             int[] codeCached = MemoryMarshal.Cast<byte, int>(code.Slice(0, program.Size)).ToArray();
 
@@ -238,7 +238,7 @@ namespace Ryujinx.Graphics.Gpu.Shader
                 Span<byte> codeA = _context.MemoryAccessor.Read(gpuVaA, MaxProgramSize);
                 Span<byte> codeB = _context.MemoryAccessor.Read(gpuVa,  MaxProgramSize);
 
-                program = Translator.Translate(codeA, codeB, flags);
+                program = Translator.Translate(codeA, codeB, GetShaderCapabilities(), flags);
 
                 // TODO: We should also check "codeA" into account.
                 codeCached = MemoryMarshal.Cast<byte, int>(codeB.Slice(0, program.Size)).ToArray();
@@ -258,7 +258,7 @@ namespace Ryujinx.Graphics.Gpu.Shader
             {
                 Span<byte> code = _context.MemoryAccessor.Read(gpuVa, MaxProgramSize);
 
-                program = Translator.Translate(code, flags);
+                program = Translator.Translate(code, GetShaderCapabilities(), flags);
 
                 codeCached = MemoryMarshal.Cast<byte, int>(code.Slice(0, program.Size)).ToArray();
 
@@ -342,5 +342,10 @@ namespace Ryujinx.Graphics.Gpu.Shader
                 isFirst = false;
             }
         }
+
+        // Builds the capabilities passed to the shader translator from the
+        // storage buffer offset alignment reported by the GPU context.
+        private ShaderCapabilities GetShaderCapabilities() =>
+            new ShaderCapabilities(_context.Capabilities.StorageBufferOffsetAlignment);
     }
 }
\ No newline at end of file
diff --git a/Ryujinx.Graphics.OpenGL/HwCapabilities.cs b/Ryujinx.Graphics.OpenGL/HwCapabilities.cs
index f958946e7c..70112a3a92 100644
--- a/Ryujinx.Graphics.OpenGL/HwCapabilities.cs
+++ b/Ryujinx.Graphics.OpenGL/HwCapabilities.cs
@@ -5,9 +5,13 @@ namespace Ryujinx.Graphics.OpenGL
 {
     static class HwCapabilities
     {
-        private static Lazy<bool> _astcCompression = new Lazy<bool>(() => HasExtension("GL_KHR_texture_compression_astc_ldr"));
+        private static Lazy<bool> _supportsAstcCompression = new Lazy<bool>(() => HasExtension("GL_KHR_texture_compression_astc_ldr"));
 
-        public static bool SupportsAstcCompression => _astcCompression.Value;
+        private static Lazy<int> _storageBufferOffsetAlignment = new Lazy<int>(() => GetLimit(All.ShaderStorageBufferOffsetAlignment));
+
+        public static bool SupportsAstcCompression => _supportsAstcCompression.Value;
+
+        public static int StorageBufferOffsetAlignment => _storageBufferOffsetAlignment.Value;
 
         private static bool HasExtension(string name)
         {
@@ -23,5 +27,10 @@ namespace Ryujinx.Graphics.OpenGL
 
             return false;
         }
+
+        // Queries one integer value through GL.GetInteger; the generic "All"
+        // enum value is cast to the GetPName type that the call expects.
+        private static int GetLimit(All name) =>
+            GL.GetInteger((GetPName)name);
     }
 }
\ No newline at end of file
diff --git a/Ryujinx.Graphics.OpenGL/Renderer.cs b/Ryujinx.Graphics.OpenGL/Renderer.cs
index 1baee04b12..c320d1504b 100644
--- a/Ryujinx.Graphics.OpenGL/Renderer.cs
+++ b/Ryujinx.Graphics.OpenGL/Renderer.cs
@@ -61,7 +61,9 @@ namespace Ryujinx.Graphics.OpenGL
 
         public Capabilities GetCapabilities()
         {
-            return new Capabilities(HwCapabilities.SupportsAstcCompression);
+            return new Capabilities(
+                HwCapabilities.SupportsAstcCompression,
+                HwCapabilities.StorageBufferOffsetAlignment);
         }
 
         public ulong GetCounter(CounterType type)
diff --git a/Ryujinx.Graphics.Shader/CodeGen/Glsl/Declarations.cs b/Ryujinx.Graphics.Shader/CodeGen/Glsl/Declarations.cs
index e8b4496121..a5c8cc9a9d 100644
--- a/Ryujinx.Graphics.Shader/CodeGen/Glsl/Declarations.cs
+++ b/Ryujinx.Graphics.Shader/CodeGen/Glsl/Declarations.cs
@@ -86,7 +86,7 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl
 
             if (info.SBuffers.Count != 0)
             {
-                DeclareUsedStorage(context, info);
+                DeclareStorages(context, info);
 
                 context.AppendLine();
             }
@@ -176,11 +176,6 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl
 
                 context.AppendLine(GetVarTypeName(decl.VarType) + " " + name + ";");
             }
-
-            if ((info.HelperFunctionsMask & HelperFunctionsMask.GlobalMemory) != 0)
-            {
-                context.AppendLine($"ivec2 {DefaultNames.GmemOffsetName};");
-            }
         }
 
         private static string GetVarTypeName(VariableType type)
@@ -218,31 +213,7 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl
             }
         }
 
-        private static void DeclareAllStorage(CodeGenContext context, StructuredProgramInfo info)
-        {
-            string sbName = OperandManager.GetShaderStagePrefix(context.Config.Stage);
-
-            sbName += "_" + DefaultNames.StorageNamePrefix;
-
-            string blockName = $"{sbName}_{DefaultNames.BlockSuffix}";
-
-            context.AppendLine("layout (std430) buffer " + blockName);
-
-            context.EnterScope();
-
-            context.AppendLine("uint " + DefaultNames.DataName + "[];");
-
-            string arraySize = NumberFormatter.FormatInt(Constants.MaxShaderStorageBuffers);
-
-            context.LeaveScope($" {sbName}[{arraySize}];");
-
-            for (int sbufSlot = 0; sbufSlot < Constants.MaxShaderStorageBuffers; sbufSlot++)
-            {
-                context.SBufferDescriptors.Add(new BufferDescriptor($"{blockName}[{sbufSlot}]", sbufSlot));
-            }
-        }
-
-        private static void DeclareUsedStorage(CodeGenContext context, StructuredProgramInfo info)
+        private static void DeclareStorages(CodeGenContext context, StructuredProgramInfo info)
         {
             string sbName = OperandManager.GetShaderStagePrefix(context.Config.Stage);
 
diff --git a/Ryujinx.Graphics.Shader/CodeGen/Glsl/DefaultNames.cs b/Ryujinx.Graphics.Shader/CodeGen/Glsl/DefaultNames.cs
index f1abc94956..4da38b2de5 100644
--- a/Ryujinx.Graphics.Shader/CodeGen/Glsl/DefaultNames.cs
+++ b/Ryujinx.Graphics.Shader/CodeGen/Glsl/DefaultNames.cs
@@ -22,8 +22,6 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl
         public const string LocalMemoryName  = "local_mem";
         public const string SharedMemoryName = "shared_mem";
 
-        public const string GmemOffsetName  = "gmemOffset";
-
         public const string UndefinedName = "undef";
     }
 }
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/GlobalMemory.glsl b/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/GlobalMemory.glsl
deleted file mode 100644
index b8544ae23f..0000000000
--- a/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/GlobalMemory.glsl
+++ /dev/null
@@ -1,18 +0,0 @@
-ivec2 Helper_GetStorageBuffer(uint aLow, uint aHigh)
-{
-    uint64_t address = packUint2x32(uvec2(aLow, aHigh));
-    int i;
-    for (i = 0; i < 16; i++)
-    {
-        int offset = 0x40 + i * 4;
-        uint baseLow  = fp_c0_data[offset];
-        uint baseHigh = fp_c0_data[offset + 1];
-        uint size     = fp_c0_data[offset + 2];
-        uint64_t baseAddr = packUint2x32(uvec2(baseLow, baseHigh));
-        if (address >= baseAddr && address < baseAddr + packUint2x32(uvec2(size, 0)))
-        {
-            return ivec2(i, int(unpackUint2x32(address - (baseAddr & ~63ul)).x) >> 2);
-        }
-    }
-    return ivec2(0);
-}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/HelperFunctionNames.cs b/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/HelperFunctionNames.cs
index 302b56addb..f1540fbfb1 100644
--- a/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/HelperFunctionNames.cs
+++ b/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/HelperFunctionNames.cs
@@ -2,8 +2,6 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl
 {
     static class HelperFunctionNames
     {
-        public static string GetStorageBuffer = "Helper_GetStorageBuffer";
-
         public static string Shuffle     = "Helper_Shuffle";
         public static string ShuffleDown = "Helper_ShuffleDown";
         public static string ShuffleUp   = "Helper_ShuffleUp";
diff --git a/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGen.cs b/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGen.cs
index b5cab54e38..b6cdd7f601 100644
--- a/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGen.cs
+++ b/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGen.cs
@@ -49,12 +49,18 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions
 
                     if (argIndex == 0 && atomic)
                     {
-                        switch (inst & Instruction.MrMask)
+                        Instruction memRegion = inst & Instruction.MrMask;
+
+                        switch (memRegion)
                         {
-                            // TODO: Global.
                             case Instruction.MrShared:  args += LoadShared (context, operation); break;
                             case Instruction.MrStorage: args += LoadStorage(context, operation); break;
+
+                            default: throw new InvalidOperationException($"Invalid memory region \"{memRegion}\".");
                         }
+
+                        // We use the first 2 operands above.
+                        argIndex++;
                     }
                     else
                     {
@@ -150,8 +156,6 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions
                 }
             }
 
-            return "0";
-
             throw new InvalidOperationException($"Unexpected instruction type \"{info.Type}\".");
         }
     }
diff --git a/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenMemory.cs b/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenMemory.cs
index c535d8fcfb..5c2ea85e6e 100644
--- a/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenMemory.cs
+++ b/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenMemory.cs
@@ -119,19 +119,6 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions
             return OperandManager.GetConstantBufferName(src1, offsetExpr, context.Config.Stage);
         }
 
-        public static string LoadGlobal(CodeGenContext context, AstOperation operation)
-        {
-            IAstNode src1 = operation.GetSource(0);
-            IAstNode src2 = operation.GetSource(1);
-
-            string addrLowExpr  = GetSoureExpr(context, src1, GetSrcVarType(operation.Inst, 0));
-            string addrHighExpr = GetSoureExpr(context, src2, GetSrcVarType(operation.Inst, 1));
-
-            context.AppendLine($"{DefaultNames.GmemOffsetName} = {HelperFunctionNames.GetStorageBuffer}({addrLowExpr}, {addrHighExpr});");
-
-            return GetStorageBufferAccessor($"{DefaultNames.GmemOffsetName}.x", $"{DefaultNames.GmemOffsetName}.y", context.Config.Stage);
-        }
-
         public static string LoadLocal(CodeGenContext context, AstOperation operation)
         {
             return LoadLocalOrShared(context, operation, DefaultNames.LocalMemoryName);
@@ -152,29 +139,14 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions
         }
 
         public static string LoadStorage(CodeGenContext context, AstOperation operation)
-        {
-            IAstNode src1 = operation.GetSource(0);
-
-            string offsetExpr = GetSoureExpr(context, src1, GetSrcVarType(operation.Inst, 0));
-
-            return GetStorageBufferAccessor(operation.Index, offsetExpr, context.Config.Stage);
-        }
-
-        public static string StoreGlobal(CodeGenContext context, AstOperation operation)
         {
             IAstNode src1 = operation.GetSource(0);
             IAstNode src2 = operation.GetSource(1);
-            IAstNode src3 = operation.GetSource(2);
 
-            string addrLowExpr  = GetSoureExpr(context, src1, GetSrcVarType(operation.Inst, 0));
-            string addrHighExpr = GetSoureExpr(context, src2, GetSrcVarType(operation.Inst, 1));
-            string valueExpr    = GetSoureExpr(context, src3, GetSrcVarType(operation.Inst, 2));
+            string indexExpr  = GetSoureExpr(context, src1, GetSrcVarType(operation.Inst, 0));
+            string offsetExpr = GetSoureExpr(context, src2, GetSrcVarType(operation.Inst, 1));
 
-            context.AppendLine($"{DefaultNames.GmemOffsetName} = {HelperFunctionNames.GetStorageBuffer}({addrLowExpr}, {addrHighExpr});");
-
-            string sb = GetStorageBufferAccessor($"{DefaultNames.GmemOffsetName}.x", $"{DefaultNames.GmemOffsetName}.y", context.Config.Stage);
-
-            return $"{sb} = {valueExpr}";
+            return GetStorageBufferAccessor(indexExpr, offsetExpr, context.Config.Stage);
         }
 
         public static string StoreLocal(CodeGenContext context, AstOperation operation)
@@ -205,14 +177,16 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions
         {
             IAstNode src1 = operation.GetSource(0);
             IAstNode src2 = operation.GetSource(1);
+            IAstNode src3 = operation.GetSource(2);
 
-            string offsetExpr = GetSoureExpr(context, src1, GetSrcVarType(operation.Inst, 0));
+            string indexExpr  = GetSoureExpr(context, src1, GetSrcVarType(operation.Inst, 0));
+            string offsetExpr = GetSoureExpr(context, src2, GetSrcVarType(operation.Inst, 1));
 
-            VariableType srcType = OperandManager.GetNodeDestType(src2);
+            VariableType srcType = OperandManager.GetNodeDestType(src3);
 
-            string src = TypeConversion.ReinterpretCast(context, src2, srcType, VariableType.U32);
+            string src = TypeConversion.ReinterpretCast(context, src3, srcType, VariableType.U32);
 
-            string sb = GetStorageBufferAccessor(operation.Index, offsetExpr, context.Config.Stage);
+            string sb = GetStorageBufferAccessor(indexExpr, offsetExpr, context.Config.Stage);
 
             return $"{sb} = {src}";
         }
@@ -489,27 +463,6 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions
             return $"{sbName}[{slotExpr}].{DefaultNames.DataName}[{offsetExpr}]";
         }
 
-        private static string GetStorageBufferAccessor(int slot, string offsetExpr, ShaderStage stage)
-        {
-            string sbName = OperandManager.GetShaderStagePrefix(stage);
-
-            sbName += "_" + DefaultNames.StorageNamePrefix;
-
-            string mask = NumberFormatter.FormatUint(~(64u - 1));
-
-            // Subtract the base address of the global memory, to get the
-            // storage buffer offset. The mask is used to keep the lower bits,
-            // since the bound storage buffer must match the host alignment
-            // restrictions.
-            int ubOffset = GlobalToStorage.GetStorageCbOffset(stage, slot);
-
-            string ubName = OperandManager.GetConstantBufferName(0, ubOffset, stage);
-
-            offsetExpr = $"{offsetExpr} - int((floatBitsToUint({ubName}) & {mask}) >> 2)";
-
-            return $"{sbName}[{NumberFormatter.FormatInt(slot)}].{DefaultNames.DataName}[{offsetExpr}]";
-        }
-
         private static string GetMask(int index)
         {
             return '.' + "rgba".Substring(index, 1);
diff --git a/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstType.cs b/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstType.cs
index 5836e981f0..84e36cdd62 100644
--- a/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstType.cs
+++ b/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstType.cs
@@ -11,15 +11,17 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions
         OpBinaryCom = Op | 2 | Commutative,
         OpTernary   = Op | 3,
 
-        AtomicBinary  = CallBinary  | Atomic,
-        AtomicTernary = CallTernary | Atomic,
-
         CallNullary    = Call | 0,
         CallUnary      = Call | 1,
         CallBinary     = Call | 2,
         CallTernary    = Call | 3,
         CallQuaternary = Call | 4,
 
+        // The atomic instructions have one extra operand,
+        // for the storage slot and offset pair.
+        AtomicBinary  = Call | Atomic | 3,
+        AtomicTernary = Call | Atomic | 4,
+
         Commutative = 1 << 8,
         Op          = 1 << 9,
         Call        = 1 << 10,
diff --git a/Ryujinx.Graphics.Shader/Decoders/IntegerSize.cs b/Ryujinx.Graphics.Shader/Decoders/IntegerSize.cs
index 3ff8e1b261..d39c2a9091 100644
--- a/Ryujinx.Graphics.Shader/Decoders/IntegerSize.cs
+++ b/Ryujinx.Graphics.Shader/Decoders/IntegerSize.cs
@@ -2,12 +2,13 @@ namespace Ryujinx.Graphics.Shader.Decoders
 {
     enum IntegerSize
     {
-        U8   = 0,
-        S8   = 1,
-        U16  = 2,
-        S16  = 3,
-        B32  = 4,
-        B64  = 5,
-        B128 = 6
+        U8    = 0,
+        S8    = 1,
+        U16   = 2,
+        S16   = 3,
+        B32   = 4,
+        B64   = 5,
+        B128  = 6,
+        UB128 = 7
     }
 }
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/Instructions/InstEmitMemory.cs b/Ryujinx.Graphics.Shader/Instructions/InstEmitMemory.cs
index 2abbed085f..56688161c1 100644
--- a/Ryujinx.Graphics.Shader/Instructions/InstEmitMemory.cs
+++ b/Ryujinx.Graphics.Shader/Instructions/InstEmitMemory.cs
@@ -11,7 +11,6 @@ namespace Ryujinx.Graphics.Shader.Instructions
     {
         private enum MemoryRegion
         {
-            Global,
             Local,
             Shared
         }
@@ -60,13 +59,20 @@ namespace Ryujinx.Graphics.Shader.Instructions
         {
             OpCodeAtom op = (OpCodeAtom)context.CurrOp;
 
-            Operand mem = context.ShiftRightU32(GetSrcA(context), Const(2));
+            Operand offset = context.ShiftRightU32(GetSrcA(context), Const(2));
 
-            mem = context.IAdd(mem, Const(op.Offset));
+            offset = context.IAdd(offset, Const(op.Offset));
 
             Operand value = GetSrcB(context);
 
-            Operand res = EmitAtomicOp(context, Instruction.MrShared, op.AtomicOp, op.Type, mem, value);
+            Operand res = EmitAtomicOp(
+                context,
+                Instruction.MrShared,
+                op.AtomicOp,
+                op.Type,
+                offset,
+                Const(0),
+                value);
 
             context.Copy(GetDest(context), res);
         }
@@ -148,7 +154,7 @@ namespace Ryujinx.Graphics.Shader.Instructions
 
         public static void Ldg(EmitterContext context)
         {
-            EmitLoad(context, MemoryRegion.Global);
+            EmitLoadGlobal(context);
         }
 
         public static void Lds(EmitterContext context)
@@ -183,11 +189,16 @@ namespace Ryujinx.Graphics.Shader.Instructions
         {
             OpCodeRed op = (OpCodeRed)context.CurrOp;
 
-            Operand offset = context.IAdd(GetSrcA(context), Const(op.Offset));
+            (Operand addrLow, Operand addrHigh) = Get40BitsAddress(context, op.Ra, op.Extended, op.Offset);
 
-            Operand mem = context.ShiftRightU32(offset, Const(2));
-
-            EmitAtomicOp(context, Instruction.MrGlobal, op.AtomicOp, op.Type, mem, GetDest(context));
+            EmitAtomicOp(
+                context,
+                Instruction.MrGlobal,
+                op.AtomicOp,
+                op.Type,
+                addrLow,
+                addrHigh,
+                GetDest(context));
         }
 
         public static void St(EmitterContext context)
@@ -197,7 +208,7 @@ namespace Ryujinx.Graphics.Shader.Instructions
 
         public static void Stg(EmitterContext context)
         {
-            EmitStore(context, MemoryRegion.Global);
+            EmitStoreGlobal(context);
         }
 
         public static void Sts(EmitterContext context)
@@ -210,7 +221,8 @@ namespace Ryujinx.Graphics.Shader.Instructions
             Instruction    mr,
             AtomicOp       op,
             ReductionType  type,
-            Operand        mem,
+            Operand        addrLow,
+            Operand        addrHigh,
             Operand        value)
         {
             Operand res = Const(0);
@@ -220,7 +232,7 @@ namespace Ryujinx.Graphics.Shader.Instructions
                 case AtomicOp.Add:
                     if (type == ReductionType.S32 || type == ReductionType.U32)
                     {
-                        res = context.AtomicAdd(mr, mem, value);
+                        res = context.AtomicAdd(mr, addrLow, addrHigh, value);
                     }
                     else
                     {
@@ -230,7 +242,7 @@ namespace Ryujinx.Graphics.Shader.Instructions
                 case AtomicOp.BitwiseAnd:
                     if (type == ReductionType.S32 || type == ReductionType.U32)
                     {
-                        res = context.AtomicAnd(mr, mem, value);
+                        res = context.AtomicAnd(mr, addrLow, addrHigh, value);
                     }
                     else
                     {
@@ -240,7 +252,7 @@ namespace Ryujinx.Graphics.Shader.Instructions
                 case AtomicOp.BitwiseExclusiveOr:
                     if (type == ReductionType.S32 || type == ReductionType.U32)
                     {
-                        res = context.AtomicXor(mr, mem, value);
+                        res = context.AtomicXor(mr, addrLow, addrHigh, value);
                     }
                     else
                     {
@@ -250,7 +262,7 @@ namespace Ryujinx.Graphics.Shader.Instructions
                 case AtomicOp.BitwiseOr:
                     if (type == ReductionType.S32 || type == ReductionType.U32)
                     {
-                        res = context.AtomicOr(mr, mem, value);
+                        res = context.AtomicOr(mr, addrLow, addrHigh, value);
                     }
                     else
                     {
@@ -260,11 +272,11 @@ namespace Ryujinx.Graphics.Shader.Instructions
                 case AtomicOp.Maximum:
                     if (type == ReductionType.S32)
                     {
-                        res = context.AtomicMaxS32(mr, mem, value);
+                        res = context.AtomicMaxS32(mr, addrLow, addrHigh, value);
                     }
                     else if (type == ReductionType.U32)
                     {
-                        res = context.AtomicMaxU32(mr, mem, value);
+                        res = context.AtomicMaxU32(mr, addrLow, addrHigh, value);
                     }
                     else
                     {
@@ -274,11 +286,11 @@ namespace Ryujinx.Graphics.Shader.Instructions
                 case AtomicOp.Minimum:
                     if (type == ReductionType.S32)
                     {
-                        res = context.AtomicMinS32(mr, mem, value);
+                        res = context.AtomicMinS32(mr, addrLow, addrHigh, value);
                     }
                     else if (type == ReductionType.U32)
                     {
-                        res = context.AtomicMinU32(mr, mem, value);
+                        res = context.AtomicMinU32(mr, addrLow, addrHigh, value);
                     }
                     else
                     {
@@ -331,7 +343,6 @@ namespace Ryujinx.Graphics.Shader.Instructions
 
                 switch (region)
                 {
-                    case MemoryRegion.Global: value = context.LoadGlobal(offset); break;
                     case MemoryRegion.Local:  value = context.LoadLocal (offset); break;
                     case MemoryRegion.Shared: value = context.LoadShared(offset); break;
                 }
@@ -345,6 +356,38 @@ namespace Ryujinx.Graphics.Shader.Instructions
             }
         }
 
+        // Emits a global memory load: one word per destination register, stopping at
+        // RZ. Sizes below B32 are extracted from the word at the address' bit offset.
+        private static void EmitLoadGlobal(EmitterContext context)
+        {
+            OpCodeMemory op = (OpCodeMemory)context.CurrOp;
+
+            int  wordCount = GetVectorCount(op.Size);
+            bool subWord   = op.Size < IntegerSize.B32;
+
+            (Operand addrLow, Operand addrHigh) = Get40BitsAddress(context, op.Ra, op.Extended, op.Offset);
+            Operand bitOffset = GetBitOffset(context, addrLow);
+
+            for (int word = 0; word < wordCount; word++)
+            {
+                Register dest = new Register(op.Rd.Index + word, RegisterType.Gpr);
+
+                if (dest.IsRZ)
+                {
+                    break;
+                }
+
+                Operand loaded = context.LoadGlobal(context.IAdd(addrLow, Const(word * 4)), addrHigh);
+
+                if (subWord)
+                {
+                    loaded = ExtractSmallInt(context, op.Size, bitOffset, loaded);
+                }
+
+                context.Copy(Register(dest), loaded);
+            }
+        }
+
         private static void EmitStore(EmitterContext context, MemoryRegion region)
         {
             OpCodeMemory op = (OpCodeMemory)context.CurrOp;
@@ -384,7 +427,6 @@ namespace Ryujinx.Graphics.Shader.Instructions
 
                     switch (region)
                     {
-                        case MemoryRegion.Global: word = context.LoadGlobal(offset); break;
                         case MemoryRegion.Local:  word = context.LoadLocal (offset); break;
                         case MemoryRegion.Shared: word = context.LoadShared(offset); break;
                     }
@@ -394,7 +436,6 @@ namespace Ryujinx.Graphics.Shader.Instructions
 
                 switch (region)
                 {
-                    case MemoryRegion.Global: context.StoreGlobal(offset, value); break;
                     case MemoryRegion.Local:  context.StoreLocal (offset, value); break;
                     case MemoryRegion.Shared: context.StoreShared(offset, value); break;
                 }
@@ -406,9 +447,89 @@ namespace Ryujinx.Graphics.Shader.Instructions
             }
         }
 
+        // Emits a global memory store: one word per source register. The RZ check
+        // runs after the store, so the word for an RZ register is still written.
+        private static void EmitStoreGlobal(EmitterContext context)
+        {
+            OpCodeMemory op = (OpCodeMemory)context.CurrOp;
+
+            int  wordCount = GetVectorCount(op.Size);
+            bool subWord   = op.Size < IntegerSize.B32;
+
+            (Operand addrLow, Operand addrHigh) = Get40BitsAddress(context, op.Ra, op.Extended, op.Offset);
+            Operand bitOffset = GetBitOffset(context, addrLow);
+
+            for (int word = 0; word < wordCount; word++)
+            {
+                Register src  = new Register(op.Rd.Index + word, RegisterType.Gpr);
+                Operand  data = Register(src);
+
+                if (subWord)
+                {
+                    // Sizes below B32 are combined with the word loaded from memory.
+                    Operand current = context.LoadGlobal(addrLow, addrHigh);
+
+                    data = InsertSmallInt(context, op.Size, bitOffset, current, data);
+                }
+
+                context.StoreGlobal(context.IAdd(addrLow, Const(word * 4)), addrHigh, data);
+
+                if (src.IsRZ)
+                {
+                    break;
+                }
+            }
+        }
+
+        // Number of consecutive registers (one 32-bit word each) moved for "size":
+        // 2 for B64, 4 for B128/UB128 and 1 for every other size.
+        private static int GetVectorCount(IntegerSize size)
+        {
+            if (size == IntegerSize.B64)
+            {
+                return 2;
+            }
+
+            bool isQuad = size == IntegerSize.B128 || size == IntegerSize.UB128;
+
+            return isQuad ? 4 : 1;
+        }
+
+        // Computes the (low, high) 32-bit halves of "ra + offset"; the high half is 0
+        // unless "extended" is set. A negative offset is sign extended into the high half.
+        private static (Operand, Operand) Get40BitsAddress(
+            EmitterContext context,
+            Register ra,
+            bool extended,
+            int offset)
+        {
+            Operand addrLow  = GetSrcA(context);
+            Operand addrHigh = Const(0);
+
+            if (extended && !ra.IsRZ)
+            {
+                addrHigh = Register(ra.Index + 1, RegisterType.Gpr);
+            }
+
+            Operand offs = Const(offset);
+
+            addrLow = context.IAdd(addrLow, offs);
+
+            if (extended)
+            {
+                // The low half wrapped around if the sum is below the addend.
+                Operand carry = context.ICompareLessUnsigned(addrLow, offs);
+                int     sign  = offset >> 31; // -1 for a negative offset, else 0.
+
+                addrHigh = context.IAdd(addrHigh, context.ConditionalSelect(carry, Const(1 + sign), Const(sign)));
+            }
+
+            return (addrLow, addrHigh);
+        }
+
         private static Operand GetBitOffset(EmitterContext context, Operand baseOffset)
         {
-            // Note: byte offset = (baseOffset & 0b11) * 8.
+            // Note: bit offset = (baseOffset & 0b11) * 8.
             // Addresses should be always aligned to the integer type,
             // so we don't need to take unaligned addresses into account.
             return context.ShiftLeft(context.BitwiseAnd(baseOffset, Const(3)), Const(3));
diff --git a/Ryujinx.Graphics.Shader/Ryujinx.Graphics.Shader.csproj b/Ryujinx.Graphics.Shader/Ryujinx.Graphics.Shader.csproj
index a046c2f9e8..e10d1edaf7 100644
--- a/Ryujinx.Graphics.Shader/Ryujinx.Graphics.Shader.csproj
+++ b/Ryujinx.Graphics.Shader/Ryujinx.Graphics.Shader.csproj
@@ -1,7 +1,6 @@
 <Project Sdk="Microsoft.NET.Sdk">
 
   <ItemGroup>
-    <EmbeddedResource Include="CodeGen\Glsl\HelperFunctions\GlobalMemory.glsl" />
     <EmbeddedResource Include="CodeGen\Glsl\HelperFunctions\Shuffle.glsl" />
     <EmbeddedResource Include="CodeGen\Glsl\HelperFunctions\ShuffleDown.glsl" />
     <EmbeddedResource Include="CodeGen\Glsl\HelperFunctions\ShuffleUp.glsl" />
diff --git a/Ryujinx.Graphics.Shader/ShaderCapabilities.cs b/Ryujinx.Graphics.Shader/ShaderCapabilities.cs
new file mode 100644
index 0000000000..939c7c1de6
--- /dev/null
+++ b/Ryujinx.Graphics.Shader/ShaderCapabilities.cs
@@ -0,0 +1,27 @@
+namespace Ryujinx.Graphics.Shader
+{
+    /// <summary>
+    /// Capabilities handed to the shader translator through the shader configuration.
+    /// </summary>
+    public struct ShaderCapabilities
+    {
+        /// <summary>
+        /// Capabilities with a storage buffer offset alignment of 16.
+        /// </summary>
+        public static ShaderCapabilities Default { get; } = new ShaderCapabilities(16);
+
+        /// <summary>
+        /// Alignment whose negated value is used to mask storage buffer base addresses.
+        /// </summary>
+        public int StorageBufferOffsetAlignment { get; }
+
+        /// <summary>
+        /// Creates a new capabilities structure.
+        /// </summary>
+        /// <param name="storageBufferOffsetAlignment">Storage buffer offset alignment</param>
+        public ShaderCapabilities(int storageBufferOffsetAlignment)
+        {
+            StorageBufferOffsetAlignment = storageBufferOffsetAlignment;
+        }
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/ShaderConfig.cs b/Ryujinx.Graphics.Shader/ShaderConfig.cs
index 3583fa64c7..3088cfbba6 100644
--- a/Ryujinx.Graphics.Shader/ShaderConfig.cs
+++ b/Ryujinx.Graphics.Shader/ShaderConfig.cs
@@ -6,6 +6,8 @@ namespace Ryujinx.Graphics.Shader
     {
         public ShaderStage Stage { get; }
 
+        public ShaderCapabilities Capabilities { get; }
+
         public TranslationFlags Flags { get; }
 
         public int MaxOutputVertices { get; }
@@ -13,12 +15,14 @@ namespace Ryujinx.Graphics.Shader
         public OutputTopology OutputTopology { get; }
 
         public ShaderConfig(
-            ShaderStage      stage,
-            TranslationFlags flags,
-            int              maxOutputVertices,
-            OutputTopology   outputTopology)
+            ShaderStage        stage,
+            ShaderCapabilities capabilities,
+            TranslationFlags   flags,
+            int                maxOutputVertices,
+            OutputTopology     outputTopology)
         {
             Stage             = stage;
+            Capabilities      = capabilities;
             Flags             = flags;
             MaxOutputVertices = maxOutputVertices;
             OutputTopology    = outputTopology;
diff --git a/Ryujinx.Graphics.Shader/StructuredIr/HelperFunctionsMask.cs b/Ryujinx.Graphics.Shader/StructuredIr/HelperFunctionsMask.cs
index b262e6bc18..e2eee78d92 100644
--- a/Ryujinx.Graphics.Shader/StructuredIr/HelperFunctionsMask.cs
+++ b/Ryujinx.Graphics.Shader/StructuredIr/HelperFunctionsMask.cs
@@ -5,11 +5,10 @@ namespace Ryujinx.Graphics.Shader.StructuredIr
     [Flags]
     enum HelperFunctionsMask
     {
-        GlobalMemory = 1 << 0,
-        Shuffle      = 1 << 1,
-        ShuffleDown  = 1 << 2,
-        ShuffleUp    = 1 << 3,
-        ShuffleXor   = 1 << 4,
-        SwizzleAdd   = 1 << 5
+        Shuffle     = 1 << 0,
+        ShuffleDown = 1 << 1,
+        ShuffleUp   = 1 << 2,
+        ShuffleXor  = 1 << 3,
+        SwizzleAdd  = 1 << 4
     }
 }
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/StructuredIr/InstructionInfo.cs b/Ryujinx.Graphics.Shader/StructuredIr/InstructionInfo.cs
index 4c456d7bbd..d1874f50ff 100644
--- a/Ryujinx.Graphics.Shader/StructuredIr/InstructionInfo.cs
+++ b/Ryujinx.Graphics.Shader/StructuredIr/InstructionInfo.cs
@@ -25,16 +25,16 @@ namespace Ryujinx.Graphics.Shader.StructuredIr
             _infoTbl = new InstInfo[(int)Instruction.Count];
 
             //  Inst                                  Destination type     Source 1 type        Source 2 type        Source 3 type        Source 4 type
-            Add(Instruction.AtomicAdd,                VariableType.U32,    VariableType.U32,    VariableType.U32);
-            Add(Instruction.AtomicAnd,                VariableType.U32,    VariableType.U32,    VariableType.U32);
-            Add(Instruction.AtomicCompareAndSwap,     VariableType.U32,    VariableType.U32,    VariableType.U32,    VariableType.U32);
-            Add(Instruction.AtomicMaxS32,             VariableType.S32,    VariableType.S32,    VariableType.S32);
-            Add(Instruction.AtomicMaxU32,             VariableType.U32,    VariableType.U32,    VariableType.U32);
-            Add(Instruction.AtomicMinS32,             VariableType.S32,    VariableType.S32,    VariableType.S32);
-            Add(Instruction.AtomicMinU32,             VariableType.U32,    VariableType.U32,    VariableType.U32);
-            Add(Instruction.AtomicOr,                 VariableType.U32,    VariableType.U32,    VariableType.U32);
-            Add(Instruction.AtomicSwap,               VariableType.U32,    VariableType.U32,    VariableType.U32);
-            Add(Instruction.AtomicXor,                VariableType.U32,    VariableType.U32,    VariableType.U32);
+            Add(Instruction.AtomicAdd,                VariableType.U32,    VariableType.S32,    VariableType.S32,    VariableType.U32);
+            Add(Instruction.AtomicAnd,                VariableType.U32,    VariableType.S32,    VariableType.S32,    VariableType.U32);
+            Add(Instruction.AtomicCompareAndSwap,     VariableType.U32,    VariableType.S32,    VariableType.S32,    VariableType.U32,    VariableType.U32);
+            Add(Instruction.AtomicMaxS32,             VariableType.S32,    VariableType.S32,    VariableType.S32,    VariableType.S32);
+            Add(Instruction.AtomicMaxU32,             VariableType.U32,    VariableType.S32,    VariableType.S32,    VariableType.U32);
+            Add(Instruction.AtomicMinS32,             VariableType.S32,    VariableType.S32,    VariableType.S32,    VariableType.S32);
+            Add(Instruction.AtomicMinU32,             VariableType.U32,    VariableType.S32,    VariableType.S32,    VariableType.U32);
+            Add(Instruction.AtomicOr,                 VariableType.U32,    VariableType.S32,    VariableType.S32,    VariableType.U32);
+            Add(Instruction.AtomicSwap,               VariableType.U32,    VariableType.S32,    VariableType.S32,    VariableType.U32);
+            Add(Instruction.AtomicXor,                VariableType.U32,    VariableType.S32,    VariableType.S32,    VariableType.U32);
             Add(Instruction.Absolute,                 VariableType.Scalar, VariableType.Scalar);
             Add(Instruction.Add,                      VariableType.Scalar, VariableType.Scalar, VariableType.Scalar);
             Add(Instruction.Ballot,                   VariableType.U32,    VariableType.Bool);
@@ -84,7 +84,7 @@ namespace Ryujinx.Graphics.Shader.StructuredIr
             Add(Instruction.LoadGlobal,               VariableType.U32,    VariableType.S32,    VariableType.S32);
             Add(Instruction.LoadLocal,                VariableType.U32,    VariableType.S32);
             Add(Instruction.LoadShared,               VariableType.U32,    VariableType.S32);
-            Add(Instruction.LoadStorage,              VariableType.U32,    VariableType.S32);
+            Add(Instruction.LoadStorage,              VariableType.U32,    VariableType.S32,    VariableType.S32);
             Add(Instruction.LogarithmB2,              VariableType.Scalar, VariableType.Scalar);
             Add(Instruction.LogicalAnd,               VariableType.Bool,   VariableType.Bool,   VariableType.Bool);
             Add(Instruction.LogicalExclusiveOr,       VariableType.Bool,   VariableType.Bool,   VariableType.Bool);
@@ -111,7 +111,7 @@ namespace Ryujinx.Graphics.Shader.StructuredIr
             Add(Instruction.StoreGlobal,              VariableType.None,   VariableType.S32,    VariableType.S32,    VariableType.U32);
             Add(Instruction.StoreLocal,               VariableType.None,   VariableType.S32,    VariableType.U32);
             Add(Instruction.StoreShared,              VariableType.None,   VariableType.S32,    VariableType.U32);
-            Add(Instruction.StoreStorage,             VariableType.None,   VariableType.S32,    VariableType.U32);
+            Add(Instruction.StoreStorage,             VariableType.None,   VariableType.S32,    VariableType.S32,    VariableType.U32);
             Add(Instruction.Subtract,                 VariableType.Scalar, VariableType.Scalar, VariableType.Scalar);
             Add(Instruction.SwizzleAdd,               VariableType.F32,    VariableType.F32,    VariableType.F32,    VariableType.S32);
             Add(Instruction.TextureSample,            VariableType.F32);
diff --git a/Ryujinx.Graphics.Shader/StructuredIr/StructuredProgram.cs b/Ryujinx.Graphics.Shader/StructuredIr/StructuredProgram.cs
index a81b3d12aa..a85fbae3d2 100644
--- a/Ryujinx.Graphics.Shader/StructuredIr/StructuredProgram.cs
+++ b/Ryujinx.Graphics.Shader/StructuredIr/StructuredProgram.cs
@@ -1,4 +1,5 @@
 using Ryujinx.Graphics.Shader.IntermediateRepresentation;
+using Ryujinx.Graphics.Shader.Translation;
 using System;
 using System.Collections.Generic;
 
@@ -80,7 +81,7 @@ namespace Ryujinx.Graphics.Shader.StructuredIr
                 }
                 else if (UsesStorage(inst))
                 {
-                    context.Info.SBuffers.Add(operation.Index);
+                    AddSBufferUse(context.Info.SBuffers, operation);
                 }
 
                 AstAssignment assignment;
@@ -159,7 +160,7 @@ namespace Ryujinx.Graphics.Shader.StructuredIr
             {
                 if (UsesStorage(inst))
                 {
-                    context.Info.SBuffers.Add(operation.Index);
+                    AddSBufferUse(context.Info.SBuffers, operation);
                 }
 
                 context.AddNode(new AstOperation(inst, operation.Index, sources));
@@ -170,10 +171,6 @@ namespace Ryujinx.Graphics.Shader.StructuredIr
             // decide which helper functions are needed on the final generated code.
             switch (operation.Inst)
             {
-                case Instruction.LoadGlobal:
-                case Instruction.StoreGlobal:
-                    context.Info.HelperFunctionsMask |= HelperFunctionsMask.GlobalMemory;
-                    break;
                 case Instruction.Shuffle:
                     context.Info.HelperFunctionsMask |= HelperFunctionsMask.Shuffle;
                     break;
@@ -192,6 +189,26 @@ namespace Ryujinx.Graphics.Shader.StructuredIr
             }
         }
 
+        // Records which storage buffer slots the given storage operation may touch.
+        private static void AddSBufferUse(HashSet<int> sBuffers, Operation operation)
+        {
+            Operand slotOperand = operation.GetSource(0);
+
+            if (slotOperand.Type != OperandType.Constant)
+            {
+                // A non-constant slot could refer to any storage buffer,
+                // so every slot is conservatively flagged as used.
+                for (int slot = 0; slot < GlobalMemory.StorageMaxCount; slot++)
+                {
+                    sBuffers.Add(slot);
+                }
+
+                return;
+            }
+
+            sBuffers.Add(slotOperand.Value);
+        }
+
         private static VariableType GetVarTypeFromUses(Operand dest)
         {
             HashSet<Operand> visited = new HashSet<Operand>();
diff --git a/Ryujinx.Graphics.Shader/Translation/EmitterContextInsts.cs b/Ryujinx.Graphics.Shader/Translation/EmitterContextInsts.cs
index df8867e9f7..d884cfdb34 100644
--- a/Ryujinx.Graphics.Shader/Translation/EmitterContextInsts.cs
+++ b/Ryujinx.Graphics.Shader/Translation/EmitterContextInsts.cs
@@ -6,54 +6,54 @@ namespace Ryujinx.Graphics.Shader.Translation
 {
     static class EmitterContextInsts
     {
-        public static Operand AtomicAdd(this EmitterContext context, Instruction mr, Operand a, Operand b)
+        public static Operand AtomicAdd(this EmitterContext context, Instruction mr, Operand a, Operand b, Operand c)
         {
-            return context.Add(Instruction.AtomicAdd | mr, Local(), a, b);
+            return context.Add(Instruction.AtomicAdd | mr, Local(), a, b, c);
         }
 
-        public static Operand AtomicAnd(this EmitterContext context, Instruction mr, Operand a, Operand b)
+        public static Operand AtomicAnd(this EmitterContext context, Instruction mr, Operand a, Operand b, Operand c)
         {
-            return context.Add(Instruction.AtomicAnd | mr, Local(), a, b);
+            return context.Add(Instruction.AtomicAnd | mr, Local(), a, b, c);
         }
 
-        public static Operand AtomicCompareAndSwap(this EmitterContext context, Instruction mr, Operand a, Operand b, Operand c)
+        public static Operand AtomicCompareAndSwap(this EmitterContext context, Instruction mr, Operand a, Operand b, Operand c, Operand d)
         {
-            return context.Add(Instruction.AtomicCompareAndSwap | mr, Local(), a, b, c);
+            return context.Add(Instruction.AtomicCompareAndSwap | mr, Local(), a, b, c, d);
         }
 
-        public static Operand AtomicMaxS32(this EmitterContext context, Instruction mr, Operand a, Operand b)
+        public static Operand AtomicMaxS32(this EmitterContext context, Instruction mr, Operand a, Operand b, Operand c)
         {
-            return context.Add(Instruction.AtomicMaxS32 | mr, Local(), a, b);
+            return context.Add(Instruction.AtomicMaxS32 | mr, Local(), a, b, c);
         }
 
-        public static Operand AtomicMaxU32(this EmitterContext context, Instruction mr, Operand a, Operand b)
+        public static Operand AtomicMaxU32(this EmitterContext context, Instruction mr, Operand a, Operand b, Operand c)
         {
-            return context.Add(Instruction.AtomicMaxU32 | mr, Local(), a, b);
+            return context.Add(Instruction.AtomicMaxU32 | mr, Local(), a, b, c);
         }
 
-        public static Operand AtomicMinS32(this EmitterContext context, Instruction mr, Operand a, Operand b)
+        public static Operand AtomicMinS32(this EmitterContext context, Instruction mr, Operand a, Operand b, Operand c)
         {
-            return context.Add(Instruction.AtomicMinS32 | mr, Local(), a, b);
+            return context.Add(Instruction.AtomicMinS32 | mr, Local(), a, b, c);
         }
 
-        public static Operand AtomicMinU32(this EmitterContext context, Instruction mr, Operand a, Operand b)
+        public static Operand AtomicMinU32(this EmitterContext context, Instruction mr, Operand a, Operand b, Operand c)
         {
-            return context.Add(Instruction.AtomicMinU32 | mr, Local(), a, b);
+            return context.Add(Instruction.AtomicMinU32 | mr, Local(), a, b, c);
         }
 
-        public static Operand AtomicOr(this EmitterContext context, Instruction mr, Operand a, Operand b)
+        public static Operand AtomicOr(this EmitterContext context, Instruction mr, Operand a, Operand b, Operand c)
         {
-            return context.Add(Instruction.AtomicOr | mr, Local(), a, b);
+            return context.Add(Instruction.AtomicOr | mr, Local(), a, b, c);
         }
 
-        public static Operand AtomicSwap(this EmitterContext context, Instruction mr, Operand a, Operand b)
+        public static Operand AtomicSwap(this EmitterContext context, Instruction mr, Operand a, Operand b, Operand c)
         {
-            return context.Add(Instruction.AtomicSwap | mr, Local(), a, b);
+            return context.Add(Instruction.AtomicSwap | mr, Local(), a, b, c);
         }
 
-        public static Operand AtomicXor(this EmitterContext context, Instruction mr, Operand a, Operand b)
+        public static Operand AtomicXor(this EmitterContext context, Instruction mr, Operand a, Operand b, Operand c)
         {
-            return context.Add(Instruction.AtomicXor | mr, Local(), a, b);
+            return context.Add(Instruction.AtomicXor | mr, Local(), a, b, c);
         }
 
         public static Operand Ballot(this EmitterContext context, Operand a)
@@ -461,9 +461,9 @@ namespace Ryujinx.Graphics.Shader.Translation
             return context.Add(Instruction.LoadConstant, Local(), a, b);
         }
 
-        public static Operand LoadGlobal(this EmitterContext context, Operand a)
+        public static Operand LoadGlobal(this EmitterContext context, Operand a, Operand b)
         {
-            return context.Add(Instruction.LoadGlobal, Local(), a);
+            return context.Add(Instruction.LoadGlobal, Local(), a, b);
         }
 
         public static Operand LoadLocal(this EmitterContext context, Operand a)
@@ -523,9 +523,9 @@ namespace Ryujinx.Graphics.Shader.Translation
             return context.Add(Instruction.ShuffleXor, Local(), a, b, c);
         }
 
-        public static Operand StoreGlobal(this EmitterContext context, Operand a, Operand b)
+        public static Operand StoreGlobal(this EmitterContext context, Operand a, Operand b, Operand c)
         {
-            return context.Add(Instruction.StoreGlobal, null, a, b);
+            return context.Add(Instruction.StoreGlobal, null, a, b, c);
         }
 
         public static Operand StoreLocal(this EmitterContext context, Operand a, Operand b)
diff --git a/Ryujinx.Graphics.Shader/Translation/GlobalMemory.cs b/Ryujinx.Graphics.Shader/Translation/GlobalMemory.cs
new file mode 100644
index 0000000000..4b5dbccb11
--- /dev/null
+++ b/Ryujinx.Graphics.Shader/Translation/GlobalMemory.cs
@@ -0,0 +1,56 @@
+using Ryujinx.Graphics.Shader.IntermediateRepresentation;
+
+namespace Ryujinx.Graphics.Shader.Translation
+{
+    // Shared constants and helpers used to locate the storage buffer
+    // descriptors and to identify global memory instructions.
+    static class GlobalMemory
+    {
+        private const int StorageDescsBaseOffset = 0x44; // In words.
+
+        public const int StorageDescSize = 4; // In words.
+        public const int StorageMaxCount = 16;
+
+        public const int StorageDescsSize = StorageDescSize * StorageMaxCount;
+
+        // Checks if the instruction is a global load or store, or an
+        // atomic operation whose memory region is the global one.
+        public static bool UsesGlobalMemory(Instruction inst)
+        {
+            if (inst == Instruction.LoadGlobal || inst == Instruction.StoreGlobal)
+            {
+                return true;
+            }
+
+            return inst.IsAtomic() && (inst & Instruction.MrMask) == Instruction.MrGlobal;
+        }
+
+        // Gets the constant buffer offset, in words, of the descriptor of a storage slot.
+        public static int GetStorageCbOffset(ShaderStage stage, int slot)
+        {
+            int baseOffset = GetStorageBaseCbOffset(stage);
+
+            return baseOffset + slot * StorageDescSize;
+        }
+
+        // Gets the constant buffer offset, in words, of the first storage
+        // descriptor of the stage, or 0 for a stage that is not listed.
+        public static int GetStorageBaseCbOffset(ShaderStage stage)
+        {
+            int tableIndex;
+
+            switch (stage)
+            {
+                case ShaderStage.Vertex:                 tableIndex = 0; break;
+                case ShaderStage.TessellationControl:    tableIndex = 1; break;
+                case ShaderStage.Compute:
+                case ShaderStage.TessellationEvaluation: tableIndex = 2; break;
+                case ShaderStage.Geometry:               tableIndex = 3; break;
+                case ShaderStage.Fragment:               tableIndex = 4; break;
+                default:                                 return 0;
+            }
+
+            return StorageDescsBaseOffset + tableIndex * StorageDescsSize;
+        }
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/Translation/Lowering.cs b/Ryujinx.Graphics.Shader/Translation/Lowering.cs
new file mode 100644
index 0000000000..9a17dd83e6
--- /dev/null
+++ b/Ryujinx.Graphics.Shader/Translation/Lowering.cs
@@ -0,0 +1,139 @@
+using Ryujinx.Graphics.Shader.IntermediateRepresentation;
+using System.Collections.Generic;
+
+using static Ryujinx.Graphics.Shader.IntermediateRepresentation.OperandHelper;
+using static Ryujinx.Graphics.Shader.Translation.GlobalMemory;
+
+namespace Ryujinx.Graphics.Shader.Translation
+{
+    // Pass that turns the global memory operations found on the blocks
+    // into storage buffer operations.
+    static class Lowering
+    {
+        // Runs the pass over all the operations of the supplied blocks.
+        public static void RunPass(BasicBlock[] blocks, ShaderConfig config)
+        {
+            for (int blkIndex = 0; blkIndex < blocks.Length; blkIndex++)
+            {
+                BasicBlock block = blocks[blkIndex];
+
+                for (LinkedListNode<INode> node = block.Operations.First; node != null; node = node.Next)
+                {
+                    if (!(node.Value is Operation operation))
+                    {
+                        continue;
+                    }
+
+                    if (UsesGlobalMemory(operation.Inst))
+                    {
+                        // Continue the iteration from the node that replaced this one.
+                        node = LowerGlobal(node, config);
+                    }
+                }
+            }
+        }
+
+        // Replaces a global memory operation with the equivalent storage operation.
+        // The storage buffer slot is selected at runtime, by testing the address
+        // against the range of every storage descriptor on constant buffer 0.
+        // Returns the node of the storage operation that replaced the original one.
+        private static LinkedListNode<INode> LowerGlobal(LinkedListNode<INode> node, ShaderConfig config)
+        {
+            Operation operation = (Operation)node.Value;
+
+            Operation storageOp;
+
+            // Inserts a new operation before the one being lowered, and returns its result.
+            Operand PrependOperation(Instruction inst, params Operand[] sources)
+            {
+                Operand local = Local();
+
+                node.List.AddBefore(node, new Operation(inst, local, sources));
+
+                return local;
+            }
+
+            Operand addrLow  = operation.GetSource(0);
+            Operand addrHigh = operation.GetSource(1);
+
+            // Values used when the address is not inside any of the storage buffers.
+            Operand sbBaseAddrLow = Const(0);
+            Operand sbSlot        = Const(0);
+
+            for (int slot = 0; slot < StorageMaxCount; slot++)
+            {
+                int cbOffset = GetStorageCbOffset(config.Stage, slot);
+
+                Operand baseAddrLow  = Cbuf(0, cbOffset);
+                Operand baseAddrHigh = Cbuf(0, cbOffset + 1);
+                Operand size         = Cbuf(0, cbOffset + 2);
+
+                Operand offset = PrependOperation(Instruction.Subtract,       addrLow, baseAddrLow);
+                Operand borrow = PrependOperation(Instruction.CompareLessU32, addrLow, baseAddrLow);
+
+                Operand inRangeLow = PrependOperation(Instruction.CompareLessU32, offset, size);
+
+                // NOTE(review): adding the comparison result presumably subtracts the borrow,
+                // which requires comparisons to produce -1 when true. Confirm.
+                Operand addrHighBorrowed = PrependOperation(Instruction.Add, addrHigh, borrow);
+
+                Operand inRangeHigh = PrependOperation(Instruction.CompareEqual, addrHighBorrowed, baseAddrHigh);
+
+                Operand inRange = PrependOperation(Instruction.BitwiseAnd, inRangeLow, inRangeHigh);
+
+                // Keep the base address and slot of the buffer that contains the address.
+                sbBaseAddrLow = PrependOperation(Instruction.ConditionalSelect, inRange, baseAddrLow, sbBaseAddrLow);
+                sbSlot        = PrependOperation(Instruction.ConditionalSelect, inRange, Const(slot), sbSlot);
+            }
+
+            // The offset is relative to the base address truncated to the storage buffer
+            // offset alignment of the capabilities, the same way GlobalToStorage does it.
+            Operand alignMask = Const(-config.Capabilities.StorageBufferOffsetAlignment);
+
+            Operand baseAddrTrunc = PrependOperation(Instruction.BitwiseAnd,    sbBaseAddrLow, alignMask);
+            Operand byteOffset    = PrependOperation(Instruction.Subtract,      addrLow, baseAddrTrunc);
+            Operand wordOffset    = PrependOperation(Instruction.ShiftRightU32, byteOffset, Const(2));
+
+            Operand[] sources = new Operand[operation.SourcesCount];
+
+            // The two address sources become the storage slot and word offset,
+            // the remaining sources are carried over unchanged.
+            sources[0] = sbSlot;
+            sources[1] = wordOffset;
+
+            for (int index = 2; index < operation.SourcesCount; index++)
+            {
+                sources[index] = operation.GetSource(index);
+            }
+
+            if (operation.Inst.IsAtomic())
+            {
+                Instruction inst = (operation.Inst & ~Instruction.MrMask) | Instruction.MrStorage;
+
+                storageOp = new Operation(inst, operation.Dest, sources);
+            }
+            else if (operation.Inst == Instruction.LoadGlobal)
+            {
+                storageOp = new Operation(Instruction.LoadStorage, operation.Dest, sources);
+            }
+            else
+            {
+                storageOp = new Operation(Instruction.StoreStorage, null, sources);
+            }
+
+            // Clear the sources of the operation that is about to be removed.
+            for (int index = 0; index < operation.SourcesCount; index++)
+            {
+                operation.SetSource(index, null);
+            }
+
+            LinkedListNode<INode> oldNode = node;
+
+            node = node.List.AddBefore(node, storageOp);
+
+            node.List.Remove(oldNode);
+
+            return node;
+        }
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/Translation/Optimizations/GlobalToStorage.cs b/Ryujinx.Graphics.Shader/Translation/Optimizations/GlobalToStorage.cs
index 2fafa5add9..639f9ba4b4 100644
--- a/Ryujinx.Graphics.Shader/Translation/Optimizations/GlobalToStorage.cs
+++ b/Ryujinx.Graphics.Shader/Translation/Optimizations/GlobalToStorage.cs
@@ -1,20 +1,16 @@
 using Ryujinx.Graphics.Shader.IntermediateRepresentation;
 using System.Collections.Generic;
 
+using static Ryujinx.Graphics.Shader.IntermediateRepresentation.OperandHelper;
+using static Ryujinx.Graphics.Shader.Translation.GlobalMemory;
+
 namespace Ryujinx.Graphics.Shader.Translation.Optimizations
 {
     static class GlobalToStorage
     {
-        private const int StorageDescsBaseOffset = 0x44; // In words.
-
-        private const int StorageDescSize = 4; // In words.
-        private const int StorageMaxCount = 16;
-
-        private const int StorageDescsSize  = StorageDescSize * StorageMaxCount;
-
-        public static void RunPass(BasicBlock block, ShaderStage stage)
+        public static void RunPass(BasicBlock block, ShaderConfig config)
         {
-            int sbStart = GetStorageBaseCbOffset(stage);
+            int sbStart = GetStorageBaseCbOffset(config.Stage);
 
             int sbEnd = sbStart + StorageDescsSize;
 
@@ -25,9 +21,7 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations
                     continue;
                 }
 
-                if (operation.Inst.IsAtomic() ||
-                    operation.Inst == Instruction.LoadGlobal ||
-                    operation.Inst == Instruction.StoreGlobal)
+                if (UsesGlobalMemory(operation.Inst))
                 {
                     Operand source = operation.GetSource(0);
 
@@ -37,44 +31,68 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations
 
                         if (storageIndex >= 0)
                         {
-                            node = ReplaceGlobalWithStorage(node, storageIndex);
+                            node = ReplaceGlobalWithStorage(node, config, storageIndex);
                         }
                     }
                 }
             }
         }
 
-        private static LinkedListNode<INode> ReplaceGlobalWithStorage(LinkedListNode<INode> node, int storageIndex)
+        private static LinkedListNode<INode> ReplaceGlobalWithStorage(LinkedListNode<INode> node, ShaderConfig config, int storageIndex)
         {
             Operation operation = (Operation)node.Value;
 
             Operation storageOp;
 
+            Operand GetStorageOffset()
+            {
+                Operand addrLow = operation.GetSource(0);
+
+                Operand baseAddrLow  = Cbuf(0, GetStorageCbOffset(config.Stage, storageIndex));
+
+                Operand baseAddrTrunc = Local();
+
+                Operand alignMask = Const(-config.Capabilities.StorageBufferOffsetAlignment);
+
+                Operation andOp = new Operation(Instruction.BitwiseAnd, baseAddrTrunc, baseAddrLow, alignMask);
+
+                node.List.AddBefore(node, andOp);
+
+                Operand byteOffset = Local();
+                Operand wordOffset = Local();
+
+                Operation subOp = new Operation(Instruction.Subtract,      byteOffset, addrLow, baseAddrTrunc);
+                Operation shrOp = new Operation(Instruction.ShiftRightU32, wordOffset, byteOffset, Const(2));
+
+                node.List.AddBefore(node, subOp);
+                node.List.AddBefore(node, shrOp);
+
+                return wordOffset;
+            }
+
+            Operand[] sources = new Operand[operation.SourcesCount];
+
+            sources[0] = Const(storageIndex);
+            sources[1] = GetStorageOffset();
+
+            for (int index = 2; index < operation.SourcesCount; index++)
+            {
+                sources[index] = operation.GetSource(index);
+            }
+
             if (operation.Inst.IsAtomic())
             {
-                Operand[] sources = new Operand[operation.SourcesCount];
-
-                for (int index = 0; index < operation.SourcesCount; index++)
-                {
-                    sources[index] = operation.GetSource(index);
-                }
-
                 Instruction inst = (operation.Inst & ~Instruction.MrMask) | Instruction.MrStorage;
 
-                storageOp = new Operation(inst, storageIndex, operation.Dest, sources);
+                storageOp = new Operation(inst, operation.Dest, sources);
             }
             else if (operation.Inst == Instruction.LoadGlobal)
             {
-                Operand source = operation.GetSource(0);
-
-                storageOp = new Operation(Instruction.LoadStorage, storageIndex, operation.Dest, source);
+                storageOp = new Operation(Instruction.LoadStorage, operation.Dest, sources);
             }
             else
             {
-                Operand src1 = operation.GetSource(0);
-                Operand src2 = operation.GetSource(1);
-
-                storageOp = new Operation(Instruction.StoreStorage, storageIndex, null, src1, src2);
+                storageOp = new Operation(Instruction.StoreStorage, null, sources);
             }
 
             for (int index = 0; index < operation.SourcesCount; index++)
@@ -84,7 +102,7 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations
 
             LinkedListNode<INode> oldNode = node;
 
-            node = node.List.AddAfter(node, storageOp);
+            node = node.List.AddBefore(node, storageOp);
 
             node.List.Remove(oldNode);
 
@@ -125,25 +143,5 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations
 
             return -1;
         }
-
-        public static int GetStorageCbOffset(ShaderStage stage, int slot)
-        {
-            return GetStorageBaseCbOffset(stage) + slot * StorageDescSize;
-        }
-
-        private static int GetStorageBaseCbOffset(ShaderStage stage)
-        {
-            switch (stage)
-            {
-                case ShaderStage.Compute:                return StorageDescsBaseOffset + 2 * StorageDescsSize;
-                case ShaderStage.Vertex:                 return StorageDescsBaseOffset;
-                case ShaderStage.TessellationControl:    return StorageDescsBaseOffset + 1 * StorageDescsSize;
-                case ShaderStage.TessellationEvaluation: return StorageDescsBaseOffset + 2 * StorageDescsSize;
-                case ShaderStage.Geometry:               return StorageDescsBaseOffset + 3 * StorageDescsSize;
-                case ShaderStage.Fragment:               return StorageDescsBaseOffset + 4 * StorageDescsSize;
-            }
-
-            return 0;
-        }
     }
 }
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Shader/Translation/Optimizations/Optimizer.cs b/Ryujinx.Graphics.Shader/Translation/Optimizations/Optimizer.cs
index 93d86541ff..c5db4678b7 100644
--- a/Ryujinx.Graphics.Shader/Translation/Optimizations/Optimizer.cs
+++ b/Ryujinx.Graphics.Shader/Translation/Optimizations/Optimizer.cs
@@ -7,11 +7,11 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations
 {
     static class Optimizer
     {
-        public static void Optimize(BasicBlock[] blocks, ShaderStage stage)
+        public static void RunPass(BasicBlock[] blocks, ShaderConfig config)
         {
             for (int blkIndex = 0; blkIndex < blocks.Length; blkIndex++)
             {
-                GlobalToStorage.RunPass(blocks[blkIndex], stage);
+                GlobalToStorage.RunPass(blocks[blkIndex], config);
             }
 
             bool modified;
diff --git a/Ryujinx.Graphics.Shader/Translation/Translator.cs b/Ryujinx.Graphics.Shader/Translation/Translator.cs
index 1c37fa70ed..b129be9397 100644
--- a/Ryujinx.Graphics.Shader/Translation/Translator.cs
+++ b/Ryujinx.Graphics.Shader/Translation/Translator.cs
@@ -47,7 +47,7 @@ namespace Ryujinx.Graphics.Shader.Translation
             return code.Slice(0, headerSize + (int)endAddress);
         }
 
-        public static ShaderProgram Translate(Span<byte> code, TranslationFlags flags)
+        public static ShaderProgram Translate(Span<byte> code, ShaderCapabilities capabilities, TranslationFlags flags)
         {
             bool compute   = (flags & TranslationFlags.Compute)   != 0;
             bool debugMode = (flags & TranslationFlags.DebugMode) != 0;
@@ -82,6 +82,7 @@ namespace Ryujinx.Graphics.Shader.Translation
 
             ShaderConfig config = new ShaderConfig(
                 stage,
+                capabilities,
                 flags,
                 maxOutputVertexCount,
                 outputTopology);
@@ -89,7 +90,7 @@ namespace Ryujinx.Graphics.Shader.Translation
             return Translate(ops, config, size);
         }
 
-        public static ShaderProgram Translate(Span<byte> vpACode, Span<byte> vpBCode, TranslationFlags flags)
+        public static ShaderProgram Translate(Span<byte> vpACode, Span<byte> vpBCode, ShaderCapabilities capabilities, TranslationFlags flags)
         {
             bool debugMode = (flags & TranslationFlags.DebugMode) != 0;
 
@@ -98,6 +99,7 @@ namespace Ryujinx.Graphics.Shader.Translation
 
             ShaderConfig config = new ShaderConfig(
                 header.Stage,
+                capabilities,
                 flags,
                 header.MaxOutputVertexCount,
                 header.OutputTopology);
@@ -107,20 +109,22 @@ namespace Ryujinx.Graphics.Shader.Translation
 
         private static ShaderProgram Translate(Operation[] ops, ShaderConfig config, int size)
         {
-            BasicBlock[] irBlocks = ControlFlowGraph.MakeCfg(ops);
+            BasicBlock[] blocks = ControlFlowGraph.MakeCfg(ops);
 
-            if (irBlocks.Length > 0)
+            if (blocks.Length > 0)
             {
-                Dominance.FindDominators(irBlocks[0], irBlocks.Length);
+                Dominance.FindDominators(blocks[0], blocks.Length);
 
-                Dominance.FindDominanceFrontiers(irBlocks);
+                Dominance.FindDominanceFrontiers(blocks);
 
-                Ssa.Rename(irBlocks);
+                Ssa.Rename(blocks);
 
-                Optimizer.Optimize(irBlocks, config.Stage);
+                Optimizer.RunPass(blocks, config);
+
+                Lowering.RunPass(blocks, config);
             }
 
-            StructuredProgramInfo sInfo = StructuredProgram.MakeStructuredProgram(irBlocks, config);
+            StructuredProgramInfo sInfo = StructuredProgram.MakeStructuredProgram(blocks, config);
 
             GlslProgram program = GlslGenerator.Generate(sInfo, config);
 
diff --git a/Ryujinx.ShaderTools/Program.cs b/Ryujinx.ShaderTools/Program.cs
index 6fa043a3c1..275da794d3 100644
--- a/Ryujinx.ShaderTools/Program.cs
+++ b/Ryujinx.ShaderTools/Program.cs
@@ -1,4 +1,5 @@
-using Ryujinx.Graphics.Shader.Translation;
+using Ryujinx.Graphics.Shader;
+using Ryujinx.Graphics.Shader.Translation;
 using System;
 using System.IO;
 
@@ -19,7 +20,7 @@ namespace Ryujinx.ShaderTools
 
                 byte[] data = File.ReadAllBytes(args[args.Length - 1]);
 
-                string code = Translator.Translate(data, flags).Code;
+                string code = Translator.Translate(data, ShaderCapabilities.Default, flags).Code;
 
                 Console.WriteLine(code);
             }