94 files changed, 2634 insertions, 839 deletions
diff --git a/CLAUDE.md b/CLAUDE.md
index de04208..3d41cb7 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -32,6 +32,12 @@
 #
 # Testing & Tools:
 #   doc/test_demo_README.md - test_demo tool documentation
+#
+# Architecture & Reference:
+#   doc/ARCHITECTURE.md - Detailed system architecture
+#   doc/CODING_STYLE.md - Code style examples
+#   doc/BACKLOG.md - Untriaged future goals
+#   doc/TOOLS_REFERENCE.md - Developer tools reference
 
 # ============================================
 # TIER 4: HISTORICAL ARCHIVE (Load Rarely)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2f939bc..fb6beef 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -593,6 +593,22 @@ if(DEMO_BUILD_TESTS)
     target_link_libraries(test_texture_manager PRIVATE 3d gpu audio procedural util ${DEMO_LIBS})
     add_dependencies(test_texture_manager generate_demo_assets)
 
+    # GPU Procedural Texture Test
+    add_demo_test(test_gpu_procedural GpuProceduralTest
+        src/tests/test_gpu_procedural.cc
+        ${PLATFORM_SOURCES}
+        ${GEN_DEMO_CC})
+    target_link_libraries(test_gpu_procedural PRIVATE 3d gpu audio procedural util ${DEMO_LIBS})
+    add_dependencies(test_gpu_procedural generate_demo_assets)
+
+    # GPU Composite Texture Test (Phase 4)
+    add_demo_test(test_gpu_composite GpuCompositeTest
+        src/tests/test_gpu_composite.cc
+        ${PLATFORM_SOURCES}
+        ${GEN_DEMO_CC})
+    target_link_libraries(test_gpu_composite PRIVATE 3d gpu audio procedural util ${DEMO_LIBS})
+    add_dependencies(test_gpu_composite generate_demo_assets)
+
     # Gantt chart output test (bash script)
     add_test(
         NAME GanttOutputTest
@@ -612,46 +628,42 @@ if(DEMO_BUILD_TESTS)
     )
 endif()
 
-#-- - Extra Tools -- - 
-if(DEMO_BUILD_TOOLS OR DEMO_BUILD_TESTS)
-    add_demo_executable(spectool tools/spectool.cc ${PLATFORM_SOURCES} ${GEN_DEMO_CC} ${GENERATED_MUSIC_DATA_CC})
-    target_compile_definitions(spectool PRIVATE DEMO_BUILD_TOOLS)
-    target_link_libraries(spectool PRIVATE audio util procedural ${DEMO_LIBS})
-    add_dependencies(spectool generate_tracker_music generate_demo_assets)
+# Sub-task 7: Integrate validation tool into CI/build system
 
-    add_executable(specview tools/specview.cc)
+# Ensure the Python validation script is available
+add_custom_target(validate_uniforms_script ALL DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/tools/validate_uniforms.py)
 
-    add_demo_executable(specplay tools/specplay.cc ${PLATFORM_SOURCES} ${GEN_DEMO_CC} ${GENERATED_MUSIC_DATA_CC})
-    target_link_libraries(specplay PRIVATE audio util ${DEMO_LIBS})
-    add_dependencies(specplay generate_demo_assets)
-endif()
+# Find all WGSL files recursively under src/gpu and under the shader directory passed to the validator
+file(GLOB_RECURSE WGSL_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/src/gpu/*.wgsl ${CMAKE_CURRENT_SOURCE_DIR}/assets/final/shaders/*.wgsl)
 
-#-- - Global Target Configuration -- -
-# NOTE: "final" target moved to line ~329 (FINAL_STRIP build)
-# Old "final" target (gen_assets + crunch_demo) removed - run scripts manually
+# List of C++ files containing uniform struct definitions and shader code
+# Add more C++ files here if new effects with structs are added.
+set(VALIDATION_CPP_FILES
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/gpu/effects/heptagon_effect.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/gpu/effects/post_process_helper.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/gpu/effects/fade_effect.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/gpu/effects/theme_modulation_effect.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/gpu/effects/chroma_aberration_effect.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/gpu/effects/vignette_effect.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/gpu/effects/gaussian_blur_effect.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/gpu/effects/distort_effect.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/gpu/demo_effects.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/gpu/effects/circle_mask_effect.h
+)
 
-add_custom_target(pack_source
-    COMMAND tar -czf demo_all.tgz --exclude=.git --exclude=build* --exclude=.gemini* --exclude=*.tgz --exclude=*.zip --exclude=.DS_Store .
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+# Run the uniform validator. The stamp file is touched only after the validator succeeds,
+# so a failed validation never leaves an up-to-date output that would skip the next run.
+# Re-runs when the script file, any globbed WGSL file, or a listed C++ file changes.
+set(VALIDATION_FLAG ${CMAKE_CURRENT_BINARY_DIR}/uniform_validation_complete.flag)
+add_custom_command(
+    OUTPUT ${VALIDATION_FLAG}
+    COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/tools/validate_uniforms.py ${CMAKE_CURRENT_SOURCE_DIR}/assets/final/shaders ${VALIDATION_CPP_FILES}
+    COMMAND ${CMAKE_COMMAND} -E touch ${VALIDATION_FLAG}
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/tools/validate_uniforms.py ${WGSL_FILES} ${VALIDATION_CPP_FILES}
+    COMMENT "Validating uniform buffer sizes and alignments..." VERBATIM
 )
 
-#-- - Configuration Summary -- -
-message(STATUS "")
-message(STATUS "═══════════════════════════════════════════════════════════")
-message(STATUS "  64k Demo Project - Configuration Summary")
-message(STATUS "═══════════════════════════════════════════════════════════")
-message(STATUS "")
-message(STATUS "Build Options:")
-message(STATUS "  DEMO_SIZE_OPT:           ${DEMO_SIZE_OPT}")
-message(STATUS "  DEMO_STRIP_ALL:          ${DEMO_STRIP_ALL}")
-message(STATUS "  DEMO_BUILD_TESTS:        ${DEMO_BUILD_TESTS}")
-message(STATUS "  DEMO_BUILD_TOOLS:        ${DEMO_BUILD_TOOLS}")
-message(STATUS "  DEMO_ENABLE_COVERAGE:    ${DEMO_ENABLE_COVERAGE}")
-message(STATUS "  DEMO_ENABLE_DEBUG_LOGS:  ${DEMO_ENABLE_DEBUG_LOGS}")
-message(STATUS "  DEMO_ALL_OPTIONS:        ${DEMO_ALL_OPTIONS}")
-message(STATUS "")
-message(STATUS "Build Type: ${CMAKE_BUILD_TYPE}")
-message(STATUS "C++ Compiler: ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
-message(STATUS "")
-message(STATUS "═══════════════════════════════════════════════════════════")
-message(STATUS "")
+# Add custom target that depends on the validation output flag
+add_custom_target(validate_uniforms ALL DEPENDS ${VALIDATION_FLAG})
+
+
diff --git a/GEMINI.md b/GEMINI.md
index a9de297..6fa2692 100644
--- a/GEMINI.md
+++ b/GEMINI.md
@@ -33,6 +33,12 @@
 #
 # Testing & Tools:
 #   @doc/test_demo_README.md - test_demo tool documentation
+#
+# Architecture & Reference:
+#   @doc/ARCHITECTURE.md - Detailed system architecture
+#   @doc/CODING_STYLE.md - Code style examples
+#   @doc/BACKLOG.md - Untriaged future goals
+#   @doc/TOOLS_REFERENCE.md - Developer tools reference
 
 # ============================================
 # TIER 4: HISTORICAL ARCHIVE (Load Rarely)
@@ -104,14 +110,15 @@ IMPORTANT:
     </artifact_trail>
 
     <recent_actions>
-        - Finished debugging and fixing the `DemoEffectsTest` SEGFAULT.
-        - Confirmed that all 33 tests are passing.
-        - Updated `GEMINI.md` to reflect the successful completion of Task #74 and set the stage for Task #75.
+        - Completed Task #75: WGSL Uniform Buffer Validation & Consolidation.
+        - Standardized uniform usage across effects (Distort, Fade, ThemeModulation, CircleMask).
+        - Created and integrated `tools/validate_uniforms.py` into the build system.
+        - Added `doc/UNIFORM_BUFFER_GUIDELINES.md` and updated `CONTRIBUTING.md`.
     </recent_actions>
 
     <task_state>
         1. [COMPLETED] Task #74: Fix `DemoEffectsTest` SEGFAULT.
-        2. [IN PROGRESS] Task #75: WGSL Uniform Buffer Validation & Consolidation.
+        2. [COMPLETED] Task #75: WGSL Uniform Buffer Validation & Consolidation.
         3. [PAUSED] Task #5: Spectral Brush Editor.
         4. [PAUSED] Task #18: 3D System Enhancements.
     </task_state>
diff --git a/PROJECT_CONTEXT.md b/PROJECT_CONTEXT.md
index 636f339..181bffc 100644
--- a/PROJECT_CONTEXT.md
+++ b/PROJECT_CONTEXT.md
@@ -1,154 +1,90 @@
 # 64k Demo Project
 
-Goal:
+## Goal
 - Produce a <=64k native demo binary
 - Same C++ codebase for Windows, macOS, Linux
 
-Graphics:
+## Graphics
 - WebGPU via wgpu-native
 - WGSL shaders
 - Hybrid rendering: Rasterized proxy geometry + SDF raymarching
 
-Audio:
+## Audio
 - 32 kHz, 16-bit stereo
 - Procedurally generated samples
 - Real-time additive synthesis from spectrograms (IDCT)
 - Variable tempo system with music time abstraction
 - Event-based pattern triggering for dynamic tempo scaling
 - Modifiable Loops and Patterns, w/ script to generate them (like a Tracker)
-- Unified AudioEngine for lifecycle management (eliminates initialization fragility)
+- Unified AudioEngine for lifecycle management
 
-Constraints:
+## Constraints
 - Size-sensitive
 - Minimal dependencies
 - Explicit control over all allocations
 
-Style:
+## Style
 - Demoscene
 - No engine abstractions
 
 ---
-## Project Roadmap
 
-**Note:** For detailed history of recently completed milestones, see `COMPLETED.md`.
+## Current Status
 
-### Current Status
-- Audio system: Sample-accurate synchronization achieved. Uses hardware playback time as master clock. Variable tempo support integrated. **Pipeline optimized (Task #72)**: Zero heap allocations per frame, direct ring buffer writes, explicit clipping. Comprehensive test coverage maintained.
+- Audio system: Sample-accurate synchronization. Hardware playback time as master clock. Variable tempo support. Pipeline optimized (Task #72): Zero heap allocations per frame, direct ring buffer writes. Comprehensive test coverage.
 - Build system: Optimized with proper asset dependency tracking
-- Shader system: **Parameterization complete**: UniformHelper template, per-frame dynamic params, .seq syntax support. Modular with comprehensive compilation tests. **WGSL composability improved**: Common utilities extracted (`math/common_utils.wgsl`) with 12 call sites deduplicated across renderer shaders.
-- 3D rendering: Hybrid SDF/rasterization with BVH acceleration and binary scene loader. **Object data loading and parsing pipeline enhanced for primitives (e.g., plane_distance).**
-- Asset pipeline: Blender export script and binary scene ingestion supported
-- Error handling: **Dual macro system**: `FATAL_XXX` for programming errors (abort), `CHECK_RETURN` for recoverable errors (graceful return). Messages stripped in STRIP_ALL builds.
-- Testing: **32/33 tests passing (97%)** - Uniform buffer alignment fixed (Task #74). DemoEffectsTest fails due to wgpu_native library bug (not project code).
+- Shader system: Parameterization complete (UniformHelper, .seq syntax). Modular with compilation tests. WGSL composability improved (`math/common_utils.wgsl`).
+- 3D rendering: Hybrid SDF/rasterization with BVH acceleration and binary scene loader. Object data loading pipeline enhanced.
+- Asset pipeline: Blender export script and binary scene ingestion
+- Error handling: Dual macro system (`FATAL_XXX` for programming errors, `CHECK_RETURN` for recoverable errors)
+- Testing: **32/33 tests passing (97%)** - DemoEffectsTest fails due to wgpu_native library bug
 
 ---
+
 ## Next Up
 
-- **Task #5: Spectral Brush Editor** [IN PROGRESS - February 6, 2026]
-    - Create web-based tool for procedurally tracing audio spectrograms
+- **Task #5: Spectral Brush Editor** [IN PROGRESS]
+    - Web-based tool for procedurally tracing audio spectrograms
     - Replace large .spec assets with tiny C++ code (50-100× compression)
-    - Phase 1: C++ runtime (`spectral_brush.h/cc` - Bezier curves + Gaussian profiles)
-    - Phase 2: Editor UI (HTML/JS canvas, dual-layer visualization, keyboard shortcuts)
-    - Phase 3: File I/O (load .wav/.spec, export procedural_params.txt + C++ code)
-    - See `doc/SPECTRAL_BRUSH_EDITOR.md` for complete design
-
-- **Task #72: Audio Pipeline Streamlining** [COMPLETED - February 8, 2026]
-    - ✅ Optimize data flow: Zero heap allocations per frame achieved
-    - ✅ Direct additive mixing: Ring buffer two-phase write API
-    - ✅ Precision: float32 internal pipeline with explicit clipping
+    - See TODO.md and `doc/SPECTRAL_BRUSH_EDITOR.md`
 
 - **Visuals & Content**
-    - [ ] **Task #52: Procedural SDF Font**: Minimal bezier/spline set for [A-Z, 0-9] and SDF rendering.
-    - [ ] **Task #53: Particles Shader Polish**: Improve visual quality of particles.
-    - [ ] **Task #55: SDF Random Planes Intersection**: Implement `sdPolyhedron` (crystal/gem shapes) via plane intersection.
+    - Task #52: Procedural SDF Font
+    - Task #53: Particles Shader Polish
+    - Task #55: SDF Random Planes Intersection
 
 - **Tooling & Optimization**
-    - [ ] **Task #54: Tracy Integration**: Integrate Tracy debugger for performance profiling.
-    - [x] **Task #39: Visual Debugging System**: Implemented wireframe primitives (Sphere, Cone, Cross, Trajectory) for debugging.
+    - Task #54: Tracy Integration
 
 ---
+
 ## Design Docs Quick Reference
 
 For detailed documentation, use Read tool to load specific docs:
 
-- **doc/TRACKER.md**: Audio pattern system with unit-less timing (1 unit = 4 beats). Text-based music score compiled to C++ runtime.
-- **doc/3D.md**: Hybrid SDF raymarching with BVH acceleration and Position Based Dynamics physics.
-- **doc/ASSET_SYSTEM.md**: Build-time asset packer with 16-byte alignment, enum-based O(1) retrieval, procedural generation support.
-- **doc/BUILD.md**: Multi-platform builds (Debug/STRIP_ALL/FINAL_STRIP), cross-compilation, size reporting.
-- **doc/SPECTRAL_BRUSH_EDITOR.md**: Web tool for tracing spectrograms with Bezier curves (50-100× compression).
-- **doc/SEQUENCE.md**: .seq timeline format with BPM notation, priority modifiers, Gantt visualization.
-- **doc/MASKING_SYSTEM.md**: Auxiliary texture registry for inter-effect screen-space partitioning.
-- **doc/SCENE_FORMAT.md**: Binary scene format (SCN1) with object transforms, physics, mesh references.
-- **doc/test_demo_README.md**: 16s audio/visual sync test tool with tempo variation and peak logging.
-- **doc/CONTEXT_MAINTENANCE.md**: Context hygiene protocol (archive to COMPLETED.md monthly, keep Tier 1 files lean).
+- **doc/TRACKER.md**: Audio pattern system with unit-less timing
+- **doc/3D.md**: Hybrid SDF raymarching with BVH acceleration
+- **doc/ASSET_SYSTEM.md**: Build-time asset packer with 16-byte alignment
+- **doc/BUILD.md**: Multi-platform builds (Debug/STRIP_ALL/FINAL_STRIP)
+- **doc/SPECTRAL_BRUSH_EDITOR.md**: Web tool for tracing spectrograms
+- **doc/SEQUENCE.md**: .seq timeline format with BPM notation
+- **doc/MASKING_SYSTEM.md**: Auxiliary texture registry
+- **doc/SCENE_FORMAT.md**: Binary scene format (SCN1)
+- **doc/test_demo_README.md**: 16s audio/visual sync test tool
+- **doc/CONTEXT_MAINTENANCE.md**: Context hygiene protocol
 
 ---
-## Future Goals
-- **Task #36: Blender Exporter**: Create script to export scenes to internal binary format. (Deprioritized)
-- **Task #21: Shader Optimization**
-    - [ ] Use macros or code generation to factorize common WGSL code (normals, bump, lighting).
-    - [ ] Implement Tri-planar mapping for better procedural textures.
-- [ ] **Task #18-B: GPU BVH & Shadows**: Optimize scene queries with a GPU-based BVH.
-- **Phase 2: Advanced Size Optimization**
-    - [ ] **Task #22: Windows Native Platform**: Replace GLFW with minimal native Windows API.
-    - [ ] **Task #28: Spectrogram Quantization**: Quantize spectrograms to logarithmic frequency and uint16_t.
-    - [ ] **Task #35: CRT Replacement**: Investigation and implementation of CRT-free entry point.
-
----
-*For a detailed list of all completed tasks, see the git history.*
 
 ## Recently Completed (February 2026)
 
-- **Uniform Buffer Alignment Fix** (February 9) - Task #74: Fixed WebGPU validation errors caused by WGSL `vec3<f32>` alignment mismatches. Changed circle_mask_compute.wgsl padding from `vec3<f32>` to three `f32` fields. Demo now runs with 0 validation errors. Test suite: 32/33 passing (97%).
-
-- **Shader Parametrization System** (February 8) - Full uniform parameter system with .seq syntax support. FlashEffect now supports dynamic color/decay parameters computed per-frame. Critical WGSL alignment bugfix (vec3 = 16-byte aligned). Size: ~400-500 bytes. See `doc/COMPLETED.md` for details.
+- **WGSL Uniform Buffer Validation (Task #75)** (Feb 9) - Standardized uniform buffer layout. Validation tool integrated into build. All effects use `CommonPostProcessUniforms` (binding 2) + effect-specific params (binding 3). Added `UNIFORM_BUFFER_GUIDELINES.md`.
 
-- **Extended Shader Parametrization** (February 8) - Task #73 (2/4 effects complete):
-  - ChromaAberrationEffect: Added offset_scale and angle parameters (diagonal/vertical aberration modes)
-  - GaussianBlurEffect: Added strength parameter (configurable blur radius)
-  - Both effects follow FlashEffect pattern (UniformHelper, params struct, .seq syntax)
-  - Size: ~200-300 bytes per effect
+- **Uniform Buffer Alignment (Task #74)** (Feb 9) - Fixed WGSL `vec3<f32>` alignment issues. Demo runs with 0 validation errors.
 
-- **WGSL Shader Composability** - Extracted common utilities to `math/common_utils.wgsl`:
-  - `transform_normal()` - 2 call sites (renderer_3d, mesh_render)
-  - `spherical_uv()` / `spherical_uv_from_dir()` - 8 call sites (renderer_3d, skybox)
-  - `grid_pattern()` - 2 call sites (renderer_3d)
-  - Size savings: ~200 bytes net
+- **Shader Parametrization (Task #73)** (Feb 8) - Full uniform parameter system with .seq syntax. FlashEffect, ChromaAberrationEffect, GaussianBlurEffect support dynamic parameters. Size: ~400-500 bytes.
 
-- **Test Suite Optimization** - JitteredAudioBackendTest: 3.5s → 0.07s (50x speedup)
-  - Reduced test duration and sleep times
-  - Full CI suite now <1 second
-
-- **CHECK_RETURN Macro System** - Error handling for recoverable errors:
-  - `CHECK_RETURN_IF()` - Simple validation with return
-  - `CHECK_RETURN_BEGIN/END` - Complex validation with cleanup
-  - `WARN_IF()` - Non-fatal warnings
-  - Applied to 5 call sites (asset_manager, test_demo)
-  - Size impact: ~500 bytes saved in STRIP_ALL builds
-
-## Architectural Overview
-
-### Hybrid 3D Renderer
-- **Core Idea**: Uses standard rasterization to draw proxy hulls (boxes), then raymarches inside the fragment shader to find the exact SDF surface.
-- **Transforms**: Uses `inv_model` matrices to perform all raymarching in local object space, handling rotation and non-uniform scaling correctly.
-- **Shadows**: Instance-based shadow casting with self-shadowing prevention (`skip_idx`).
-
-### Sequence & Effect System
-- **Effect**: Abstract base for visual elements. Supports `compute` and `render` phases.
-- **Sequence**: Timeline of effects with start/end times.
-- **MainSequence**: Top-level coordinator and framebuffer manager.
-- **seq_compiler**: Transpiles `assets/demo.seq` into C++ `timeline.cc`.
+---
 
-### Asset & Build System
-- **asset_packer**: Embeds binary assets (like `.spec` files) into C++ arrays.
-- **Runtime Manager**: O(1) retrieval with lazy procedural generation support.
-- **Automation**: `gen_assets.sh`, `build_win.sh`, and `check_all.sh` for multi-platform validation.
+For detailed architecture, see `doc/ARCHITECTURE.md`.
 
-### Audio Engine
-- **Synthesis**: Real-time additive synthesis from spectrograms via FFT-based IDCT (O(N log N)). Stereo output (32kHz, 16-bit, interleaved L/R). Uses orthonormal DCT-II/DCT-III transforms with Numerical Recipes reordering method.
-- **Variable Tempo**: Music time abstraction with configurable tempo_scale. Tempo changes don't affect pitch.
-- **Event-Based Tracker**: Individual TrackerEvents trigger as separate voices with dynamic beat calculation. Notes within patterns respect tempo scaling.
-- **Backend Abstraction**: `AudioBackend` interface with `MiniaudioBackend` (production), `MockAudioBackend` (testing), and `WavDumpBackend` (offline rendering).
-- **Dynamic Updates**: Double-buffered spectrograms for live thread-safe updates.
-- **Procedural Library**: Melodies and spectral filters (noise, comb) generated at runtime.
-- **Pattern System**: TrackerPatterns contain lists of TrackerEvents (beat, sample_id, volume, pan). Events trigger individually based on elapsed music time.
-\ No newline at end of file
+For completed tasks history, see `doc/COMPLETED.md` and git history.
diff --git a/TODO.md b/TODO.md
index 4b5819b..10f0661 100644
--- a/TODO.md
+++ b/TODO.md
@@ -2,153 +2,39 @@
 
 This file tracks prioritized tasks with detailed attack plans.
 
-**Note:** For a history of recently completed tasks, see `COMPLETED.md`.
-
-## Recently Completed (February 9, 2026)
-
-- [x] **Uniform Buffer Alignment (Task #74)**: Fixed WGSL struct alignment issues across multiple shaders:
-  - `circle_mask_compute.wgsl`: Changed `_pad: vec3<f32>` to three `f32` fields
-  - `fade_effect.cc`: Changed EffectParams padding from `vec3<f32>` to `_pad0/1/2: f32`
-  - `theme_modulation_effect.cc`: Same padding fix for EffectParams
-  - Fixed ODR violation in `demo_effects.h` (incomplete FadeEffect forward declaration)
-  - Renamed shadowing `uniforms_` members to `common_uniforms_`/`flash_uniforms_`
-  - Result: demo64k runs without crashes, 33/33 tests passing (100%)
-
-## Previously Completed (February 8, 2026)
-
-- [x] **Shader Parametrization System**: Full uniform parameter system with .seq syntax support. FlashEffect now supports color/decay parameters with per-frame animation. See `COMPLETED.md` for details.
-- [x] **ChromaAberrationEffect Parametrization**: Added offset_scale and angle parameters. Supports diagonal and vertical aberration modes via .seq syntax.
-- [x] **GaussianBlurEffect Parametrization**: Added strength parameter. Replaces hardcoded blur radius with configurable value.
-
----
-
-## Priority 1: Uniform Buffer Alignment (Task #74) [COMPLETED - February 9, 2026]
-
-**Goal**: Fix WebGPU uniform buffer size/padding/alignment mismatches between C++ structs and WGSL shaders.
-
-**Root Cause**: WGSL `vec3<f32>` has 16-byte alignment (not 12), causing struct padding mismatches. Using `vec3<f32>` for padding fields created unpredictable struct sizes.
-
-**Fixes Applied**:
-- `circle_mask_compute.wgsl`: Changed `_pad: vec3<f32>` to three separate `f32` fields
-  - Before: 24+ bytes in WGSL, 16 bytes in C++
-  - After: 16 bytes in both
-- Verified all shaders use individual `f32` fields for padding (no `vec3` in padding)
-
-**Results**:
-- ✅ demo64k: Runs with **0 WebGPU validation errors**
-- ✅ Test suite: **32/33 tests passing (97%)**
-- ❌ DemoEffectsTest: SEGFAULT in wgpu_native library (unrelated to alignment fixes)
-
-**Key Lesson**: Never use `vec3<f32>` for padding in WGSL uniform structs. Always use individual `f32` fields to ensure predictable alignment.
-
----
-
-## Priority 1: WGSL Uniform Buffer Validation & Consolidation (Task #75)
-
-**Goal**: Prevent alignment bugs by consolidating uniform buffer patterns and creating automated validation.
-
-**Background**: Recent bugs (Task #74) revealed WGSL `vec3<f32>` alignment issues causing 16-byte padding where 12 bytes expected. Need systematic approach to prevent recurrence.
-
-**Attack Plan**:
-
-### Phase 1: Audit & Document (1-2 hours)
-- [ ] **1.1**: Audit all WGSL shaders for uniform struct definitions
-  - List all uniform structs, their sizes, and padding strategies
-  - Identify inconsistencies (vec3 padding vs individual f32 fields)
-  - Document in `doc/UNIFORM_BUFFER_GUIDELINES.md`
-- [ ] **1.2**: Audit C++ struct definitions (CommonPostProcessUniforms, etc.)
-  - Verify static_assert size checks exist for all uniform structs
-  - Check for missing size validation
-
-### Phase 2: Consolidation (2-3 hours)
-- [ ] **2.1**: Standardize on CommonUniforms pattern
-  - All post-process effects should use CommonPostProcessUniforms for binding 2
-  - Effect-specific params at binding 3 (16 or 32 bytes, properly padded)
-- [ ] **2.2**: Eliminate `vec3<f32>` in padding fields
-  - Replace all `_pad: vec3<f32>` with `_pad0/1/2: f32`
-  - Apply to: FadeEffect, ThemeModulationEffect, any other effects
-- [ ] **2.3**: Add C++ wrapper structs with static_assert
-  - Every WGSL uniform struct should have matching C++ struct
-  - All structs require `static_assert(sizeof(...) == EXPECTED_SIZE)`
-
-### Phase 3: Validation Tool (3-4 hours)
-- [ ] **3.1**: Create `tools/validate_uniforms.py`
-  - Parse WGSL shader files for uniform struct definitions
-  - Calculate expected size using WGSL alignment rules:
-    - `f32`: 4-byte aligned
-    - `vec2<f32>`: 8-byte aligned
-    - `vec3<f32>`: **16-byte aligned** (not 12!)
-    - `vec4<f32>`: 16-byte aligned
-    - Struct size: rounded to largest member alignment
-- [ ] **3.2**: Parse C++ headers for matching structs
-  - Extract `sizeof()` from static_assert statements
-  - Match WGSL struct names to C++ struct names
-- [ ] **3.3**: Report mismatches
-  - Exit non-zero if C++ size != WGSL size
-  - Print detailed alignment breakdown for debugging
-- [ ] **3.4**: Integrate into CI/build system
-  - Add CMake custom command to run validation
-  - Fail build if validation fails (development builds only)
-  - Add to `scripts/check_all.sh`
-
-### Phase 4: Documentation (1 hour)
-- [ ] **4.1**: Write `doc/UNIFORM_BUFFER_GUIDELINES.md`
-  - Explain WGSL alignment rules (with examples)
-  - Document standard patterns (CommonUniforms, effect params)
-  - Show correct padding techniques
-  - Add examples of common mistakes
-- [ ] **4.2**: Update CONTRIBUTING.md
-  - Add "Uniform Buffer Checklist" section
-  - Require validation tool passes before commit
-
-**Size Impact**: Negligible (consolidation may save 50-100 bytes)
-
-**Priority**: High (prevents entire class of subtle bugs)
-
-**Dependencies**: None
+**Note:** For completed tasks, see `doc/COMPLETED.md`.
 
 ---
 
 ## Priority 1: Spectral Brush Editor (Task #5) [IN PROGRESS]
 
-**Goal:** Create a web-based tool for procedurally tracing audio spectrograms. Replaces large `.spec` binary assets with tiny procedural C++ code (50-100× compression).
+**Goal:** Web-based tool for procedurally tracing audio spectrograms. Replaces large `.spec` binary assets with tiny procedural C++ code (50-100× compression).
 
 **Design Document:** See `doc/SPECTRAL_BRUSH_EDITOR.md` for complete architecture.
 
-**Core Concept: "Spectral Brush"**
-- **Central Curve** (Bezier): Traces time-frequency path through spectrogram
-- **Vertical Profile**: Shapes "brush stroke" around curve (Gaussian, Decaying Sinusoid, Noise)
+**Core Concept:** Bezier curves trace time-frequency paths. Gaussian profiles shape "brush strokes" around curves.
 
-**Workflow:**
-```
-.wav → Load in editor → Trace with Bezier curves → Export procedural_params.txt + C++ code
-```
+**Workflow:** `.wav` → Load in editor → Trace with Bezier curves → Export `procedural_params.txt` + C++ code
 
 ### Phase 1: C++ Runtime (Foundation)
-- [ ] **Files:** `src/audio/spectral_brush.h`, `src/audio/spectral_brush.cc`
+- [ ] Files: `src/audio/spectral_brush.h`, `src/audio/spectral_brush.cc`
 - [ ] Define API (`ProfileType`, `draw_bezier_curve()`, `evaluate_profile()`)
 - [ ] Implement linear Bezier interpolation
 - [ ] Implement Gaussian profile evaluation
-- [ ] Implement home-brew deterministic RNG (for future noise support)
+- [ ] Implement home-brew deterministic RNG
 - [ ] Add unit tests (`src/tests/test_spectral_brush.cc`)
 - [ ] **Deliverable:** Compiles, tests pass
 
 ### Phase 2: Editor Core
-- [ ] **Files:** `tools/spectral_editor/index.html`, `script.js`, `style.css`, `dct.js` (reuse from old editor)
+- [ ] Files: `tools/spectral_editor/index.html`, `script.js`, `style.css`, `dct.js`
 - [ ] HTML structure (canvas, controls, file input)
 - [ ] Canvas rendering (dual-layer: reference + procedural)
-- [ ] Bezier curve editor (click to place, drag to adjust, delete control points)
+- [ ] Bezier curve editor (click, drag, delete control points)
 - [ ] Profile controls (Gaussian sigma slider)
 - [ ] Real-time spectrogram rendering
 - [ ] Audio playback (IDCT → Web Audio API)
-- [ ] Undo/Redo system (action history with snapshots)
-- [ ] **Keyboard shortcuts:**
-  - Key '1': Play procedural sound
-  - Key '2': Play original .wav
-  - Space: Play/pause
-  - Ctrl+Z: Undo
-  - Ctrl+Shift+Z: Redo
-  - Delete: Remove control point
+- [ ] Undo/Redo system
+- [ ] Keyboard shortcuts (1=play procedural, 2=play original, Space, Ctrl+Z, Delete)
 - [ ] **Deliverable:** Interactive editor, can trace .wav files
 
 ### Phase 3: File I/O
@@ -164,178 +50,74 @@ This file tracks prioritized tasks with detailed attack plans.
 - [ ] Decaying sinusoid profile (metallic sounds)
 - [ ] Noise profile (textured sounds)
 - [ ] Composite profiles (add/subtract/multiply)
-- [ ] Multi-dimensional Bezier ({freq, amplitude, decay, ...})
-- [ ] Frequency snapping (snap to musical notes)
-- [ ] Generic `gen_from_params()` code generation
 
-**Design Decisions:**
-- Linear Bezier interpolation (Phase 1), cubic later
-- Soft parameter limits in UI (not enforced)
-- Home-brew RNG (small, deterministic)
-- Single function per sound (generic loader later)
-- Start with Bezier + Gaussian only
+**Design Decisions:** Linear Bezier (Phase 1), cubic later. Soft parameter limits. Home-brew RNG. Single function per sound initially.
 
 **Size Impact:** 50-100× compression (5 KB .spec → ~100 bytes C++ code)
 
 ---
 
 ## Priority 2: 3D System Enhancements (Task #18)
-**Goal:** Establish a pipeline for importing complex 3D scenes to replace hardcoded geometry. **Progress:** C++ pipeline for loading and processing object-specific data (like plane_distance) is now in place. Shader integration for SDFs is pending.
 
+**Goal:** Establish pipeline for importing complex 3D scenes to replace hardcoded geometry.
 
-## Priority 3: WGSL Modularization (Task #50) [RECURRENT]
+**Progress:** C++ pipeline for loading object-specific data (plane_distance) is in place. Shader integration for SDFs pending.
 
-**Goal**: Refactor `ShaderComposer` and WGSL assets to support granular, reusable snippets and `#include` directives. This is an ongoing task to maintain shader code hygiene as new features are added.
+---
 
+## Priority 3: WGSL Modularization (Task #50) [RECURRENT]
 
+**Goal:** Refactor `ShaderComposer` and WGSL assets to support granular, reusable snippets. Ongoing task for shader code hygiene.
 
-## Phase 2: Size Optimization (Final Goal)
+### Sub-task: Split common_uniforms.wgsl (Low Priority)
+**Current:** `common_uniforms.wgsl` contains 4 structs (CommonUniforms, GlobalUniforms, ObjectData, ObjectsBuffer)
 
-- [ ] **Task #34: Full STL Removal**: Replace all remaining `std::vector`, `std::map`, and `std::string` usage with custom minimal containers or C-style arrays to allow for CRT replacement. (Minimal Priority - deferred to end).
+**Goal:** Split into separate files:
+- `common_uniforms/common.wgsl` - CommonUniforms only
+- `common_uniforms/global.wgsl` - GlobalUniforms only
+- `common_uniforms/object.wgsl` - ObjectData + ObjectsBuffer
 
-- [ ] **Task #22: Windows Native Platform**: Replace GLFW with direct Win32 API calls for the final 64k push.
+**Benefit:** Shaders only include what they need, reducing compiled size
 
-- [ ] **Task #28: Spectrogram Quantization**: Research optimal frequency bin distribution and implement quantization.
+**Impact:** Minimal (most shaders only use CommonUniforms)
 
-- [ ] **Task #35: CRT Replacement**: investigation and implementation of CRT-free entry point.
+**Priority:** Low (nice-to-have)
 
-## Future Goals & Ideas (Untriaged)
+### Sub-task: Type-safe shader composition (Low Priority)
+**Problem:** Recurrent error of forgetting `ShaderComposer::Get().Compose({}, code)` and using raw `code` directly. Runtime error only (crashes demo, tests may pass).
 
-### Audio Tools
-- [ ] **Task #64: specplay Enhancements**: Extend audio analysis tool with new features
-  - **Priority 1**: Spectral visualization (ASCII art), waveform display, frequency analysis, dynamic range
-  - **Priority 2**: Diff mode (compare .wav vs .spec), batch mode (CSV report, find clipping)
-  - **Priority 3**: WAV export (.spec → .wav), normalization
-  - **Priority 4**: Spectral envelope, harmonic analysis, onset detection
-  - **Priority 5**: Interactive mode (seek, loop, volume control)
-  - See `tools/specplay_README.md` for detailed feature list
+**Solution:** Use strong typing to make it a compile-time error:
+```cpp
+class ComposedShader {
+ private:
+  std::string code_;
+  friend class ShaderComposer;
+  explicit ComposedShader(std::string code) : code_(std::move(code)) {}
+ public:
+  const char* c_str() const { return code_.c_str(); }
+};
+```
+
+**Changes:**
+- `ShaderComposer::Compose()` returns `ComposedShader` instead of `std::string`
+- All shader creation functions take `const ComposedShader&` instead of `const char*`
+- Cannot pass raw string to shader functions (compile error)
 
-- [ ] **Task #65: Data-Driven Tempo Control**: Move tempo variation from code to data files
-  - **Current**: `g_tempo_scale` is hardcoded in `main.cc` with manual animation curves
-  - **Goal**: Define tempo curves in `.seq` or `.track` files for data-driven tempo control
-  - **Approach A**: Add TEMPO directive to `.seq` format
-    - Example: `TEMPO 0.0 1.0`, `TEMPO 10.0 2.0`, `TEMPO 20.0 1.0` (time, scale pairs)
-    - seq_compiler generates tempo curve array in timeline.cc
-  - **Approach B**: Add tempo column to music.track
-    - Each pattern trigger can specify tempo_scale override
-    - tracker_compiler generates tempo events in music_data.cc
-  - **Benefits**: Non-programmers can edit tempo, easier iteration, version control friendly
-  - **Priority**: Low (current hardcoded approach works, but less flexible)
+**Benefits:** Impossible to forget composition (type mismatch). Self-documenting API. Compile-time error.
 
-- [ ] **Task #67: DCT/FFT Performance Benchmarking**: Add timing measurements to audio tests
-  - **Goal**: Compare performance of different DCT/IDCT implementations
-  - **Location**: Add timing code to `test_dct.cc` or `test_fft.cc`
-  - **Measurements**:
-    - Reference IDCT/FDCT (naive O(N²) implementation)
-    - FFT-based DCT/IDCT (current O(N log N) implementation)
-    - Future x86_64 SIMD-optimized versions (when implemented)
-  - **Output Format**:
-    - Average time per transform (microseconds)
-    - Throughput (transforms per second)
-    - Speedup factor vs reference implementation
-  - **Test Sizes**: DCT_SIZE=512 (production), plus 128, 256, 1024 for scaling analysis
-  - **Implementation**:
-    - Use `std::chrono::high_resolution_clock` for timing
-    - Run each test 1000+ iterations to reduce noise
-    - Report min/avg/max times
-    - Guard with `#if !defined(STRIP_ALL)` to avoid production overhead
-  - **Benefits**: Quantify FFT speedup, validate SIMD optimizations, identify regressions
-  - **Priority**: Very Low (nice-to-have for future optimization work)
+**Trade-offs:** More verbose code. Small overhead (extra std::string copy, negligible).
 
-- [ ] **Task #69: Convert Audio Pipeline to Clipped Int16**: Use clipped int16 for all audio processing
-  - **Current**: Audio pipeline uses float32 throughout (generation, mixing, synthesis, output)
-  - **Goal**: Convert to clipped int16 for faster/easier processing and reduced memory footprint
-  - **Rationale**:
-    - Simpler arithmetic (no float operations)
-    - Smaller memory footprint (2 bytes vs 4 bytes per sample)
-    - Hardware-native format (most audio devices use int16)
-    - Eliminates float→int16 conversion at output stage
-    - Natural clipping behavior (overflow wraps/clips automatically)
-  - **Scope**:
-    - Output path: Definitely convert (backends, WAV dump)
-    - Synthesis: Consider keeping float32 for quality (IDCT produces float)
-    - Mixing: Could use int16 with proper overflow handling
-    - Asset storage: Already int16 in .spec files
-  - **Implementation Phases**:
-    1. **Phase 1: Output Only** (Minimal change, ~50 lines)
-       - Convert `synth_render()` output from float to int16
-       - Update `MiniaudioBackend` and `WavDumpBackend` to accept int16
-       - Keep all internal processing as float
-       - **Benefit**: Eliminates final conversion step
-    2. **Phase 2: Mixing Stage** (Moderate change, ~200 lines)
-       - Convert voice mixing to int16 arithmetic
-       - Add saturation/clipping logic
-       - Keep IDCT output as float, convert after synthesis
-       - **Benefit**: Faster mixing, reduced memory bandwidth
-    3. **Phase 3: Full Pipeline** (Large change, ~500+ lines)
-       - Convert spectrograms from float to int16 storage
-       - Modify IDCT to output int16 directly
-       - All synthesis in int16
-       - **Benefit**: Maximum size reduction and performance
-  - **Trade-offs**:
-    - Quality loss: 16-bit resolution vs 32-bit float precision
-    - Dynamic range: Limited to [-32768, 32767]
-    - Clipping: Must handle overflow carefully in mixing stage
-    - Code complexity: Saturation arithmetic more complex than float
-  - **Testing Requirements**:
-    - Verify no audible quality degradation
-    - Ensure clipping behavior matches float version
-    - Check mixing overflow doesn't cause artifacts
-    - Validate WAV dumps bit-identical to hardware output
-  - **Size Impact**:
-    - Phase 1: Negligible (~50 bytes)
-    - Phase 2: Small reduction (~100-200 bytes, faster code)
-    - Phase 3: Large reduction (50% memory, ~1-2KB code savings)
-  - **Priority**: Low (final optimization, after size budget is tight)
-  - **Notes**:
-    - This is a FINAL optimization task, only if 64k budget requires it
-    - Quality must be validated - may not be worth the trade-off
-    - Consider keeping float for procedural generation quality
+**Priority:** Low (recurrent but rare, easy to catch in testing)
 
-### Developer Tools
-- [ ] **Task #66: External Asset Loading for Debugging**: mmap() asset files instead of embedded data
-  - **Current**: All assets embedded in `assets_data.cc` (regenerate on every asset change)
-  - **Goal**: Load assets from external files in debug builds for faster iteration
-  - **Scope**: macOS only, non-STRIP_ALL builds only
-  - **Implementation**:
-    - Add `DEMO_ENABLE_EXTERNAL_ASSETS` CMake option
-    - Modify `GetAsset()` to check for external file first (e.g., `assets/final/<name>`)
-    - Use `mmap()` to map file into memory (replaces `uint8_t asset[]` array)
-    - Fallback to embedded data if file not found
-  - **Benefits**: Edit shaders/assets without regenerating assets_data.cc (~10s rebuild)
-  - **Trade-offs**: Adds runtime file I/O, only useful during development
-  - **Priority**: Low (current workflow acceptable, but nice-to-have for rapid iteration)
+---
 
-### Visual Effects
-- [ ] **Task #73: Extend Shader Parametrization** [IN PROGRESS - 2/4 complete]
-  - **Goal**: Extend uniform parameter system to ChromaAberrationEffect, GaussianBlurEffect, DistortEffect, SolarizeEffect
-  - **Pattern**: Follow FlashEffect implementation (UniformHelper, params struct, .seq syntax)
-  - **Completed**: ChromaAberrationEffect (offset_scale, angle), GaussianBlurEffect (strength)
-  - **Remaining**: DistortEffect, SolarizeEffect
-  - **Priority**: Medium (quality-of-life improvement for artists)
-  - **Estimated Impact**: ~200-300 bytes per effect
-- [ ] **Task #52: Procedural SDF Font**: Minimal bezier/spline set for [A-Z, 0-9] and SDF rendering.
-- [ ] **Task #55: SDF Random Planes Intersection**: Implement `sdPolyhedron` (crystal/gem shapes) via plane intersection.
-- [ ] **Task #54: Tracy Integration**: Integrate Tracy debugger for performance profiling.
-- [ ] **Task #58: Advanced Shader Factorization**: Further factorize WGSL code into smaller, reusable snippets.
-- [ ] **Task #59: Comprehensive RNG Library**: Add WGSL snippets for float/vec2/vec3 noise (Perlin, Gyroid, etc.) and random number generators.
-- [ ] **Task #60: OOP Refactoring**: Investigate if more C++ code can be made object-oriented without size penalty (vs functional style).
-- [ ] **Task #61: GPU Procedural Generation**: Implement system to generate procedural data (textures, geometry) on GPU and read back to CPU.
-- [ ] **Task #62: Physics Engine Enhancements (PBD & Rotation)**:
-    - [ ] **Task #62.1: Quaternion Rotation**: Implement quaternion-based rotation for `Object3D` and incorporate angular momentum into physics.
-    - [ ] **Task #62.2: Position Based Dynamics (PBD)**: Refactor solver to re-evaluate velocity after resolving all collisions and constraints.
-- [ ] **Task #63: Refactor large files**: Split `src/gpu/gpu.cc`, `src/3d/visual_debug.cc` and `src/gpu/effect.cc` into sub-functionalities. (`src/3d/renderer.cc` was also over 500 lines and was taken care of in the past)
+## Phase 2: Size Optimization (Final Goal)
 
-### Performance Optimization
-- [ ] **Task #70: SIMD x86_64 Implementation**: Implement critical functions using intrinsics for x86_64 platforms.
-  - **Goal**: Optimize hot paths for audio and procedural generation.
-  - **Scope**:
-    - IDCT/FDCT transforms
-    - Audio mixing and voice synthesis
-    - CPU-side procedural texture/geometry generation
-  - **Constraint**: Non-critical; fallback to generic C++ must be maintained.
-  - **Priority**: Very Low
+- [ ] **Task #34: Full STL Removal** - Replace remaining `std::vector`, `std::map`, `std::string` with custom containers
+- [ ] **Task #22: Windows Native Platform** - Replace GLFW with Win32 API
+- [ ] **Task #28: Spectrogram Quantization** - Research optimal frequency distribution
+- [ ] **Task #35: CRT Replacement** - Investigation and implementation of CRT-free entry
 
 ---
 
-## Future Goals
-\ No newline at end of file
+For untriaged future goals and ideas, see `doc/BACKLOG.md`.
diff --git a/assets/demo.seq b/assets/demo.seq
index 0dfb108..a8717f3 100644
--- a/assets/demo.seq
+++ b/assets/demo.seq
@@ -30,9 +30,10 @@ SEQUENCE 0b 0
   EFFECT + VignetteEffect 0 6 radius=0.6 softness=0.1
 
 SEQUENCE 2.0 0
-  EFFECT + CircleMaskEffect 0.0 2.0 0.35        # Priority 0 (mask generator, radius 0.35)
-  EFFECT + RotatingCubeEffect 0.0 2.0           # Priority 1 (renders inside circle)
-  EFFECT + GaussianBlurEffect 0.0 2.0 strength=2.0  # Priority 2 (post-process blur)
+  EFFECT + CircleMaskEffect 0.0 4.0 0.50        # Priority 0 mask generator
+  EFFECT + RotatingCubeEffect 0.0 4.0           # Priority 1 (renders inside circle)
+  EFFECT + GaussianBlurEffect 1.0 2.0 strength=1.0
+  EFFECT + GaussianBlurEffect 3.0 4.0 strength=2.0
 
 SEQUENCE 4b 0
   EFFECT - FlashCubeEffect 0.1 3.     # Priority -1
diff --git a/assets/final/demo_assets.txt b/assets/final/demo_assets.txt
index 05eee17..96f86f9 100644
--- a/assets/final/demo_assets.txt
+++ b/assets/final/demo_assets.txt
@@ -1,22 +1,22 @@
 # Asset Name, Compression Type, Filename/Placeholder, Description
 
 # --- Drum & Percussion Samples ---
-KICK_1, NONE, KICK_606.spec, "606 Kick"
-KICK_2, NONE, KICK_90S_2.spec, "90s Kick"
-SNARE_1, NONE, SNARE_808.spec, "808 Snare"
-SNARE_2, NONE, SNARE_909_TUNE_8.spec, "909 Snare"
-SNARE_3, NONE, SNARE_BLUE_ROOM.spec, "Snare Blue Room"
-HIHAT_1, NONE, HIHAT_CLOSED_DMX.spec, "DMX Closed Hi-hat"
-HIHAT_2, NONE, HIHAT_CLOSED_DUFF.spec, "Duff Closed Hi-hat"
-HIHAT_3, NONE, HIHAT_CLOSED_ER_1.spec, "ER-1 Closed Hi-hat"
-CRASH_1, NONE, CRASH_DMX.spec, "DMX Crash"
-RIDE_1, NONE, RIDE_CUP_1.spec, "Ride Cymbal"
-SPLASH_1, NONE, SPLASH_GROUNDED.spec, "Splash Cymbal"
+KICK_1, NONE, music/KICK_606.spec, "606 Kick"
+KICK_2, NONE, music/KICK_90S_2.spec, "90s Kick"
+SNARE_1, NONE, music/SNARE_808.spec, "808 Snare"
+SNARE_2, NONE, music/SNARE_909_TUNE_8.spec, "909 Snare"
+SNARE_3, NONE, music/SNARE_BLUE_ROOM.spec, "Snare Blue Room"
+HIHAT_1, NONE, music/HIHAT_CLOSED_DMX.spec, "DMX Closed Hi-hat"
+HIHAT_2, NONE, music/HIHAT_CLOSED_DUFF.spec, "Duff Closed Hi-hat"
+HIHAT_3, NONE, music/HIHAT_CLOSED_ER_1.spec, "ER-1 Closed Hi-hat"
+CRASH_1, NONE, music/CRASH_DMX.spec, "DMX Crash"
+RIDE_1, NONE, music/RIDE_CUP_1.spec, "Ride Cymbal"
+SPLASH_1, NONE, music/SPLASH_GROUNDED.spec, "Splash Cymbal"
 
 # --- Melodic Samples ---
-BASS_1, NONE, BASS_GUITAR_FEEL.spec, "Bass Guitar"
-BASS_2, NONE, BASS_SYNTH_1.spec, "Synth Bass 1"
-BASS_3, NONE, SYNTH_BASS_DISTORT.spec, "Distorted Synth Bass"
+BASS_1, NONE, music/BASS_GUITAR_FEEL.spec, "Bass Guitar"
+BASS_2, NONE, music/BASS_SYNTH_1.spec, "Synth Bass 1"
+BASS_3, NONE, music/SYNTH_BASS_DISTORT.spec, "Distorted Synth Bass"
 
 # --- Procedural Textures ---
 NOISE_TEX, PROC(gen_noise, 1234, 16), _, "Procedural noise texture for bump mapping"
@@ -52,6 +52,11 @@ SHADER_MESH, NONE, shaders/mesh_render.wgsl, "Mesh Rasterization Shader"
 MESH_CUBE, NONE, test_mesh.obj, "A simple cube mesh"
 DODECAHEDRON, NONE, dodecahedron.obj, "A dodecahedron mesh"
 SHADER_VIGNETTE, NONE, shaders/vignette.wgsl, "Vignette Shader"
+SHADER_COMPUTE_GEN_NOISE, NONE, shaders/compute/gen_noise.wgsl, "GPU Noise Compute Shader"
+SHADER_COMPUTE_GEN_PERLIN, NONE, shaders/compute/gen_perlin.wgsl, "GPU Perlin Noise Compute Shader"
+SHADER_COMPUTE_GEN_GRID, NONE, shaders/compute/gen_grid.wgsl, "GPU Grid Compute Shader"
+SHADER_COMPUTE_GEN_BLEND, NONE, shaders/compute/gen_blend.wgsl, "GPU Blend Composite Shader"
+SHADER_COMPUTE_GEN_MASK, NONE, shaders/compute/gen_mask.wgsl, "GPU Mask Composite Shader"
 CIRCLE_MASK_COMPUTE_SHADER, NONE, shaders/circle_mask_compute.wgsl, "Circle mask compute shader"
 CIRCLE_MASK_RENDER_SHADER, NONE, shaders/circle_mask_render.wgsl, "Circle mask render shader"
 MASKED_CUBE_SHADER, NONE, shaders/masked_cube.wgsl, "Masked cube shader"
diff --git a/assets/final/BASS_GUITAR_FEEL.spec b/assets/final/music/BASS_GUITAR_FEEL.spec
index 54f49a6..54f49a6 100644
--- a/assets/final/BASS_GUITAR_FEEL.spec
+++ b/assets/final/music/BASS_GUITAR_FEEL.spec
diff --git a/assets/final/BASS_SYNTH_1.spec b/assets/final/music/BASS_SYNTH_1.spec
index 33bc0a0..33bc0a0 100644
--- a/assets/final/BASS_SYNTH_1.spec
+++ b/assets/final/music/BASS_SYNTH_1.spec
diff --git a/assets/final/CRASH_DMX.spec b/assets/final/music/CRASH_DMX.spec
index 45ee52d..45ee52d 100644
--- a/assets/final/CRASH_DMX.spec
+++ b/assets/final/music/CRASH_DMX.spec
diff --git a/assets/final/HIHAT_CLOSED_DMX.spec b/assets/final/music/HIHAT_CLOSED_DMX.spec
index 8fce1d2..8fce1d2 100644
--- a/assets/final/HIHAT_CLOSED_DMX.spec
+++ b/assets/final/music/HIHAT_CLOSED_DMX.spec
diff --git a/assets/final/HIHAT_CLOSED_DUFF.spec b/assets/final/music/HIHAT_CLOSED_DUFF.spec
index f738271..f738271 100644
--- a/assets/final/HIHAT_CLOSED_DUFF.spec
+++ b/assets/final/music/HIHAT_CLOSED_DUFF.spec
diff --git a/assets/final/HIHAT_CLOSED_ER_1.spec b/assets/final/music/HIHAT_CLOSED_ER_1.spec
index bb03f5e..bb03f5e 100644
--- a/assets/final/HIHAT_CLOSED_ER_1.spec
+++ b/assets/final/music/HIHAT_CLOSED_ER_1.spec
diff --git a/assets/final/KICK_606.spec b/assets/final/music/KICK_606.spec
index 10af84a..10af84a 100644
--- a/assets/final/KICK_606.spec
+++ b/assets/final/music/KICK_606.spec
diff --git a/assets/final/KICK_90S_2.spec b/assets/final/music/KICK_90S_2.spec
index 126409e..126409e 100644
--- a/assets/final/KICK_90S_2.spec
+++ b/assets/final/music/KICK_90S_2.spec
diff --git a/assets/final/RIDE_CUP_1.spec b/assets/final/music/RIDE_CUP_1.spec
index 78867c1..78867c1 100644
--- a/assets/final/RIDE_CUP_1.spec
+++ b/assets/final/music/RIDE_CUP_1.spec
diff --git a/assets/final/SNARE_808.spec b/assets/final/music/SNARE_808.spec
index 2923c3c..2923c3c 100644
--- a/assets/final/SNARE_808.spec
+++ b/assets/final/music/SNARE_808.spec
diff --git a/assets/final/SNARE_909_TUNE_8.spec b/assets/final/music/SNARE_909_TUNE_8.spec
index 4693c05..4693c05 100644
--- a/assets/final/SNARE_909_TUNE_8.spec
+++ b/assets/final/music/SNARE_909_TUNE_8.spec
diff --git a/assets/final/SNARE_BLUE_ROOM.spec b/assets/final/music/SNARE_BLUE_ROOM.spec
index b24baa7..b24baa7 100644
--- a/assets/final/SNARE_BLUE_ROOM.spec
+++ b/assets/final/music/SNARE_BLUE_ROOM.spec
diff --git a/assets/final/SPLASH_GROUNDED.spec b/assets/final/music/SPLASH_GROUNDED.spec
index a919be4..a919be4 100644
--- a/assets/final/SPLASH_GROUNDED.spec
+++ b/assets/final/music/SPLASH_GROUNDED.spec
diff --git a/assets/final/SYNTH_BASS_DISTORT.spec b/assets/final/music/SYNTH_BASS_DISTORT.spec
index 33bc0a0..33bc0a0 100644
--- a/assets/final/SYNTH_BASS_DISTORT.spec
+++ b/assets/final/music/SYNTH_BASS_DISTORT.spec
diff --git a/assets/final/shaders/chroma_aberration.wgsl b/assets/final/shaders/chroma_aberration.wgsl
index bad3624..6c942b7 100644
--- a/assets/final/shaders/chroma_aberration.wgsl
+++ b/assets/final/shaders/chroma_aberration.wgsl
@@ -1,22 +1,14 @@
 @group(0) @binding(0) var smplr: sampler;
 @group(0) @binding(1) var txt: texture_2d<f32>;
 
-struct CommonUniforms {
-    resolution: vec2<f32>,
-    _pad0: f32,
-    _pad1: f32,
-    aspect_ratio: f32,
-    time: f32,
-    beat: f32,
-    audio_intensity: f32,
-};
-struct EffectParams {
+#include "common_uniforms"
+struct ChromaAberrationParams {
     offset_scale: f32,
     angle: f32,
 };
 
 @group(0) @binding(2) var<uniform> uniforms: CommonUniforms;
-@group(0) @binding(3) var<uniform> params: EffectParams;
+@group(0) @binding(3) var<uniform> params: ChromaAberrationParams;
 
 @vertex fn vs_main(@builtin(vertex_index) i: u32) -> @builtin(position) vec4<f32> {
     var pos = array<vec2<f32>, 3>(
diff --git a/assets/final/shaders/circle_mask_compute.wgsl b/assets/final/shaders/circle_mask_compute.wgsl
index 1ed6c1e..484d3dd 100644
--- a/assets/final/shaders/circle_mask_compute.wgsl
+++ b/assets/final/shaders/circle_mask_compute.wgsl
@@ -1,16 +1,8 @@
 // Circle mask compute shader
 // Generates a circular mask (1.0 inside, 0.0 outside)
 
-struct CommonUniforms {
-  resolution: vec2<f32>,
-  _pad0: f32,
-  _pad1: f32,
-  aspect_ratio: f32,
-  time: f32,
-  beat: f32,
-  audio_intensity: f32,
-};
-struct EffectParams {
+#include "common_uniforms"
+struct CircleMaskParams {
   radius: f32,
   _pad0: f32,
   _pad1: f32,
@@ -18,7 +10,7 @@ struct EffectParams {
 };
 
 @group(0) @binding(0) var<uniform> uniforms: CommonUniforms;
-@group(0) @binding(1) var<uniform> params: EffectParams;
+@group(0) @binding(1) var<uniform> params: CircleMaskParams;
 
 struct VSOutput {
   @builtin(position) position: vec4<f32>,
diff --git a/assets/final/shaders/circle_mask_render.wgsl b/assets/final/shaders/circle_mask_render.wgsl
index ce98f9c..cfa002e 100644
--- a/assets/final/shaders/circle_mask_render.wgsl
+++ b/assets/final/shaders/circle_mask_render.wgsl
@@ -4,15 +4,7 @@
 @group(0) @binding(0) var mask_tex: texture_2d<f32>;
 @group(0) @binding(1) var mask_sampler: sampler;
 
-struct CommonUniforms {
-  resolution: vec2<f32>,
-  _pad0: f32,
-  _pad1: f32,
-  aspect_ratio: f32,
-  time: f32,
-  beat: f32,
-  audio_intensity: f32,
-};
+#include "common_uniforms"
 
 @group(0) @binding(2) var<uniform> uniforms: CommonUniforms;
 
diff --git a/assets/final/shaders/compute/gen_blend.wgsl b/assets/final/shaders/compute/gen_blend.wgsl
new file mode 100644
index 0000000..9fc9e1e
--- /dev/null
+++ b/assets/final/shaders/compute/gen_blend.wgsl
@@ -0,0 +1,34 @@
+// This file is part of the 64k demo project.
+// GPU composite shader: Blend two textures.
+//
+// Each invocation writes mix(input_a, input_b, blend_factor) for one output texel.
+
+struct BlendParams {
+  width: u32,         // Width of the written region, in texels.
+  height: u32,        // Height of the written region, in texels.
+  blend_factor: f32,  // mix() weight: 0.0 selects input_a, 1.0 selects input_b.
+  _pad0: f32,         // Unused by this shader; brings the struct to 16 bytes.
+}
+
+@group(0) @binding(0) var output_tex: texture_storage_2d<rgba8unorm, write>;
+@group(0) @binding(1) var<uniform> params: BlendParams;
+@group(0) @binding(2) var input_a: texture_2d<f32>;
+@group(0) @binding(3) var input_b: texture_2d<f32>;
+@group(0) @binding(4) var tex_sampler: sampler;
+
+@compute @workgroup_size(8, 8, 1)
+fn main(@builtin(global_invocation_id) id: vec3<u32>) {
+  // Skip invocations outside the width x height output region.
+  if (id.x >= params.width || id.y >= params.height) { return; }
+
+  // Sample at texel centres (+0.5): id / size lands on the corner shared by
+  // four texels, so a filtering sampler would average neighbouring texels.
+  let size = vec2<f32>(f32(params.width), f32(params.height));
+  let uv = (vec2<f32>(id.xy) + vec2<f32>(0.5)) / size;
+
+  let color_a = textureSampleLevel(input_a, tex_sampler, uv, 0.0);
+  let color_b = textureSampleLevel(input_b, tex_sampler, uv, 0.0);
+  let blended = mix(color_a, color_b, params.blend_factor);
+
+  textureStore(output_tex, id.xy, blended);
+}
diff --git a/assets/final/shaders/compute/gen_grid.wgsl b/assets/final/shaders/compute/gen_grid.wgsl
new file mode 100644
index 0000000..cc5e189
--- /dev/null
+++ b/assets/final/shaders/compute/gen_grid.wgsl
@@ -0,0 +1,29 @@
+// GPU procedural grid pattern generator.
+// Draws white axis-aligned lines on black; spacing and line width come from GridParams.
+
+struct GridParams {
+  width: u32,      // Width of the written region, in texels.
+  height: u32,     // Height of the written region, in texels.
+  grid_size: u32,  // Distance between successive lines, in texels.
+  thickness: u32,  // Line width, in texels.
+}
+
+@group(0) @binding(0) var output_tex: texture_storage_2d<rgba8unorm, write>;
+@group(0) @binding(1) var<uniform> params: GridParams;
+
+@compute @workgroup_size(8, 8, 1)
+fn main(@builtin(global_invocation_id) id: vec3<u32>) {
+  // Skip invocations outside the width x height output region.
+  if (id.x >= params.width || id.y >= params.height) {
+    return;
+  }
+
+  // A texel is lit when its offset inside the current cell is below thickness on either axis.
+  let cell_offset = id.xy % vec2<u32>(params.grid_size);
+  let on_line = any(cell_offset < vec2<u32>(params.thickness));
+
+  // 1.0 on a line, 0.0 elsewhere.
+  let val = f32(on_line);
+
+  textureStore(output_tex, id.xy, vec4<f32>(val, val, val, 1.0));
+}
diff --git a/assets/final/shaders/compute/gen_mask.wgsl b/assets/final/shaders/compute/gen_mask.wgsl
new file mode 100644
index 0000000..1ce9f52
--- /dev/null
+++ b/assets/final/shaders/compute/gen_mask.wgsl
@@ -0,0 +1,32 @@
+// This file is part of the 64k demo project.
+// GPU composite shader: Multiply texture A by texture B (masking).
+//
+// Each invocation writes input_a * input_b (component-wise) for one output texel.
+
+struct MaskParams {
+  width: u32,   // Width of the written region, in texels.
+  height: u32,  // Height of the written region, in texels.
+}
+
+@group(0) @binding(0) var output_tex: texture_storage_2d<rgba8unorm, write>;
+@group(0) @binding(1) var<uniform> params: MaskParams;
+@group(0) @binding(2) var input_a: texture_2d<f32>;
+@group(0) @binding(3) var input_b: texture_2d<f32>;
+@group(0) @binding(4) var tex_sampler: sampler;
+
+@compute @workgroup_size(8, 8, 1)
+fn main(@builtin(global_invocation_id) id: vec3<u32>) {
+  // Skip invocations outside the width x height output region.
+  if (id.x >= params.width || id.y >= params.height) { return; }
+
+  // Sample at texel centres (+0.5): id / size lands on the corner shared by
+  // four texels, so a filtering sampler would average neighbouring texels.
+  let size = vec2<f32>(f32(params.width), f32(params.height));
+  let uv = (vec2<f32>(id.xy) + vec2<f32>(0.5)) / size;
+
+  let color_a = textureSampleLevel(input_a, tex_sampler, uv, 0.0);
+  let mask_b = textureSampleLevel(input_b, tex_sampler, uv, 0.0);
+  let masked = color_a * mask_b;
+
+  textureStore(output_tex, id.xy, masked);
+}
diff --git a/assets/final/shaders/compute/gen_noise.wgsl b/assets/final/shaders/compute/gen_noise.wgsl
new file mode 100644
index 0000000..5c0babd
--- /dev/null
+++ b/assets/final/shaders/compute/gen_noise.wgsl
@@ -0,0 +1,29 @@
+// GPU procedural noise texture generator.
+// One compute invocation per texel evaluates noise_2d and stores it as a grey value.
+
+#include "math/noise"
+
+struct NoiseParams {
+  width: u32,      // Width of the written region, in texels.
+  height: u32,     // Height of the written region, in texels.
+  seed: f32,       // Offset added to the noise coordinate.
+  frequency: f32,  // Scale applied to the normalised coordinate.
+}
+
+@group(0) @binding(0) var output_tex: texture_storage_2d<rgba8unorm, write>;
+@group(0) @binding(1) var<uniform> params: NoiseParams;
+
+@compute @workgroup_size(8, 8, 1)
+fn main(@builtin(global_invocation_id) id: vec3<u32>) {
+  // Skip invocations outside the width x height output region.
+  if (id.x >= params.width || id.y >= params.height) {
+    return;
+  }
+
+  // Normalised texel coordinate, scaled by frequency and shifted by seed.
+  let dims = vec2<f32>(f32(params.width), f32(params.height));
+  let coord = vec2<f32>(id.xy) / dims * params.frequency + params.seed;
+  let n = noise_2d(coord);
+
+  textureStore(output_tex, id.xy, vec4<f32>(vec3<f32>(n), 1.0));
+}
diff --git a/assets/final/shaders/compute/gen_perlin.wgsl b/assets/final/shaders/compute/gen_perlin.wgsl
new file mode 100644
index 0000000..73816d6
--- /dev/null
+++ b/assets/final/shaders/compute/gen_perlin.wgsl
@@ -0,0 +1,49 @@
+// GPU procedural Perlin noise texture generator.
+// Fractional Brownian Motion using value noise.
+
+#include "math/noise"
+
+struct PerlinParams {
+  width: u32,            // Width of the written region, in texels.
+  height: u32,           // Height of the written region, in texels.
+  seed: f32,             // Offset added to every octave's noise coordinate.
+  frequency: f32,        // Coordinate scale of the first octave; doubled per octave.
+  amplitude: f32,        // Weight of the first octave.
+  amplitude_decay: f32,  // Multiplier applied to the weight after each octave.
+  octaves: u32,          // Number of noise layers summed.
+  _pad0: f32,  // Padding for alignment
+}
+
+@group(0) @binding(0) var output_tex: texture_storage_2d<rgba8unorm, write>;
+@group(0) @binding(1) var<uniform> params: PerlinParams;
+
+@compute @workgroup_size(8, 8, 1)
+fn main(@builtin(global_invocation_id) id: vec3<u32>) {
+  // Skip invocations outside the width x height output region.
+  if (id.x >= params.width || id.y >= params.height) { return; }
+
+  let uv = vec2<f32>(f32(id.x) / f32(params.width),
+                      f32(id.y) / f32(params.height));
+
+  var value = 0.0;
+  var amplitude = params.amplitude;
+  var frequency = params.frequency;
+  var total_amp = 0.0;
+
+  for (var o: u32 = 0u; o < params.octaves; o++) {
+    let p = uv * frequency + params.seed;
+    value += noise_2d(p) * amplitude;
+    total_amp += amplitude;
+    frequency *= 2.0;
+    amplitude *= params.amplitude_decay;
+  }
+
+  // Normalise by the summed weights. Skip the division when the sum is zero
+  // (octaves == 0, or weights that cancel) so clamp() never receives NaN/Inf.
+  if (total_amp != 0.0) {
+    value /= total_amp;
+  }
+  let clamped = clamp(value, 0.0, 1.0);
+
+  textureStore(output_tex, id.xy, vec4<f32>(clamped, clamped, clamped, 1.0));
+}
diff --git a/assets/final/shaders/distort.wgsl b/assets/final/shaders/distort.wgsl
index cca01c4..5d35129 100644
--- a/assets/final/shaders/distort.wgsl
+++ b/assets/final/shaders/distort.wgsl
@@ -1,15 +1,15 @@
 @group(0) @binding(0) var smplr: sampler;
 @group(0) @binding(1) var txt: texture_2d<f32>;
 
-struct CommonUniforms {
-    resolution: vec2<f32>,
-    aspect_ratio: f32,
-    time: f32,
-    beat: f32,
-    audio_intensity: f32,
+#include "common_uniforms"
+
+struct DistortParams {
+    strength: f32,
+    speed: f32,
 };
 
 @group(0) @binding(2) var<uniform> uniforms: CommonUniforms;
+@group(0) @binding(3) var<uniform> params: DistortParams;
 
 @vertex fn vs_main(@builtin(vertex_index) i: u32) -> @builtin(position) vec4<f32> {
     var pos = array<vec2<f32>, 3>(
@@ -22,6 +22,6 @@ struct CommonUniforms {
 
 @fragment fn fs_main(@builtin(position) p: vec4<f32>) -> @location(0) vec4<f32> {
     let uv = p.xy / uniforms.resolution;
-    let dist = 0.1 * uniforms.audio_intensity * sin(uv.y * 20.0 + uniforms.time * 5.0);
+    let dist = params.strength * uniforms.audio_intensity * sin(uv.y * 20.0 + uniforms.time * params.speed * 5.0);
     return textureSample(txt, smplr, uv + vec2<f32>(dist, 0.0));
 }
diff --git a/assets/final/shaders/ellipse.wgsl b/assets/final/shaders/ellipse.wgsl
index 9c6b0d9..05dfcfc 100644
--- a/assets/final/shaders/ellipse.wgsl
+++ b/assets/final/shaders/ellipse.wgsl
@@ -1,12 +1,4 @@
-struct CommonUniforms {
-    resolution: vec2<f32>,
-    _pad0: f32,
-    _pad1: f32,
-    aspect_ratio: f32,
-    time: f32,
-    beat: f32,
-    audio_intensity: f32,
-};
+#include "common_uniforms"
 
 @group(0) @binding(0) var<uniform> uniforms: CommonUniforms;
 
diff --git a/assets/final/shaders/gaussian_blur.wgsl b/assets/final/shaders/gaussian_blur.wgsl
index 3b87b10..02156f7 100644
--- a/assets/final/shaders/gaussian_blur.wgsl
+++ b/assets/final/shaders/gaussian_blur.wgsl
@@ -1,22 +1,14 @@
 @group(0) @binding(0) var smplr: sampler;
 @group(0) @binding(1) var txt: texture_2d<f32>;
 
-struct CommonUniforms {
-    resolution: vec2<f32>,
-    _pad0: f32,
-    _pad1: f32,
-    aspect_ratio: f32,
-    time: f32,
-    beat: f32,
-    audio_intensity: f32,
-};
-struct EffectParams {
+#include "common_uniforms"
+struct GaussianBlurParams {
     strength: f32,
     _pad: f32,
 };
 
 @group(0) @binding(2) var<uniform> uniforms: CommonUniforms;
-@group(0) @binding(3) var<uniform> params: EffectParams;
+@group(0) @binding(3) var<uniform> params: GaussianBlurParams;
 
 @vertex fn vs_main(@builtin(vertex_index) i: u32) -> @builtin(position) vec4<f32> {
     var pos = array<vec2<f32>, 3>(
diff --git a/assets/final/shaders/main_shader.wgsl b/assets/final/shaders/main_shader.wgsl
index 7155a6d..ab0278c 100644
--- a/assets/final/shaders/main_shader.wgsl
+++ b/assets/final/shaders/main_shader.wgsl
@@ -1,12 +1,4 @@
-struct CommonUniforms {
-    resolution: vec2<f32>,
-    _pad0: f32,
-    _pad1: f32,
-    aspect_ratio: f32,
-    time: f32,
-    beat: f32,
-    audio_intensity: f32,
-};
+#include "common_uniforms"
 
 @group(0) @binding(0) var<uniform> uniforms: CommonUniforms;
 
diff --git a/assets/final/shaders/particle_compute.wgsl b/assets/final/shaders/particle_compute.wgsl
index 38a95e1..ae513c8 100644
--- a/assets/final/shaders/particle_compute.wgsl
+++ b/assets/final/shaders/particle_compute.wgsl
@@ -5,15 +5,7 @@ struct Particle {
     color: vec4<f32>,
 };
 
-struct CommonUniforms {
-    resolution: vec2<f32>,
-    _pad0: f32,
-    _pad1: f32,
-    aspect_ratio: f32,
-    time: f32,
-    beat: f32,
-    audio_intensity: f32,
-};
+#include "common_uniforms"
 
 @group(0) @binding(0) var<storage, read_write> particles: array<Particle>;
 @group(0) @binding(1) var<uniform> uniforms: CommonUniforms;
diff --git a/assets/final/shaders/particle_render.wgsl b/assets/final/shaders/particle_render.wgsl
index 9030a3a..6a2b636 100644
--- a/assets/final/shaders/particle_render.wgsl
+++ b/assets/final/shaders/particle_render.wgsl
@@ -5,15 +5,7 @@ struct Particle {
     color: vec4<f32>,
 };
 
-struct CommonUniforms {
-    resolution: vec2<f32>,
-    _pad0: f32,
-    _pad1: f32,
-    aspect_ratio: f32,
-    time: f32,
-    beat: f32,
-    audio_intensity: f32,
-};
+#include "common_uniforms"
 
 @group(0) @binding(0) var<storage, read> particles: array<Particle>;
 @group(0) @binding(1) var<uniform> uniforms: CommonUniforms;
diff --git a/assets/final/shaders/particle_spray_compute.wgsl b/assets/final/shaders/particle_spray_compute.wgsl
index b165971..a4041f2 100644
--- a/assets/final/shaders/particle_spray_compute.wgsl
+++ b/assets/final/shaders/particle_spray_compute.wgsl
@@ -5,15 +5,7 @@ struct Particle {
     color: vec4<f32>,
 };
 
-struct CommonUniforms {
-    resolution: vec2<f32>,
-    _pad0: f32,
-    _pad1: f32,
-    aspect_ratio: f32,
-    time: f32,
-    beat: f32,
-    audio_intensity: f32,
-};
+#include "common_uniforms"
 
 @group(0) @binding(0) var<storage, read_write> particles: array<Particle>;
 @group(0) @binding(1) var<uniform> uniforms: CommonUniforms;
diff --git a/assets/final/shaders/passthrough.wgsl b/assets/final/shaders/passthrough.wgsl
index dfdacf4..266e231 100644
--- a/assets/final/shaders/passthrough.wgsl
+++ b/assets/final/shaders/passthrough.wgsl
@@ -1,15 +1,7 @@
 @group(0) @binding(0) var smplr: sampler;
 @group(0) @binding(1) var txt: texture_2d<f32>;
 
-struct CommonUniforms {
-    resolution: vec2<f32>,
-    _pad0: f32,
-    _pad1: f32,
-    aspect_ratio: f32,
-    time: f32,
-    beat: f32,
-    audio_intensity: f32,
-};
+#include "common_uniforms"
 @group(0) @binding(2) var<uniform> uniforms: CommonUniforms;
 
 @vertex fn vs_main(@builtin(vertex_index) i: u32) -> @builtin(position) vec4<f32> {
diff --git a/assets/final/shaders/solarize.wgsl b/assets/final/shaders/solarize.wgsl
index 645fb9a..de15dfc 100644
--- a/assets/final/shaders/solarize.wgsl
+++ b/assets/final/shaders/solarize.wgsl
@@ -1,15 +1,7 @@
 @group(0) @binding(0) var smplr: sampler;
 @group(0) @binding(1) var txt: texture_2d<f32>;
 
-struct CommonUniforms {
-    resolution: vec2<f32>,
-    _pad0: f32,
-    _pad1: f32,
-    aspect_ratio: f32,
-    time: f32,
-    beat: f32,
-    audio_intensity: f32,
-};
+#include "common_uniforms"
 
 @group(0) @binding(2) var<uniform> uniforms: CommonUniforms;
 
diff --git a/assets/final/shaders/vignette.wgsl b/assets/final/shaders/vignette.wgsl
index 4b096d7..b129883 100644
--- a/assets/final/shaders/vignette.wgsl
+++ b/assets/final/shaders/vignette.wgsl
@@ -1,20 +1,14 @@
 @group(0) @binding(0) var input_sampler: sampler;
 @group(0) @binding(1) var input_tex: texture_2d<f32>;
-struct CommonUniforms {
-    resolution: vec2<f32>,
-    _pad0: f32,
-    _pad1: f32,
-    aspect_ratio: f32,
-    time: f32,
-    beat: f32,
-    audio_intensity: f32,
-};struct EffectParams {
+#include "common_uniforms"
+
+struct VignetteParams {
   radius: f32,
   softness: f32,
 };
 
 @group(0) @binding(2) var<uniform> common_uniforms: CommonUniforms;
-@group(0) @binding(3) var<uniform> params: EffectParams;
+@group(0) @binding(3) var<uniform> params: VignetteParams;
 
 @vertex
 fn vs_main(@builtin(vertex_index) vertex_idx: u32) -> @builtin(position) vec4<f32> {
diff --git a/assets/final/test_demo_assets.txt b/assets/final/test_demo_assets.txt
index d679237..dec8625 100644
--- a/assets/final/test_demo_assets.txt
+++ b/assets/final/test_demo_assets.txt
@@ -1,3 +1,3 @@
-KICK_1, NONE, KICK_606.spec, "606 Kick"
-SNARE_1, NONE, SNARE_808.spec, "808 Snare"
-CRASH_1, NONE, CRASH_DMX.spec, "DMX Crash"
+KICK_1, NONE, music/KICK_606.spec, "606 Kick"
+SNARE_1, NONE, music/SNARE_808.spec, "808 Snare"
+CRASH_1, NONE, music/CRASH_DMX.spec, "DMX Crash"
diff --git a/doc/ARCHITECTURE.md b/doc/ARCHITECTURE.md
new file mode 100644
index 0000000..1a32300
--- /dev/null
+++ b/doc/ARCHITECTURE.md
@@ -0,0 +1,60 @@
+# Architectural Overview
+
+Detailed system architecture for the 64k demo project.
+
+---
+
+## Hybrid 3D Renderer
+
+**Core Idea**: Uses standard rasterization to draw proxy hulls (boxes), then raymarches inside the fragment shader to find the exact SDF surface.
+
+**Transforms**: Uses `inv_model` matrices to perform all raymarching in local object space, handling rotation and non-uniform scaling correctly.
+
+**Shadows**: Instance-based shadow casting with self-shadowing prevention (`skip_idx`).
+
+---
+
+## Sequence & Effect System
+
+**Effect**: Abstract base for visual elements. Supports `compute` and `render` phases.
+
+**Sequence**: Timeline of effects with start/end times.
+
+**MainSequence**: Top-level coordinator and framebuffer manager.
+
+**seq_compiler**: Transpiles `assets/demo.seq` into C++ `timeline.cc`.
+
+---
+
+## Asset & Build System
+
+**asset_packer**: Embeds binary assets (like `.spec` files) into C++ arrays.
+
+**Runtime Manager**: O(1) retrieval with lazy procedural generation support.
+
+**Automation**: `gen_assets.sh`, `build_win.sh`, and `check_all.sh` for multi-platform validation.
+
+---
+
+## Audio Engine
+
+### Synthesis
+Real-time additive synthesis from spectrograms via FFT-based IDCT (O(N log N)). Stereo output (32kHz, 16-bit, interleaved L/R). Uses orthonormal DCT-II/DCT-III transforms with Numerical Recipes reordering method.
+
+### Variable Tempo
+Music time abstraction with configurable tempo_scale. Tempo changes don't affect pitch.
+
+### Event-Based Tracker
+Individual TrackerEvents trigger as separate voices with dynamic beat calculation. Notes within patterns respect tempo scaling.
+
+### Backend Abstraction
+`AudioBackend` interface with `MiniaudioBackend` (production), `MockAudioBackend` (testing), and `WavDumpBackend` (offline rendering).
+
+### Dynamic Updates
+Double-buffered spectrograms for live thread-safe updates.
+
+### Procedural Library
+Melodies and spectral filters (noise, comb) generated at runtime.
+
+### Pattern System
+TrackerPatterns contain lists of TrackerEvents (beat, sample_id, volume, pan). Events trigger individually based on elapsed music time.
diff --git a/doc/BACKLOG.md b/doc/BACKLOG.md
new file mode 100644
index 0000000..403ecc9
--- /dev/null
+++ b/doc/BACKLOG.md
@@ -0,0 +1,197 @@
+# Future Goals & Ideas (Untriaged)
+
+This file contains low-priority tasks and ideas that have not yet been triaged for active development.
+
+---
+
+## Audio Tools
+
+### Task #64: specplay Enhancements
+Extend audio analysis tool with new features:
+- **Priority 1**: Spectral visualization (ASCII art), waveform display, frequency analysis, dynamic range
+- **Priority 2**: Diff mode (compare .wav vs .spec), batch mode (CSV report, find clipping)
+- **Priority 3**: WAV export (.spec → .wav), normalization
+- **Priority 4**: Spectral envelope, harmonic analysis, onset detection
+- **Priority 5**: Interactive mode (seek, loop, volume control)
+
+See `tools/specplay_README.md` for detailed feature list.
+
+### Task #65: Data-Driven Tempo Control
+Move tempo variation from code to data files.
+
+**Current**: `g_tempo_scale` is hardcoded in `main.cc` with manual animation curves
+
+**Goal**: Define tempo curves in `.seq` or `.track` files
+
+**Approach A**: Add TEMPO directive to `.seq` format
+- Example: `TEMPO 0.0 1.0`, `TEMPO 10.0 2.0`, `TEMPO 20.0 1.0`
+- seq_compiler generates tempo curve array in timeline.cc
+
+**Approach B**: Add tempo column to music.track
+- Each pattern trigger can specify tempo_scale override
+- tracker_compiler generates tempo events in music_data.cc
+
+**Benefits**: Non-programmers can edit tempo, easier iteration, version control friendly
+
+**Priority**: Low (current approach works)
+
+### Task #67: DCT/FFT Performance Benchmarking
+Add timing measurements to audio tests.
+
+**Goal**: Compare performance of different DCT/IDCT implementations
+
+**Location**: Add timing code to `test_dct.cc` or `test_fft.cc`
+
+**Measurements**:
+- Reference IDCT/FDCT (naive O(N²))
+- FFT-based DCT/IDCT (current O(N log N))
+- Future x86_64 SIMD-optimized versions
+
+**Output Format**:
+- Average time per transform (microseconds)
+- Throughput (transforms per second)
+- Speedup factor vs reference
+
+**Test Sizes**: DCT_SIZE=512 (production), plus 128, 256, 1024 for scaling
+
+**Implementation**:
+- Use `std::chrono::high_resolution_clock`
+- Run 1000+ iterations to reduce noise
+- Report min/avg/max times
+- Guard with `#if !defined(STRIP_ALL)`
+
+**Priority**: Very Low (nice-to-have)
+
+### Task #69: Convert Audio Pipeline to Clipped Int16
+Use clipped int16 for all audio processing.
+
+**Current**: Float32 throughout (generation, mixing, synthesis, output)
+
+**Goal**: Convert to int16 for faster processing and reduced memory
+
+**Rationale**:
+- Simpler arithmetic (no float operations)
+- Smaller memory footprint (2 bytes vs 4 bytes)
+- Hardware-native format (most audio devices use int16)
+- Eliminates float→int16 conversion at output
+- Natural clipping behavior
+
+**Scope**:
+- Output path: Definitely convert (backends, WAV dump)
+- Synthesis: Consider keeping float32 for quality
+- Mixing: Could use int16 with overflow handling
+- Asset storage: Already int16 in .spec files
+
+**Implementation Phases**:
+1. **Phase 1: Output Only** (~50 lines) - Convert `synth_render()` output to int16
+2. **Phase 2: Mixing Stage** (~200 lines) - Convert voice mixing to int16 arithmetic
+3. **Phase 3: Full Pipeline** (~500+ lines) - Convert spectrograms to int16 storage
+
+**Trade-offs**:
+- Quality loss: 16-bit vs 32-bit float precision
+- Dynamic range: Limited to [-32768, 32767]
+- Clipping: Must handle overflow carefully
+- Code complexity: Saturation arithmetic
+
+**Testing Requirements**:
+- Verify no audible quality degradation
+- Ensure clipping behavior matches float version
+- Check mixing overflow doesn't cause artifacts
+- Validate that WAV dumps remain bit-identical
+
+**Size Impact**:
+- Phase 1: Negligible (~50 bytes)
+- Phase 2: ~100-200 bytes
+- Phase 3: ~50% memory reduction, ~1-2KB code savings
+
+**Priority**: Low (final optimization only if 64k budget requires it)
+
+**Notes**: Quality must be validated - may not be worth trade-off
+
+---
+
+## Developer Tools
+
+### Task #66: External Asset Loading for Debugging
+mmap() asset files instead of embedded data.
+
+**Current**: All assets embedded in `assets_data.cc` (regenerate on every change)
+
+**Goal**: Load assets from external files in debug builds for faster iteration
+
+**Scope**: macOS only, non-STRIP_ALL builds only
+
+**Implementation**:
+- Add `DEMO_ENABLE_EXTERNAL_ASSETS` CMake option
+- Modify `GetAsset()` to check for external file first (e.g., `assets/final/<name>`)
+- Use `mmap()` to map file into memory
+- Fallback to embedded data if file not found
+
+**Benefits**: Edit shaders/assets without regenerating assets_data.cc (~10s rebuild)
+
+**Trade-offs**: Adds runtime file I/O, only useful during development
+
+**Priority**: Low (current workflow acceptable)
+
+---
+
+## Visual Effects
+
+### Task #73: Extend Shader Parametrization [IN PROGRESS - 2/4 complete]
+Extend uniform parameter system to remaining effects.
+
+**Goal**: Add parametrization to DistortEffect, SolarizeEffect
+
+**Pattern**: Follow FlashEffect implementation (UniformHelper, params struct, .seq syntax)
+
+**Completed**: ChromaAberrationEffect (offset_scale, angle), GaussianBlurEffect (strength)
+
+**Priority**: Medium (quality-of-life for artists)
+
+**Estimated Impact**: ~200-300 bytes per effect
+
+### Task #52: Procedural SDF Font
+Minimal bezier/spline set for [A-Z, 0-9] and SDF rendering.
+
+### Task #55: SDF Random Planes Intersection
+Implement `sdPolyhedron` (crystal/gem shapes) via plane intersection.
+
+### Task #54: Tracy Integration
+Integrate the Tracy profiler for performance profiling.
+
+### Task #58: Advanced Shader Factorization
+Further factorize WGSL code into smaller, reusable snippets.
+
+### Task #59: Comprehensive RNG Library
+Add WGSL snippets for float/vec2/vec3 noise (Perlin, Gyroid, etc.) and random number generators.
+
+### Task #60: OOP Refactoring
+Investigate if more C++ code can be made object-oriented without size penalty (vs functional style).
+
+### Task #61: GPU Procedural Generation
+Implement system to generate procedural data (textures, geometry) on GPU and read back to CPU.
+
+### Task #62: Physics Engine Enhancements (PBD & Rotation)
+- **Task #62.1**: Quaternion rotation for `Object3D` with angular momentum
+- **Task #62.2**: Position Based Dynamics (PBD) - Re-evaluate velocity after resolving collisions/constraints
+
+### Task #63: Refactor Large Files
+Split `src/gpu/gpu.cc`, `src/3d/visual_debug.cc` and `src/gpu/effect.cc` into sub-functionalities.
+
+---
+
+## Performance Optimization
+
+### Task #70: SIMD x86_64 Implementation
+Implement critical functions using intrinsics for x86_64 platforms.
+
+**Goal**: Optimize hot paths for audio and procedural generation
+
+**Scope**:
+- IDCT/FDCT transforms
+- Audio mixing and voice synthesis
+- CPU-side procedural texture/geometry generation
+
+**Constraint**: Non-critical; fallback to generic C++ must be maintained
+
+**Priority**: Very Low
diff --git a/doc/CODING_STYLE.md b/doc/CODING_STYLE.md
new file mode 100644
index 0000000..533cffb
--- /dev/null
+++ b/doc/CODING_STYLE.md
@@ -0,0 +1,109 @@
+# Coding Style Examples
+
+Detailed examples for the project's C++ coding style.
+
+---
+
+## Core Rules Examples
+
+### Const Placement
+```cpp
+const T* name  // Correct - leading const, '*' attached to the type
+const T *name  // Wrong - '*' attached to the name
+```
+
+### Pre-Increment
+```cpp
+++x  // Correct
+x++  // Wrong (except when postfix needed)
+```
+
+### Operator Spacing
+```cpp
+x = (a + b) * c;  // Correct - spaces around all operators
+x=(a+b)*c;        // Wrong - no spaces
+```
+
+### No Auto (except complex iterators)
+```cpp
+int count = get_count();              // Correct
+auto count = get_count();             // Wrong
+
+for (auto it = map.begin(); ...)      // OK - complex iterator type
+```
+
+### No C++ Casts
+```cpp
+(int)value                            // Correct
+static_cast<int>(value)               // Wrong
+```
+
+---
+
+## Preprocessor Style
+
+```cpp
+#if defined(MY_TAG)
+  // code here
+#endif /* defined(MY_TAG) */
+```
+
+Always use `defined()` and add a closing comment that repeats the condition.
+
+---
+
+## Struct Initialization
+
+### Good
+```cpp
+const WGPUDescriptor desc = {
+  .format = g_format,
+  .dimension = WGPUTextureViewDimension_2D,
+};
+```
+
+### Bad
+```cpp
+WGPUDescriptor desc = {};
+desc.format = g_format;
+desc.dimension = WGPUTextureViewDimension_2D;
+```
+
+Use designated initializers, not field-by-field assignment.
+
+---
+
+## Class Keywords Indentation
+
+```cpp
+class MyClass {
+ public:   // 1 space indent
+  void foo();
+
+ private:  // 1 space indent
+  int field_;
+};
+```
+
+---
+
+## Comments
+
+### Function Comments
+```cpp
+// Initializes the audio engine with default settings.
+void audio_init() {
+  ...
+}
+```
+
+One-line comment for non-obvious functions.
+
+### File Headers
+```cpp
+// demo64k - 64 kilobyte demo
+// src/audio/synth.cc
+// Audio synthesis engine
+```
+
+Three-line header for all source files.
diff --git a/doc/COMPLETED.md b/doc/COMPLETED.md
index a3c173d..49cfbe9 100644
--- a/doc/COMPLETED.md
+++ b/doc/COMPLETED.md
@@ -29,7 +29,34 @@ Detailed historical documents have been moved to `doc/archive/` for reference:
 
 Use `read @doc/archive/FILENAME.md` to access archived documents.
 
-## Recently Completed (February 8, 2026)
+## Recently Completed (February 9, 2026)
+
+- [x] **WGSL Uniform Buffer Validation & Consolidation (Task #75)**
+    - **Goal**: Standardize uniform buffer usage across all post-process effects and add validation tooling
+    - **Implementation**:
+      - Refactored `DistortEffect` and others to use `CommonPostProcessUniforms` (binding 2) + `EffectParams` (binding 3)
+      - Created `tools/validate_uniforms.py` to parse C++ and WGSL (including embedded strings) and verify size/alignment
+      - Added validation step to CMake build system
+      - Renamed generic `EffectParams` to specific names (`FadeParams`, `CircleMaskParams`, etc.) in WGSL and C++
+      - Added `doc/UNIFORM_BUFFER_GUIDELINES.md` and updated `CONTRIBUTING.md`
+    - **Result**: Consistent binding layout across all effects, automatic validation on build
+
+- [x] **Uniform Buffer Alignment (Task #74)**
+    - **Goal**: Fix WGSL struct alignment issues causing validation errors and crashes
+    - **Implementation**:
+      - `circle_mask_compute.wgsl`: Changed `_pad: vec3<f32>` to three `f32` fields for correct 16-byte alignment
+      - `fade_effect.cc`: Changed EffectParams padding from `vec3<f32>` to `_pad0/1/2: f32`
+      - `theme_modulation_effect.cc`: Same padding fix for EffectParams
+      - Fixed ODR violation in `demo_effects.h` (incomplete FadeEffect forward declaration)
+      - Renamed shadowing `uniforms_` members to `common_uniforms_`/`flash_uniforms_`
+    - **Result**: demo64k runs without crashes, 32/33 tests passing (97%), 0 WebGPU validation errors
+
+- [x] **Fix test_demo Black Screen**
+    - **Issue**: `test_demo` showed black screen because it failed to load its timeline sequence (`assets/test_demo.seq`)
+    - **Fix**: Added missing `LoadTimeline` call in `src/test_demo.cc`
+    - **Result**: `FlashEffect` and `PeakMeterEffect` now render correctly
+
+## Previously Completed (February 8, 2026)
 
 - [x] **Shader Parametrization System (Task #73 Phase 0)** (February 8, 2026)
     - **Goal**: Enable per-frame dynamic parameters for shaders and effects via uniform buffers and .seq syntax
diff --git a/doc/CONTRIBUTING.md b/doc/CONTRIBUTING.md
index 3a09dbc..de6378a 100644
--- a/doc/CONTRIBUTING.md
+++ b/doc/CONTRIBUTING.md
@@ -1,5 +1,7 @@
 # Contributing Guidelines
 
+---
+
 ## Commit Policy
 
 ### Verify Before Committing
@@ -8,7 +10,6 @@
 ```bash
 ./scripts/check_all.sh
 ```
-Runs tests, builds tools, cross-compiles Windows.
 
 **Manual:**
 ```bash
@@ -26,18 +27,9 @@ cd build && ctest --output-on-failure
 cmake -S . -B build_debug_check -DDEMO_ENABLE_DEBUG_LOGS=ON
 cmake --build build_debug_check -j4
 ```
-Must compile without errors.
 
 **Debug macros** (`src/util/debug.h`):
-- `DEBUG_LOG_AUDIO`, `DEBUG_LOG_RING_BUFFER`, `DEBUG_LOG_TRACKER`
-- `DEBUG_LOG_SYNTH`, `DEBUG_LOG_3D`, `DEBUG_LOG_ASSETS`, `DEBUG_LOG_GPU`
-
-Example:
-```cpp
-#if defined(DEBUG_LOG_AUDIO)
-  DEBUG_AUDIO("[CALLBACK #%d] frames=%d\n", ++count, frames);
-#endif
-```
+- `DEBUG_LOG_AUDIO`, `DEBUG_LOG_RING_BUFFER`, `DEBUG_LOG_TRACKER`, `DEBUG_LOG_SYNTH`, `DEBUG_LOG_3D`, `DEBUG_LOG_ASSETS`, `DEBUG_LOG_GPU`
 
 ### Code Formatting
 ```bash
@@ -50,6 +42,8 @@ Never format `third_party/`.
 - 3-line header comment
 - Max 500 lines (split if larger)
 
+---
+
 ## Coding Style
 
 ### Core Rules
@@ -61,36 +55,9 @@ Never format `third_party/`.
 - No `auto` (except complex iterators)
 - No C++ casts (`static_cast`, `reinterpret_cast`)
 
-### Preprocessor
-```cpp
-#if defined(MY_TAG)
-  ...
-#endif /* defined(MY_TAG) */
-```
+See `doc/CODING_STYLE.md` for detailed examples.
 
-### Struct Initialization
-```cpp
-// Good
-const WGPUDescriptor desc = {
-  .format = g_format,
-  .dimension = WGPUTextureViewDimension_2D,
-};
-
-// Bad
-WGPUDescriptor desc = {};
-desc.format = g_format;
-desc.dimension = WGPUTextureViewDimension_2D;
-```
-
-### Class Keywords
-```cpp
- private:  // 1 space indent
-  int field_;
-```
-
-### Comments
-- 1-line comment for non-obvious functions
-- 3-line header for all source files
+---
 
 ## Development Protocols
 
@@ -170,4 +137,18 @@ After hierarchy changes (moving files, renaming), verify:
 ./scripts/gen_coverage_report.sh
 ```
 
-Update scripts with hardcoded paths.
+---
+
+## Uniform Buffer Checklist
+
+To ensure consistency and prevent alignment-related issues:
+
+1. **Define WGSL Structs:** Pay attention to type alignment (`f32`, `vec2`, `vec3`, `vec4`) and use explicit padding where necessary.
+2. **Mirror in C++:** Create corresponding C++ structs that mirror WGSL definitions.
+3. **`static_assert` for Size:** Every C++ struct must have a `static_assert` verifying size matches WGSL.
+4. **Standard Bindings:**
+   - **Binding 2:** Always use `CommonPostProcessUniforms` for per-frame data (resolution, time, beat).
+   - **Binding 3:** Use effect-specific parameter structs for unique data.
+5. **Shader Consistency:** Ensure WGSL shaders correctly declare uniforms at specified bindings.
+6. **Validation Script:** Run `tools/validate_uniforms.py` to catch discrepancies.
+7. **Documentation:** Refer to `doc/UNIFORM_BUFFER_GUIDELINES.md` for detailed alignment rules.
diff --git a/doc/GPU_PROCEDURAL_PHASE4.md b/doc/GPU_PROCEDURAL_PHASE4.md
new file mode 100644
index 0000000..4cfc271
--- /dev/null
+++ b/doc/GPU_PROCEDURAL_PHASE4.md
@@ -0,0 +1,70 @@
+# GPU Procedural Phase 4: Texture Composition
+
+**Status:** ✅ Complete
+
+## Implementation
+
+Multi-input composite shaders with configurable sampler support.
+
+### API
+
+```cpp
+enum class SamplerType {
+  LinearClamp, LinearRepeat, NearestClamp, NearestRepeat
+};
+
+void create_gpu_composite_texture(
+    const std::string& name,
+    const std::string& shader_func,
+    const char* shader_code,
+    const void* uniform_data,
+    size_t uniform_size,
+    int width, int height,
+    const std::vector<std::string>& input_names,
+    SamplerType sampler = SamplerType::LinearClamp);
+```
+
+### Shaders
+
+**gen_blend.wgsl** - Blend two textures with lerp factor:
+- Bindings: output (0), uniform (1), input_a (2), input_b (3), sampler (4)
+- Uniform: `{u32 width, height; f32 blend_factor, _pad0}`
+
+**gen_mask.wgsl** - Multiply textures (masking):
+- Bindings: output (0), uniform (1), input_a (2), input_b (3), sampler (4)
+- Uniform: `{u32 width, height}`
+
+### Usage
+
+```cpp
+extern const char* gen_blend_compute_wgsl;
+
+struct { uint32_t width, height; float blend_factor, _pad0; } uni = {256, 256, 0.5f, 0.0f};
+
+tex_mgr.create_gpu_composite_texture(
+    "blended", "gen_blend", gen_blend_compute_wgsl,
+    &uni, sizeof(uni), 256, 256,
+    {"noise_a", "noise_b"},
+    SamplerType::LinearClamp);
+```
+
+### Features
+
+- **Dynamic bind groups:** N input textures + 1 sampler
+- **Lazy sampler creation:** Map-based cache, 4 preset types
+- **Multi-stage composition:** Composite of composites supported
+- **Guarded with `#if !defined(STRIP_GPU_COMPOSITE)`**
+
+### Size Impact
+
+- Code: ~460 lines added
+- Compressed: ~830 bytes (2 shaders + dispatch logic)
+
+### Tests
+
+`test_gpu_composite.cc`:
+- Blend two noise textures
+- Mask noise with grid
+- Multi-stage composite (composite of composites)
+
+All 35 tests passing.
diff --git a/doc/HOWTO.md b/doc/HOWTO.md
index 967b554..876d7dc 100644
--- a/doc/HOWTO.md
+++ b/doc/HOWTO.md
@@ -2,6 +2,8 @@
 
 Common commands for building and testing.
 
+---
+
 ## Building
 
 ### Debug Build
@@ -11,10 +13,7 @@ cmake --build build -j4
 ./build/demo64k
 ```
 
-Options:
-- `--fullscreen`: Run in fullscreen
-- `--resolution WxH`: Set window size (e.g., 1024x768)
-- `--seek TIME`: Jump to timestamp (debug builds only)
+Options: `--fullscreen`, `--resolution WxH`, `--seek TIME` (debug only)
 
 Keyboard: `Esc` (exit), `F` (toggle fullscreen)
 
@@ -45,27 +44,34 @@ cmake --build build_final -j4
 - STRIP_ALL: Full checks, no debug (~64k target)
 - FINAL_STRIP: No checks, no debug (absolute minimum)
 
-### Developer Build
+### Developer Build (Tests + Tools)
 ```bash
-cmake -S . -B build -DDEMO_ALL_OPTIONS=ON
+cmake -S . -B build -DDEMO_BUILD_TESTS=ON -DDEMO_BUILD_TOOLS=ON
 cmake --build build -j4
 ```
-Enables tests, tools, size optimizations.
+
+**Note:** `DEMO_ALL_OPTIONS=ON` enables tests, tools, AND `STRIP_ALL`, which removes debug-only code. Use selective flags for debugging.
+
+---
 
 ## Build System
 
-**Dependency Tracking**: CMake tracks 42 demo + 17 test assets. Editing shaders/audio auto-triggers rebuild.
+**Dependency Tracking:** CMake tracks 42 demo + 17 test assets. Editing shaders/audio auto-triggers rebuild.
 
-**Header Organization**:
+**Header Organization:**
 - `asset_manager_dcl.h`: Forward declarations
 - `asset_manager.h`: Core API (GetAsset/DropAsset)
 - `asset_manager_utils.h`: Typed helpers
 
+---
+
 ## Git Clone
 ```bash
 git clone ssh://git@51.38.51.127/~/demo.git
 ```
 
+---
+
 ## Audio System
 
 ### AudioEngine API
@@ -90,10 +96,7 @@ audio_shutdown();
 - `seek(time)`: Jump to timestamp (debug only)
 
 **Direct Synth APIs** (performance-critical):
-- `synth_register_spectrogram()`: Register samples
-- `synth_trigger_voice()`: Trigger playback
-- `synth_get_output_peak()`: Get audio level
-- `synth_render()`: Low-level rendering
+- `synth_register_spectrogram()`, `synth_trigger_voice()`, `synth_get_output_peak()`, `synth_render()`
 
 **Testing:**
 ```cpp
@@ -103,6 +106,8 @@ engine.update(1.0f);
 engine.shutdown();
 ```
 
+---
+
 ## Auxiliary Texture Masking
 
 Share textures between effects:
@@ -116,6 +121,8 @@ WGPUTextureView view = demo_->get_auxiliary_view("mask_name");
 ```
 See `doc/MASKING_SYSTEM.md` for details.
 
+---
+
 ## Demo Timeline
 
 Edit `assets/demo.seq`:
@@ -125,6 +132,8 @@ SEQUENCE 0.0 0
 ```
 Rebuild to update timeline.
 
+---
+
 ## Testing
 
 **Run all tests:**
@@ -140,56 +149,7 @@ cd build && ctest
 - `SynthEngineTest`: Audio synthesis
 - `SequenceSystemTest`: Timeline logic
 
-## Code Coverage (macOS)
-```bash
-brew install lcov
-./scripts/gen_coverage_report.sh [target_dir]
-```
-
-## Tools
-
-### Windows Cross-Compilation
-```bash
-./scripts/fetch_win_deps.sh
-./scripts/build_win.sh
-./scripts/run_win.sh
-```
-
-### spectool (Audio Analysis)
-```bash
-cmake -S . -B build -DDEMO_BUILD_TOOLS=ON
-cmake --build build -j4
-
-# Analyze
-./build/spectool analyze input.wav output.spec
-
-# Play
-./build/spectool play input.spec
-```
-
-### specview (Visualization)
-```bash
-./build/specview input.spec
-```
-
-### specplay (Diagnostic)
-```bash
-./build/specplay input.spec
-# or
-./build/specplay input.wav
-```
-Output: Peak, RMS, clipping detection.
-
-### Submodule Updates
-```bash
-cd third_party/wgpu-native
-git fetch
-git checkout trunk
-git reset --hard origin/trunk
-cd ../..
-git add third_party/wgpu-native
-git commit -m "chore: Update wgpu-native"
-```
+---
 
 ## Asset Management
 
@@ -216,3 +176,7 @@ const uint8_t* data = GetAsset(AssetId::KICK_1, &size);
 ```
 
 Build system auto-runs `asset_packer` when asset lists change.
+
+---
+
+For developer tools reference (spectool, Windows cross-compilation, code coverage), see `doc/TOOLS_REFERENCE.md`.
diff --git a/doc/RECIPE.md b/doc/RECIPE.md
new file mode 100644
index 0000000..6404391
--- /dev/null
+++ b/doc/RECIPE.md
@@ -0,0 +1,202 @@
+# Recipe: Common Patterns
+
+Quick reference for implementing common patterns in the demo codebase.
+
+## Runtime Shader Composition
+
+Use `ShaderComposer` to dynamically assemble shaders from snippets.
+
+**Pattern:**
+```cpp
+#include "gpu/effects/shader_composer.h"
+#include "generated/assets.h"
+
+// 1. Load base shader template from asset
+size_t shader_size;
+const char* shader_code =
+    (const char*)GetAsset(AssetId::MY_SHADER_TEMPLATE, &shader_size);
+
+// 2. Define substitutions for dynamic parts
+ShaderComposer::CompositionMap composition_map;
+composition_map["placeholder_name"] = "actual_snippet_name";
+composition_map["fragment_main"] = "plasma_shader";  // Example
+
+// 3. Compose final shader
+std::string composed_shader = ShaderComposer::Get().Compose(
+    {},  // Optional: explicit dependencies
+    std::string(shader_code, shader_size),
+    composition_map);
+
+// 4. Create shader module
+WGPUShaderSourceWGSL wgsl_src = {};
+wgsl_src.chain.sType = WGPUSType_ShaderSourceWGSL;
+wgsl_src.code = str_view(composed_shader.c_str());
+
+WGPUShaderModuleDescriptor shader_desc = {};
+shader_desc.nextInChain = &wgsl_src.chain;
+WGPUShaderModule shader_module =
+    wgpuDeviceCreateShaderModule(ctx_.device, &shader_desc);
+```
+
+**Base shader template (WGSL asset):**
+```wgsl
+// Common bindings
+@group(0) @binding(0) var<uniform> uniforms: CommonUniforms;
+@group(0) @binding(1) var tex_sampler: sampler;
+
+// Placeholder for dynamic fragment code
+#include "fragment_main"
+
+@fragment
+fn fs_main(@location(0) uv: vec2<f32>) -> @location(0) vec4<f32> {
+  return compute_color(uv);  // Implemented by included snippet
+}
+```
+
+**Register snippets at startup:**
+```cpp
+ShaderComposer::Get().RegisterSnippet("plasma_shader", R"(
+fn compute_color(uv: vec2<f32>) -> vec4<f32> {
+  let t = uniforms.time;
+  return vec4(sin(uv.x * 10.0 + t), cos(uv.y * 10.0 + t), 0.5, 1.0);
+}
+)");
+
+ShaderComposer::Get().RegisterSnippet("tunnel_shader", R"(
+fn compute_color(uv: vec2<f32>) -> vec4<f32> {
+  let r = length(uv - vec2(0.5));
+  return vec4(vec3(1.0 / r), 1.0);
+}
+)");
+```
+
+**Example usage:** `src/gpu/effects/rotating_cube_effect.cc:72-75`
+
+## QuadEffect with Auxiliary Textures
+
+Full-screen quad effect with access to previous framebuffer + side textures.
+
+**Binding layout:**
+```
+@group(0) @binding(0) - Sampler
+@group(0) @binding(1) - Previous framebuffer texture
+@group(0) @binding(2) - CommonPostProcessUniforms
+@group(0) @binding(3) - Effect-specific params
+@group(0) @binding(4+) - Auxiliary textures (optional)
+```
+
+**Access auxiliary texture:**
+```cpp
+// In effect init()
+WGPUTextureView aux_view = demo_->get_auxiliary_view("mask_name");
+
+// Bind to binding 4
+const WGPUBindGroupEntry entries[] = {
+    {.binding = 0, .sampler = sampler},
+    {.binding = 1, .textureView = prev_frame_view},
+    {.binding = 2, .buffer = common_uniforms},
+    {.binding = 3, .buffer = effect_params},
+    {.binding = 4, .textureView = aux_view},  // Side texture
+};
+```
+
+**WGSL shader:**
+```wgsl
+@group(0) @binding(0) var tex_sampler: sampler;
+@group(0) @binding(1) var prev_frame: texture_2d<f32>;
+@group(0) @binding(2) var<uniform> uniforms: CommonUniforms;
+@group(0) @binding(3) var<uniform> params: EffectParams;
+@group(0) @binding(4) var aux_texture: texture_2d<f32>;
+
+@fragment
+fn fs_main(@location(0) uv: vec2<f32>) -> @location(0) vec4<f32> {
+  let prev = textureSample(prev_frame, tex_sampler, uv);
+  let mask = textureSample(aux_texture, tex_sampler, uv);
+  return mix(prev, compute_effect(uv), mask.r);
+}
+```
+
+## Dynamic Effect Parameters
+
+Use `UniformHelper` for .seq-controllable parameters.
+
+**C++ param struct:**
+```cpp
+struct MyEffectParams {
+  float strength;
+  float speed;
+  float _pad0;
+  float _pad1;
+};
+static_assert(sizeof(MyEffectParams) == 16);
+
+class MyEffect : public Effect {
+ private:
+  UniformHelper<MyEffectParams> params_;
+};
+```
+
+**Effect init:**
+```cpp
+void MyEffect::init(MainSequence* demo) {
+  params_.init(ctx_.device);
+  params_.get().strength = 1.0f;
+  params_.get().speed = 2.0f;
+}
+```
+
+**Update per frame:**
+```cpp
+void MyEffect::render(WGPUTextureView prev, WGPUTextureView target,
+                      float beat, const EffectParams* ep) {
+  params_.apply_optional(ep);  // Updates from .seq
+  params_.upload(ctx_.queue);
+  // ... render pass
+}
+```
+
+**.seq syntax:**
+```
+EFFECT MyEffect 0.0 10.0 strength=0.5 speed=3.0
+EFFECT MyEffect 10.0 20.0 strength=2.0  # speed keeps previous value
+```
+
+**Example:** `src/gpu/effects/flash_effect.cc`, `src/gpu/effects/chroma_aberration_effect.cc`
+
+## Uniform Buffer Alignment
+
+**WGSL padding rules:**
+- `vec3<f32>` requires 16-byte alignment (use padding or switch to `vec4`)
+- Use three `f32` fields instead of a single `vec3<f32>` when possible
+
+**Correct patterns:**
+```wgsl
+// Option 1: Explicit padding
+struct MyUniforms {
+  color: vec3<f32>,
+  _pad0: f32,
+  offset: vec2<f32>,
+  _pad1: f32,
+  _pad2: f32,
+};
+
+// Option 2: Avoid vec3
+struct MyUniforms {
+  color_r: f32,
+  color_g: f32,
+  color_b: f32,
+  intensity: f32,
+  offset: vec2<f32>,
+  _pad0: f32,
+  _pad1: f32,
+};
+```
+
+**Verification:**
+```cpp
+static_assert(sizeof(MyUniforms) == EXPECTED_SIZE);
+```
+
+**Validation:** Run `tools/validate_uniforms.py` before commit.
+
+**Reference:** `doc/UNIFORM_BUFFER_GUIDELINES.md`
diff --git a/doc/TOOLS_REFERENCE.md b/doc/TOOLS_REFERENCE.md
new file mode 100644
index 0000000..61412a9
--- /dev/null
+++ b/doc/TOOLS_REFERENCE.md
@@ -0,0 +1,89 @@
+# Developer Tools Reference
+
+Comprehensive reference for all developer tools in the project.
+
+---
+
+## Windows Cross-Compilation
+
+```bash
+# Fetch dependencies
+./scripts/fetch_win_deps.sh
+
+# Build Windows binary
+./scripts/build_win.sh
+
+# Run with Wine
+./scripts/run_win.sh
+```
+
+---
+
+## spectool (Audio Analysis)
+
+```bash
+# Build
+cmake -S . -B build -DDEMO_BUILD_TOOLS=ON
+cmake --build build -j4
+
+# Analyze WAV → .spec
+./build/spectool analyze input.wav output.spec
+
+# Play .spec file
+./build/spectool play input.spec
+```
+
+---
+
+## specview (Visualization)
+
+```bash
+# View spectrogram
+./build/specview input.spec
+```
+
+Displays spectrogram visualization.
+
+---
+
+## specplay (Diagnostic)
+
+```bash
+# Analyze .spec file
+./build/specplay input.spec
+
+# Or analyze .wav file
+./build/specplay input.wav
+```
+
+Output: Peak, RMS, clipping detection.
+
+---
+
+## Code Coverage (macOS)
+
+```bash
+# Install lcov
+brew install lcov
+
+# Generate coverage report
+./scripts/gen_coverage_report.sh [target_dir]
+```
+
+Creates HTML coverage report.
+
+---
+
+## Submodule Updates
+
+```bash
+cd third_party/wgpu-native
+git fetch
+git checkout trunk
+git reset --hard origin/trunk
+cd ../..
+git add third_party/wgpu-native
+git commit -m "chore: Update wgpu-native"
+```
+
+Updates wgpu-native to latest trunk.
diff --git a/doc/UNIFORM_BUFFER_GUIDELINES.md b/doc/UNIFORM_BUFFER_GUIDELINES.md
new file mode 100644
index 0000000..ac02223
--- /dev/null
+++ b/doc/UNIFORM_BUFFER_GUIDELINES.md
@@ -0,0 +1,106 @@
+# WGSL Uniform Buffer Guidelines
+
+This document outlines the rules and best practices for defining and using uniform buffers in WGSL shaders within this project, focusing on alignment, size, and consistency.
+
+## WGSL Alignment Rules
+
+Understanding WGSL's memory layout rules is crucial for correct uniform buffer implementation. The following are the general alignment requirements for common WGSL types:
+
+- `f32`: 4-byte alignment.
+- `vec2<f32>`: 8-byte alignment (4 bytes per component * 2 components = 8 bytes).
+- `vec3<f32>`: 16-byte alignment, 12-byte size (4 bytes per component * 3 components); a following `f32` can occupy the remaining 4 bytes.
+- `vec4<f32>`: 16-byte alignment (4 bytes per component * 4 components = 16 bytes).
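+- `array<T, N>`: The alignment of an array is the alignment of its element type `T`; in uniform buffers the alignment and element stride are additionally rounded up to a multiple of 16 bytes.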
+
+A struct's alignment is the largest alignment among its members, and its total size is rounded up to a multiple of that alignment, so trailing padding is added when needed.
+
+## Standard Uniform Buffer Pattern
+
+To maintain consistency and facilitate efficient rendering, a standard pattern for uniform buffer usage is established:
+
+- **Binding 0 & 1:** Reserved for Sampler and Texture access (handled by `pp_update_bind_group`).
+- **Binding 2:** **Common Uniforms** (`CommonPostProcessUniforms` or similar). This buffer should contain frequently used data like resolution, aspect ratio, time, beat, and audio intensity.
+- **Binding 3:** **Effect-Specific Parameters**. This buffer holds parameters unique to a particular effect (e.g., `strength`, `speed`, `fade_amount`).
+
+This pattern ensures that common data is shared efficiently across effects, while effect-specific data remains isolated.
+
+## Defining Uniform Structs
+
+### WGSL Definitions
+
+When defining uniform structs in WGSL, adhere to the following:
+
+- **Explicit Padding:** Use padding fields (`_pad0`, `_pad1`, etc.) where necessary to ensure correct alignment, especially when mixing types of different alignment requirements (e.g., `vec2<f32>` followed by `f32`s).
+- **Use `vec2<f32>` for 8-byte padding:** If you need 8 bytes of padding, use `_pad0: vec2<f32>` instead of `_pad0: f32, _pad1: f32` for potentially better clarity and to leverage WGSL's type system.
+- **Minimize Padding:** Only add padding where required by alignment rules to reduce memory usage.
+
+**Example (CommonPostProcessUniforms / HeptagonUniforms):**
+
+```wgsl
+struct CommonUniforms {
+  resolution: vec2<f32>,
+  _pad0: vec2<f32>, // 8 bytes of explicit padding; keeps the struct at 32 bytes
+  aspect_ratio: f32,
+  time: f32,
+  beat: f32,
+  audio_intensity: f32,
+};
+// Expected size: 32 bytes
+```
+
+**Example (EffectParams with f32 members):**
+
+```wgsl
+struct EffectParams {
+  parameter1: f32,
+  parameter2: f32,
+  // ... more parameters ...
+};
+// Expected size: 8 bytes (if only two f32s)
+```
+
+### C++ Definitions and Validation
+
+For every WGSL uniform struct, a corresponding C++ struct must exist. This C++ struct must include a `static_assert` to verify that its size matches the WGSL definition.
+
+- **Mirror WGSL Structure:** The C++ struct should mirror the WGSL struct's member order and types as closely as possible to ensure accurate size calculation.
+- **`static_assert`:** Always include `static_assert(sizeof(MyStruct) == EXPECTED_SIZE, "MyStruct must be EXPECTED_SIZE bytes for WGSL alignment");`.
+- **Use `float` for `f32`:** Use `float` for `f32` in C++.
+- **Use `vec2<f32>` mapping:** If WGSL uses `vec2<f32>`, map it to an equivalent C++ type that occupies 8 bytes, typically `float[2]` or a `struct Vec2 { float x, y; }` if more complex type handling is needed.
+- **Padding:** C++ padding rules can differ from WGSL. Pay close attention to `static_assert` for validation.
+
+**Example (C++ CommonPostProcessUniforms):**
+
+```cpp
+struct CommonPostProcessUniforms {
+  vec2 resolution;    // 8 bytes
+  float _pad[2];      // 8 bytes padding (matches vec2<f32> in WGSL)
+  float aspect_ratio; // 4 bytes
+  float time;         // 4 bytes
+  float beat;         // 4 bytes
+  float audio_intensity; // 4 bytes
+};
+static_assert(sizeof(CommonPostProcessUniforms) == 32, 
+              "CommonPostProcessUniforms must be 32 bytes for WGSL alignment");
+```
+
+**Example (C++ GaussianBlurParams):**
+
+```cpp
+struct GaussianBlurParams {
+  float strength = 2.0f;
+  float _pad = 0.0f;
+};
+static_assert(sizeof(GaussianBlurParams) == 8, 
+              "GaussianBlurParams must be 8 bytes for WGSL alignment");
+```
+
+## Handling Common Pitfalls
+
+- **`vec3<f32>` Padding:** Avoid using `vec3<f32>` for padding in WGSL, as it has a 16-byte alignment. If padding is needed, use `vec2<f32>` for 8 bytes or individual `f32`s for 4 bytes each.
+- **C++ vs. WGSL Alignment:** Always rely on `static_assert` in C++ and verify against WGSL alignment rules. C++ padding rules might differ, and the `static_assert` is the ultimate arbiter.
+- **Unmatched Structs:** Ensure every WGSL uniform struct has a corresponding C++ struct with a matching `static_assert`.
+
+## Validation Tool
+
+The `tools/validate_uniforms.py` script is integrated into the build system. It automatically checks for inconsistencies between WGSL and C++ uniform struct definitions and reports any size mismatches. Ensure this script passes for all new or modified uniform definitions.
diff --git a/scripts/gen_spectrograms.sh b/scripts/gen_spectrograms.sh
index a5c1510..3213787 100755
--- a/scripts/gen_spectrograms.sh
+++ b/scripts/gen_spectrograms.sh
@@ -7,7 +7,7 @@ set -euo pipefail
 # --- Configuration ---
 PROJECT_ROOT=$(git rev-parse --show-toplevel)
 SOURCE_DIR="${PROJECT_ROOT}/assets/originals"
-DEST_DIR="${PROJECT_ROOT}/assets/final"
+DEST_DIR="${PROJECT_ROOT}/assets/final/music"
 SPECTOOL_PATH="${PROJECT_ROOT}/build/spectool"
 TEMP_WAV_DIR=$(mktemp -d)
 
diff --git a/src/3d/visual_debug.cc b/src/3d/visual_debug.cc
index 77311f6..cd4ccce 100644
--- a/src/3d/visual_debug.cc
+++ b/src/3d/visual_debug.cc
@@ -26,7 +26,7 @@ void VisualDebug::init(WGPUDevice device, WGPUTextureFormat format) {
 
   WGPUBufferDescriptor ub_desc = {};
   ub_desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst;
-  ub_desc.size = sizeof(mat4);
+  ub_desc.size = sizeof(GlobalUniforms);
   uniform_buffer_ = wgpuDeviceCreateBuffer(device_, &ub_desc);
 }
 
@@ -340,9 +340,12 @@ void VisualDebug::add_trajectory(const std::vector<vec3>& points,
 }
 
 void VisualDebug::update_buffers(const mat4& view_proj) {
-  // Update Uniforms
+  // Update Uniforms - fill entire GlobalUniforms structure
+  GlobalUniforms uniforms = {};
+  uniforms.view_proj = view_proj;
+  // Other fields zeroed (not used by visual debug shader)
   wgpuQueueWriteBuffer(wgpuDeviceGetQueue(device_), uniform_buffer_, 0,
-                       &view_proj, sizeof(mat4));
+                       &uniforms, sizeof(GlobalUniforms));
 
   // Update Vertices
   size_t required_size = lines_.size() * 2 * sizeof(float) * 6;
@@ -385,7 +388,7 @@ void VisualDebug::update_buffers(const mat4& view_proj) {
     WGPUBindGroupEntry bg_entry = {};
     bg_entry.binding = 0;
     bg_entry.buffer = uniform_buffer_;
-    bg_entry.size = sizeof(mat4);
+    bg_entry.size = sizeof(GlobalUniforms);
 
     WGPUBindGroupDescriptor bg_desc = {};
     bg_desc.layout = bind_group_layout_;
diff --git a/src/audio/audio.cc b/src/audio/audio.cc
index 2f485a6..c5bd3d9 100644
--- a/src/audio/audio.cc
+++ b/src/audio/audio.cc
@@ -65,9 +65,11 @@ void audio_start() {
   g_audio_backend->start();
 }
 
-void audio_render_ahead(float music_time, float dt) {
+void audio_render_ahead(float music_time, float dt, float target_fill) {
   // Target: maintain look-ahead buffer
-  const float target_lookahead = (float)RING_BUFFER_LOOKAHEAD_MS / 1000.0f;
+  const float target_lookahead = (target_fill < 0.0f)
+    ? (float)RING_BUFFER_LOOKAHEAD_MS / 1000.0f
+    : target_fill;
 
   // Render in small chunks to keep synth time synchronized with tracker
   // Chunk size: one frame's worth of audio (~16.6ms @ 60fps)
diff --git a/src/audio/audio.h b/src/audio/audio.h
index e063a57..778d312 100644
--- a/src/audio/audio.h
+++ b/src/audio/audio.h
@@ -24,7 +24,8 @@ void audio_init();
 void audio_start(); // Starts the audio device callback
 
 // Ring buffer audio rendering (main thread fills buffer)
-void audio_render_ahead(float music_time, float dt);
+// target_fill: Target buffer fill time in seconds; a negative value (the default) selects RING_BUFFER_LOOKAHEAD_MS/1000
+void audio_render_ahead(float music_time, float dt, float target_fill = -1.0f);
 
 // Get current playback time (in seconds) based on samples consumed
 // This is the ring buffer READ position (what's being played NOW)
diff --git a/src/gpu/demo_effects.h b/src/gpu/demo_effects.h
index 54bf657..ff7e017 100644
--- a/src/gpu/demo_effects.h
+++ b/src/gpu/demo_effects.h
@@ -7,12 +7,14 @@
 #include "3d/scene.h"
 #include "effect.h"
 #include "gpu/effects/circle_mask_effect.h"
-#include "gpu/effects/fade_effect.h" // FadeEffect with full definition
+#include "gpu/effects/fade_effect.h"  // FadeEffect with full definition
 #include "gpu/effects/flash_effect.h" // FlashEffect with params support
 #include "gpu/effects/post_process_helper.h"
 #include "gpu/effects/rotating_cube_effect.h"
 #include "gpu/effects/shaders.h"
 #include "gpu/effects/theme_modulation_effect.h" // ThemeModulationEffect with full definition
+#include "gpu/effects/hybrid_3d_effect.h"
+#include "gpu/effects/flash_cube_effect.h"
 #include "gpu/gpu.h"
 #include "gpu/texture_manager.h"
 #include "gpu/uniform_helper.h"
@@ -49,7 +51,6 @@ class ParticlesEffect : public Effect {
   ComputePass compute_pass_;
   RenderPass render_pass_;
   GpuBuffer particles_buffer_;
-  UniformBuffer<CommonPostProcessUniforms> uniforms_;
 };
 
 class PassthroughEffect : public PostProcessEffect {
@@ -58,7 +59,6 @@ class PassthroughEffect : public PostProcessEffect {
   void update_bind_group(WGPUTextureView input_view) override;
 
  private:
-  UniformBuffer<CommonPostProcessUniforms> uniforms_;
 };
 
 class MovingEllipseEffect : public Effect {
@@ -83,7 +83,6 @@ class ParticleSprayEffect : public Effect {
   ComputePass compute_pass_;
   RenderPass render_pass_;
   GpuBuffer particles_buffer_;
-  UniformBuffer<CommonPostProcessUniforms> uniforms_;
 };
 
 // Parameters for GaussianBlurEffect (set at construction time)
@@ -106,7 +105,6 @@ class GaussianBlurEffect : public PostProcessEffect {
 
  private:
   GaussianBlurParams params_;
-  UniformBuffer<CommonPostProcessUniforms> uniforms_;
   UniformBuffer<GaussianBlurParams> params_buffer_;
 };
 
@@ -118,7 +116,6 @@ class SolarizeEffect : public PostProcessEffect {
   void update_bind_group(WGPUTextureView input_view) override;
 
  private:
-  UniformBuffer<CommonPostProcessUniforms> uniforms_;
 };
 
 // Parameters for VignetteEffect
@@ -137,7 +134,6 @@ class VignetteEffect : public PostProcessEffect {
 
  private:
   VignetteParams params_;
-  UniformBuffer<CommonPostProcessUniforms> uniforms_;
   UniformBuffer<VignetteParams> params_buffer_;
 };
 
@@ -160,48 +156,33 @@ class ChromaAberrationEffect : public PostProcessEffect {
 
  private:
   ChromaAberrationParams params_;
-  UniformBuffer<CommonPostProcessUniforms> uniforms_;
   UniformBuffer<ChromaAberrationParams> params_buffer_;
 };
 
-class Hybrid3DEffect : public Effect {
- public:
-  Hybrid3DEffect(const GpuContext& ctx);
-  void init(MainSequence* demo) override;
-  void render(WGPURenderPassEncoder pass, float time, float beat,
-              float intensity, float aspect_ratio) override;
-
- private:
-  Renderer3D renderer_;
-  TextureManager texture_manager_;
-  Scene scene_;
-  Camera camera_;
-  int width_ = 1280;
-  int height_ = 720;
+// Parameters for DistortEffect
+struct DistortParams {
+  float strength = 0.01f; // Default distortion strength
+  float speed = 1.0f;     // Default distortion speed
 };
+static_assert(sizeof(DistortParams) == 8, "DistortParams must be 8 bytes for WGSL alignment");
 
-class FlashCubeEffect : public Effect {
+class DistortEffect : public PostProcessEffect {
  public:
-  FlashCubeEffect(const GpuContext& ctx);
-  void init(MainSequence* demo) override;
-  void resize(int width, int height) override;
+  DistortEffect(const GpuContext& ctx);
+  DistortEffect(const GpuContext& ctx, const DistortParams& params);
   void render(WGPURenderPassEncoder pass, float time, float beat,
               float intensity, float aspect_ratio) override;
+  void update_bind_group(WGPUTextureView input_view) override;
 
  private:
-  Renderer3D renderer_;
-  TextureManager texture_manager_;
-  Scene scene_;
-  Camera camera_;
-  int width_ = 1280;
-  int height_ = 720;
-  float last_beat_;
-  float flash_intensity_;
+  DistortParams params_;
+  UniformBuffer<DistortParams> params_buffer_;
 };
 
-// ThemeModulationEffect now defined in gpu/effects/theme_modulation_effect.h (included above)
-// FadeEffect now defined in gpu/effects/fade_effect.h (included above)
-// FlashEffect now defined in gpu/effects/flash_effect.h (included above)
+// The following effects are now defined in their own headers (included above):
+//   ThemeModulationEffect -> gpu/effects/theme_modulation_effect.h
+//   FadeEffect            -> gpu/effects/fade_effect.h
+//   FlashEffect           -> gpu/effects/flash_effect.h
 
 // Auto-generated functions
 void LoadTimeline(MainSequence& main_seq, const GpuContext& ctx);
diff --git a/src/gpu/effect.h b/src/gpu/effect.h
index 6fdb0f4..8f35f3c 100644
--- a/src/gpu/effect.h
+++ b/src/gpu/effect.h
@@ -1,5 +1,7 @@
 #pragma once
 #include "gpu/gpu.h"
+#include "gpu/effects/post_process_helper.h"
+#include "gpu/uniform_helper.h"
 #include <algorithm>
 #include <map>
 #include <memory>
@@ -12,6 +14,7 @@ class PostProcessEffect;
 class Effect {
  public:
   Effect(const GpuContext& ctx) : ctx_(ctx) {
+    uniforms_.init(ctx.device);
   }
   virtual ~Effect() = default;
   virtual void init(MainSequence* demo) {
@@ -43,7 +46,7 @@ class Effect {
 
  protected:
   const GpuContext& ctx_;
-  GpuBuffer uniforms_;
+  UniformBuffer<CommonPostProcessUniforms> uniforms_;
   int width_ = 1280;
   int height_ = 720;
 };
diff --git a/src/gpu/effects/chroma_aberration_effect.cc b/src/gpu/effects/chroma_aberration_effect.cc
index 7f41153..af3acc5 100644
--- a/src/gpu/effects/chroma_aberration_effect.cc
+++ b/src/gpu/effects/chroma_aberration_effect.cc
@@ -18,7 +18,6 @@ ChromaAberrationEffect::ChromaAberrationEffect(
     : PostProcessEffect(ctx), params_(params) {
   pipeline_ = create_post_process_pipeline(ctx_.device, ctx_.format,
                                            chroma_aberration_shader_wgsl);
-  uniforms_.init(ctx_.device);
   params_buffer_.init(ctx_.device);
 }
 
diff --git a/src/gpu/effects/circle_mask_effect.cc b/src/gpu/effects/circle_mask_effect.cc
index 5b71086..ca80cf9 100644
--- a/src/gpu/effects/circle_mask_effect.cc
+++ b/src/gpu/effects/circle_mask_effect.cc
@@ -3,6 +3,7 @@
 // Generates circular mask and renders green background outside circle.
 
 #include "gpu/effects/circle_mask_effect.h"
+#include "gpu/effects/shader_composer.h"
 #include "generated/assets.h"
 
 CircleMaskEffect::CircleMaskEffect(const GpuContext& ctx, float radius)
@@ -30,9 +31,7 @@ void CircleMaskEffect::init(MainSequence* demo) {
 
   demo_->register_auxiliary_texture("circle_mask", width, height);
 
-  compute_uniforms_.init(ctx_.device);
   compute_params_.init(ctx_.device);
-  render_uniforms_.init(ctx_.device);
 
   WGPUSamplerDescriptor sampler_desc = {};
   sampler_desc.addressModeU = WGPUAddressMode_ClampToEdge;
@@ -49,9 +48,12 @@ void CircleMaskEffect::init(MainSequence* demo) {
   const char* render_shader = (const char*)GetAsset(
       AssetId::ASSET_CIRCLE_MASK_RENDER_SHADER, &render_size);
 
+  // Compose shaders to resolve #include directives
+  std::string composed_compute = ShaderComposer::Get().Compose({}, compute_shader);
+
   WGPUShaderSourceWGSL compute_wgsl = {};
   compute_wgsl.chain.sType = WGPUSType_ShaderSourceWGSL;
-  compute_wgsl.code = str_view(compute_shader);
+  compute_wgsl.code = str_view(composed_compute.c_str());
 
   WGPUShaderModuleDescriptor compute_desc = {};
   compute_desc.nextInChain = &compute_wgsl.chain;
@@ -82,11 +84,11 @@ void CircleMaskEffect::init(MainSequence* demo) {
 
   const WGPUBindGroupEntry compute_entries[] = {
       {.binding = 0,
-       .buffer = compute_uniforms_.get().buffer,
+       .buffer = uniforms_.get().buffer,
        .size = sizeof(CommonPostProcessUniforms)},
       {.binding = 1,
        .buffer = compute_params_.get().buffer,
-       .size = sizeof(EffectParams)},
+       .size = sizeof(CircleMaskParams)},
   };
   const WGPUBindGroupDescriptor compute_bg_desc = {
       .layout = wgpuRenderPipelineGetBindGroupLayout(compute_pipeline_, 0),
@@ -96,9 +98,11 @@ void CircleMaskEffect::init(MainSequence* demo) {
   compute_bind_group_ =
       wgpuDeviceCreateBindGroup(ctx_.device, &compute_bg_desc);
 
+  std::string composed_render = ShaderComposer::Get().Compose({}, render_shader);
+
   WGPUShaderSourceWGSL render_wgsl = {};
   render_wgsl.chain.sType = WGPUSType_ShaderSourceWGSL;
-  render_wgsl.code = str_view(render_shader);
+  render_wgsl.code = str_view(composed_render.c_str());
 
   WGPUShaderModuleDescriptor render_desc = {};
   render_desc.nextInChain = &render_wgsl.chain;
@@ -139,7 +143,7 @@ void CircleMaskEffect::init(MainSequence* demo) {
       {.binding = 0, .textureView = mask_view},
       {.binding = 1, .sampler = mask_sampler_},
       {.binding = 2,
-       .buffer = render_uniforms_.get().buffer,
+       .buffer = uniforms_.get().buffer,
        .size = sizeof(CommonPostProcessUniforms)},
   };
   const WGPUBindGroupDescriptor render_bg_desc = {
@@ -160,9 +164,9 @@ void CircleMaskEffect::compute(WGPUCommandEncoder encoder, float time,
       .beat = beat,
       .audio_intensity = intensity,
   };
-  compute_uniforms_.update(ctx_.queue, uniforms);
+  uniforms_.update(ctx_.queue, uniforms);
 
-  const EffectParams params = {
+  const CircleMaskParams params = {
       .radius = radius_,
   };
   compute_params_.update(ctx_.queue, params);
@@ -199,7 +203,7 @@ void CircleMaskEffect::render(WGPURenderPassEncoder pass, float time,
       .beat = beat,
       .audio_intensity = intensity,
   };
-  render_uniforms_.update(ctx_.queue, uniforms);
+  uniforms_.update(ctx_.queue, uniforms);
 
   wgpuRenderPassEncoderSetPipeline(pass, render_pipeline_);
   wgpuRenderPassEncoderSetBindGroup(pass, 0, render_bind_group_, 0, nullptr);
diff --git a/src/gpu/effects/circle_mask_effect.h b/src/gpu/effects/circle_mask_effect.h
index ac44210..2ddbb11 100644
--- a/src/gpu/effects/circle_mask_effect.h
+++ b/src/gpu/effects/circle_mask_effect.h
@@ -21,23 +21,23 @@ class CircleMaskEffect : public Effect {
               float intensity, float aspect_ratio) override;
 
  private:
-  struct EffectParams {
+  struct CircleMaskParams {
     float radius;
     float _pad[3];
   };
+  static_assert(sizeof(CircleMaskParams) == 16,
+                "CircleMaskParams must be 16 bytes for WGSL alignment");
 
   MainSequence* demo_ = nullptr;
   float radius_;
 
   WGPURenderPipeline compute_pipeline_ = nullptr;
   WGPUBindGroup compute_bind_group_ = nullptr;
-  UniformBuffer<CommonPostProcessUniforms> compute_uniforms_;
-  UniformBuffer<EffectParams> compute_params_;
+  UniformBuffer<CircleMaskParams> compute_params_;
 
   WGPURenderPipeline render_pipeline_ = nullptr;
   WGPUBindGroup render_bind_group_ = nullptr;
   WGPUSampler mask_sampler_ = nullptr;
-  UniformBuffer<CommonPostProcessUniforms> render_uniforms_;
 };
 
 #endif /* CIRCLE_MASK_EFFECT_H_ */
diff --git a/src/gpu/effects/distort_effect.cc b/src/gpu/effects/distort_effect.cc
index d11dfd7..52a8ec7 100644
--- a/src/gpu/effects/distort_effect.cc
+++ b/src/gpu/effects/distort_effect.cc
@@ -9,31 +9,35 @@ DistortEffect::DistortEffect(const GpuContext& ctx)
     : DistortEffect(ctx, DistortParams()) {
 }
 
-DistortEffect::DistEffect(const GpuContext& ctx, const DistortParams& params)
+DistortEffect::DistortEffect(const GpuContext& ctx, const DistortParams& params)
     : PostProcessEffect(ctx), params_(params) {
-  uniforms_ =
-      gpu_create_buffer(ctx_.device, sizeof(DistortUniforms),
-                        WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst);
+  params_buffer_.init(ctx_.device);
   pipeline_ = create_post_process_pipeline(ctx_.device, ctx_.format,
                                            distort_shader_wgsl);
 }
 
 void DistortEffect::render(WGPURenderPassEncoder pass, float t, float b,
                            float i, float a) {
-  DistortUniforms u = {
+  // Populate CommonPostProcessUniforms
+  const CommonPostProcessUniforms common_u = {
+      .resolution = {(float)width_, (float)height_},
+      .aspect_ratio = a,
       .time = t,
       .beat = b,
-      .intensity = i,
-      .aspect_ratio = a,
-      .width = (float)width_,
-      .height = (float)height_,
+      .audio_intensity = i,
+  };
+  uniforms_.update(ctx_.queue, common_u);
+
+  // Populate DistortParams
+  const DistortParams distort_p = {
       .strength = params_.strength,
       .speed = params_.speed,
   };
-  wgpuQueueWriteBuffer(ctx_.queue, uniforms_.buffer, 0, &u, sizeof(u));
+  params_buffer_.update(ctx_.queue, distort_p);
+
   PostProcessEffect::render(pass, t, b, i, a);
 }
 
 void DistortEffect::update_bind_group(WGPUTextureView v) {
-  pp_update_bind_group(ctx_.device, pipeline_, &bind_group_, v, {}, uniforms_);
+  pp_update_bind_group(ctx_.device, pipeline_, &bind_group_, v, uniforms_.get(), params_buffer_);
 }
 \ No newline at end of file
diff --git a/src/gpu/effects/fade_effect.cc b/src/gpu/effects/fade_effect.cc
index 3efc583..39b54e0 100644
--- a/src/gpu/effects/fade_effect.cc
+++ b/src/gpu/effects/fade_effect.cc
@@ -5,6 +5,12 @@
 #include "gpu/effects/post_process_helper.h"
 #include <cmath>
 
+struct FadeParams {
+  float fade_amount;
+  float _pad[3];
+};
+static_assert(sizeof(FadeParams) == 16, "FadeParams must be 16 bytes for WGSL alignment");
+
 FadeEffect::FadeEffect(const GpuContext& ctx) : PostProcessEffect(ctx) {
   const char* shader_code = R"(
     struct VertexOutput {
@@ -22,7 +28,7 @@ FadeEffect::FadeEffect(const GpuContext& ctx) : PostProcessEffect(ctx) {
       audio_intensity: f32,
     };
 
-    struct EffectParams {
+    struct FadeParams {
       fade_amount: f32,
       _pad0: f32,
       _pad1: f32,
@@ -32,7 +38,7 @@ FadeEffect::FadeEffect(const GpuContext& ctx) : PostProcessEffect(ctx) {
     @group(0) @binding(0) var inputSampler: sampler;
     @group(0) @binding(1) var inputTexture: texture_2d<f32>;
     @group(0) @binding(2) var<uniform> uniforms: CommonUniforms;
-    @group(0) @binding(3) var<uniform> params: EffectParams;
+    @group(0) @binding(3) var<uniform> params: FadeParams;
 
     @vertex
     fn vs_main(@builtin(vertex_index) vertexIndex: u32) -> VertexOutput {
@@ -57,14 +63,13 @@ FadeEffect::FadeEffect(const GpuContext& ctx) : PostProcessEffect(ctx) {
 
   pipeline_ =
       create_post_process_pipeline(ctx_.device, ctx_.format, shader_code);
-  common_uniforms_.init(ctx_.device);
   params_buffer_ = gpu_create_buffer(
       ctx_.device, 16, WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst);
 }
 
 void FadeEffect::update_bind_group(WGPUTextureView input_view) {
   pp_update_bind_group(ctx_.device, pipeline_, &bind_group_, input_view,
-                       common_uniforms_.get(), params_buffer_);
+                       uniforms_.get(), params_buffer_);
 }
 
 void FadeEffect::render(WGPURenderPassEncoder pass, float time, float beat,
@@ -76,7 +81,7 @@ void FadeEffect::render(WGPURenderPassEncoder pass, float time, float beat,
       .beat = beat,
       .audio_intensity = intensity,
   };
-  common_uniforms_.update(ctx_.queue, u);
+  uniforms_.update(ctx_.queue, u);
 
   // Example fade pattern: fade in at start, fade out at end
   // Customize this based on your needs
@@ -90,8 +95,8 @@ void FadeEffect::render(WGPURenderPassEncoder pass, float time, float beat,
     fade_amount = fmaxf(fade_amount, 0.0f);
   }
 
-  float params[4] = {fade_amount, 0.0f, 0.0f, 0.0f};
-  wgpuQueueWriteBuffer(ctx_.queue, params_buffer_.buffer, 0, params,
+  FadeParams params = {fade_amount, {0.0f, 0.0f, 0.0f}};
+  wgpuQueueWriteBuffer(ctx_.queue, params_buffer_.buffer, 0, &params,
                        sizeof(params));
 
   wgpuRenderPassEncoderSetPipeline(pass, pipeline_);
diff --git a/src/gpu/effects/fade_effect.h b/src/gpu/effects/fade_effect.h
index 22b8f76..178c360 100644
--- a/src/gpu/effects/fade_effect.h
+++ b/src/gpu/effects/fade_effect.h
@@ -4,9 +4,9 @@
 #pragma once
 
 #include "gpu/effect.h"
+#include "gpu/effects/post_process_helper.h"
 #include "gpu/gpu.h"
 #include "gpu/uniform_helper.h"
-#include "gpu/effects/post_process_helper.h"
 
 class FadeEffect : public PostProcessEffect {
  public:
@@ -16,6 +16,5 @@ class FadeEffect : public PostProcessEffect {
   void update_bind_group(WGPUTextureView input_view) override;
 
  private:
-  UniformBuffer<CommonPostProcessUniforms> common_uniforms_;
   GpuBuffer params_buffer_;
 };
diff --git a/src/gpu/effects/flash_cube_effect.h b/src/gpu/effects/flash_cube_effect.h
index 7089af2..5faeb00 100644
--- a/src/gpu/effects/flash_cube_effect.h
+++ b/src/gpu/effects/flash_cube_effect.h
@@ -22,8 +22,6 @@ class FlashCubeEffect : public Effect {
   TextureManager texture_manager_;
   Scene scene_;
   Camera camera_;
-  int width_ = 1280;
-  int height_ = 720;
   float last_beat_ = 0.0f;
   float flash_intensity_ = 0.0f;
 };
diff --git a/src/gpu/effects/gaussian_blur_effect.cc b/src/gpu/effects/gaussian_blur_effect.cc
index 0cc4821..697be88 100644
--- a/src/gpu/effects/gaussian_blur_effect.cc
+++ b/src/gpu/effects/gaussian_blur_effect.cc
@@ -18,7 +18,6 @@ GaussianBlurEffect::GaussianBlurEffect(const GpuContext& ctx,
     : PostProcessEffect(ctx), params_(params) {
   pipeline_ = create_post_process_pipeline(ctx_.device, ctx_.format,
                                            gaussian_blur_shader_wgsl);
-  uniforms_.init(ctx_.device);
   params_buffer_.init(ctx_.device);
 }
 
diff --git a/src/gpu/effects/heptagon_effect.cc b/src/gpu/effects/heptagon_effect.cc
index b77ec53..7b0702d 100644
--- a/src/gpu/effects/heptagon_effect.cc
+++ b/src/gpu/effects/heptagon_effect.cc
@@ -5,39 +5,25 @@
 #include "gpu/gpu.h"
 #include "util/mini_math.h"
 
-// Match CommonUniforms struct from main_shader.wgsl.
-// Padded to 32 bytes for WGSL alignment rules.
-struct HeptagonUniforms {
-  vec2 resolution;    // 8 bytes
-  float _pad0[2];     // 8 bytes padding to align next float
-  float aspect_ratio; // 4 bytes
-  float time;         // 4 bytes
-  float beat;         // 4 bytes
-  float audio_intensity; // 4 bytes
-};
-static_assert(sizeof(HeptagonUniforms) == 32,
-              "HeptagonUniforms must be 32 bytes for WGSL alignment");
-
 // --- HeptagonEffect ---
 HeptagonEffect::HeptagonEffect(const GpuContext& ctx) : Effect(ctx) {
-  uniforms_ =
-      gpu_create_buffer(ctx_.device, sizeof(HeptagonUniforms),
-                        WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst);
-  ResourceBinding bindings[] = {{uniforms_, WGPUBufferBindingType_Uniform}};
+  // uniforms_ is initialized by Effect base class
+  ResourceBinding bindings[] = {{uniforms_.get(), WGPUBufferBindingType_Uniform}};
   pass_ = gpu_create_render_pass(ctx_.device, ctx_.format, main_shader_wgsl,
                                  bindings, 1);
   pass_.vertex_count = 21;
 }
 void HeptagonEffect::render(WGPURenderPassEncoder pass, float t, float b,
                             float i, float a) {
-  HeptagonUniforms u = {
+  CommonPostProcessUniforms u = {
       .resolution = {(float)width_, (float)height_},
+      ._pad = {0.0f, 0.0f},
       .aspect_ratio = a,
       .time = t,
       .beat = b,
       .audio_intensity = i,
   };
-  wgpuQueueWriteBuffer(ctx_.queue, uniforms_.buffer, 0, &u, sizeof(u));
+  uniforms_.update(ctx_.queue, u);
   wgpuRenderPassEncoderSetPipeline(pass, pass_.pipeline);
   wgpuRenderPassEncoderSetBindGroup(pass, 0, pass_.bind_group, 0, nullptr);
   wgpuRenderPassEncoderDraw(pass, pass_.vertex_count, 1, 0, 0);
diff --git a/src/gpu/effects/moving_ellipse_effect.cc b/src/gpu/effects/moving_ellipse_effect.cc
index 945f807..9866f20 100644
--- a/src/gpu/effects/moving_ellipse_effect.cc
+++ b/src/gpu/effects/moving_ellipse_effect.cc
@@ -7,10 +7,8 @@
 
 // --- MovingEllipseEffect ---
 MovingEllipseEffect::MovingEllipseEffect(const GpuContext& ctx) : Effect(ctx) {
-  uniforms_ =
-      gpu_create_buffer(ctx_.device, sizeof(CommonPostProcessUniforms),
-                        WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst);
-  ResourceBinding bindings[] = {{uniforms_, WGPUBufferBindingType_Uniform}};
+  // uniforms_ is initialized by Effect base class
+  ResourceBinding bindings[] = {{uniforms_.get(), WGPUBufferBindingType_Uniform}};
   pass_ = gpu_create_render_pass(ctx_.device, ctx_.format, ellipse_shader_wgsl,
                                  bindings, 1);
   pass_.vertex_count = 3;
@@ -19,12 +17,13 @@ void MovingEllipseEffect::render(WGPURenderPassEncoder pass, float t, float b,
                                  float i, float a) {
   const CommonPostProcessUniforms u = {
       .resolution = {(float)width_, (float)height_},
+      ._pad = {0.0f, 0.0f},
       .aspect_ratio = a,
       .time = t,
       .beat = b,
       .audio_intensity = i,
   };
-  wgpuQueueWriteBuffer(ctx_.queue, uniforms_.buffer, 0, &u, sizeof(u));
+  uniforms_.update(ctx_.queue, u);
   wgpuRenderPassEncoderSetPipeline(pass, pass_.pipeline);
   wgpuRenderPassEncoderSetBindGroup(pass, 0, pass_.bind_group, 0, nullptr);
   wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0);
diff --git a/src/gpu/effects/particle_spray_effect.cc b/src/gpu/effects/particle_spray_effect.cc
index 3fd2590..a435884 100644
--- a/src/gpu/effects/particle_spray_effect.cc
+++ b/src/gpu/effects/particle_spray_effect.cc
@@ -8,7 +8,6 @@
 
 // --- ParticleSprayEffect ---
 ParticleSprayEffect::ParticleSprayEffect(const GpuContext& ctx) : Effect(ctx) {
-  uniforms_.init(ctx_.device);
   std::vector<Particle> init_p(NUM_PARTICLES);
   for (Particle& p : init_p)
     p.pos[3] = 0.0f;
diff --git a/src/gpu/effects/particles_effect.cc b/src/gpu/effects/particles_effect.cc
index 01f90a5..cd0df74 100644
--- a/src/gpu/effects/particles_effect.cc
+++ b/src/gpu/effects/particles_effect.cc
@@ -8,7 +8,6 @@
 
 // --- ParticlesEffect ---
 ParticlesEffect::ParticlesEffect(const GpuContext& ctx) : Effect(ctx) {
-  uniforms_.init(ctx_.device);
   std::vector<Particle> init_p(NUM_PARTICLES);
   particles_buffer_ = gpu_create_buffer(
       ctx_.device, sizeof(Particle) * NUM_PARTICLES,
diff --git a/src/gpu/effects/passthrough_effect.cc b/src/gpu/effects/passthrough_effect.cc
index 93cf948..01d557a 100644
--- a/src/gpu/effects/passthrough_effect.cc
+++ b/src/gpu/effects/passthrough_effect.cc
@@ -7,7 +7,6 @@
 // --- PassthroughEffect ---
 PassthroughEffect::PassthroughEffect(const GpuContext& ctx)
     : PostProcessEffect(ctx) {
-  uniforms_.init(ctx_.device);
   pipeline_ = create_post_process_pipeline(ctx_.device, ctx_.format,
                                            passthrough_shader_wgsl);
 }
diff --git a/src/gpu/effects/post_process_helper.cc b/src/gpu/effects/post_process_helper.cc
index 74e052d..e99467f 100644
--- a/src/gpu/effects/post_process_helper.cc
+++ b/src/gpu/effects/post_process_helper.cc
@@ -4,16 +4,19 @@
 #include "post_process_helper.h"
 #include "../demo_effects.h"
 #include "gpu/gpu.h"
+#include "gpu/effects/shader_composer.h"
 #include <cstring>
 
 // Helper to create a standard post-processing pipeline
 WGPURenderPipeline create_post_process_pipeline(WGPUDevice device,
                                                 WGPUTextureFormat format,
                                                 const char* shader_code) {
+  std::string composed_shader = ShaderComposer::Get().Compose({}, shader_code);
+
   WGPUShaderModuleDescriptor shader_desc = {};
   WGPUShaderSourceWGSL wgsl_src = {};
   wgsl_src.chain.sType = WGPUSType_ShaderSourceWGSL;
-  wgsl_src.code = str_view(shader_code);
+  wgsl_src.code = str_view(composed_shader.c_str());
   shader_desc.nextInChain = &wgsl_src.chain;
   WGPUShaderModule shader_module =
       wgpuDeviceCreateShaderModule(device, &shader_desc);
@@ -94,7 +97,8 @@ void pp_update_bind_group(WGPUDevice device, WGPURenderPipeline pipeline,
   bge[2].buffer = uniforms.buffer;
   bge[2].size = uniforms.size;
   bge[3].binding = PP_BINDING_EFFECT_PARAMS;
-  bge[3].buffer = effect_params.buffer ? effect_params.buffer : g_dummy_buffer.buffer;
+  bge[3].buffer =
+      effect_params.buffer ? effect_params.buffer : g_dummy_buffer.buffer;
   bge[3].size = effect_params.buffer ? effect_params.size : g_dummy_buffer.size;
   WGPUBindGroupDescriptor bgd = {
       .layout = bgl, .entryCount = 4, .entries = bge};
diff --git a/src/gpu/effects/post_process_helper.h b/src/gpu/effects/post_process_helper.h
index 77b184f..23cde0e 100644
--- a/src/gpu/effects/post_process_helper.h
+++ b/src/gpu/effects/post_process_helper.h
@@ -19,10 +19,10 @@ static_assert(sizeof(CommonPostProcessUniforms) == 32,
               "CommonPostProcessUniforms must be 32 bytes for WGSL alignment");
 
 // Standard post-process bind group layout (group 0):
-#define PP_BINDING_SAMPLER 0        // Sampler for input texture
-#define PP_BINDING_TEXTURE 1        // Input texture (previous render pass)
-#define PP_BINDING_UNIFORMS 2       // Custom uniforms buffer
-#define PP_BINDING_EFFECT_PARAMS 3  // Effect-specific parameters
+#define PP_BINDING_SAMPLER 0       // Sampler for input texture
+#define PP_BINDING_TEXTURE 1       // Input texture (previous render pass)
+#define PP_BINDING_UNIFORMS 2      // Custom uniforms buffer
+#define PP_BINDING_EFFECT_PARAMS 3 // Effect-specific parameters
 
 // Helper to create a standard post-processing pipeline
 // Uniforms are accessible to both vertex and fragment shaders
diff --git a/src/gpu/effects/shaders.cc b/src/gpu/effects/shaders.cc
index 2e1cfe5..625c5b6 100644
--- a/src/gpu/effects/shaders.cc
+++ b/src/gpu/effects/shaders.cc
@@ -99,6 +99,28 @@ const char* chroma_aberration_shader_wgsl =
 
     SafeGetAsset(AssetId::ASSET_SHADER_CHROMA_ABERRATION);
 
+const char* gen_noise_compute_wgsl = // used by create_gpu_noise_texture()
+
+    SafeGetAsset(AssetId::ASSET_SHADER_COMPUTE_GEN_NOISE);
+
+const char* gen_perlin_compute_wgsl = // used by create_gpu_perlin_texture()
+
+    SafeGetAsset(AssetId::ASSET_SHADER_COMPUTE_GEN_PERLIN);
+
+const char* gen_grid_compute_wgsl = // used by create_gpu_grid_texture()
+
+    SafeGetAsset(AssetId::ASSET_SHADER_COMPUTE_GEN_GRID);
+
+#if !defined(STRIP_GPU_COMPOSITE) // same switch that guards shaders.h externs
+const char* gen_blend_compute_wgsl =
+
+    SafeGetAsset(AssetId::ASSET_SHADER_COMPUTE_GEN_BLEND);
+
+const char* gen_mask_compute_wgsl =
+
+    SafeGetAsset(AssetId::ASSET_SHADER_COMPUTE_GEN_MASK);
+#endif // !defined(STRIP_GPU_COMPOSITE)
+
 const char* vignette_shader_wgsl =
 
     SafeGetAsset(AssetId::ASSET_SHADER_VIGNETTE);
diff --git a/src/gpu/effects/shaders.h b/src/gpu/effects/shaders.h
index 50b4f32..68b8834 100644
--- a/src/gpu/effects/shaders.h
+++ b/src/gpu/effects/shaders.h
@@ -18,3 +18,10 @@ extern const char* solarize_shader_wgsl;
 extern const char* distort_shader_wgsl;
 extern const char* chroma_aberration_shader_wgsl;
 extern const char* vignette_shader_wgsl;
+extern const char* gen_noise_compute_wgsl;  // defined in shaders.cc
+extern const char* gen_perlin_compute_wgsl; // defined in shaders.cc
+extern const char* gen_grid_compute_wgsl;   // defined in shaders.cc
+#if !defined(STRIP_GPU_COMPOSITE) // must mirror the guard in shaders.cc
+extern const char* gen_blend_compute_wgsl;
+extern const char* gen_mask_compute_wgsl;
+#endif // !defined(STRIP_GPU_COMPOSITE)
diff --git a/src/gpu/effects/solarize_effect.cc b/src/gpu/effects/solarize_effect.cc
index d74d708..4f47218 100644
--- a/src/gpu/effects/solarize_effect.cc
+++ b/src/gpu/effects/solarize_effect.cc
@@ -6,7 +6,6 @@
 
 // --- SolarizeEffect ---
 SolarizeEffect::SolarizeEffect(const GpuContext& ctx) : PostProcessEffect(ctx) {
-  uniforms_.init(ctx.device);
   pipeline_ = create_post_process_pipeline(ctx_.device, ctx_.format,
                                            solarize_shader_wgsl);
 }
@@ -23,6 +22,6 @@ void SolarizeEffect::render(WGPURenderPassEncoder pass, float t, float b,
   PostProcessEffect::render(pass, t, b, i, a);
 }
 void SolarizeEffect::update_bind_group(WGPUTextureView v) {
-  pp_update_bind_group(ctx_.device, pipeline_, &bind_group_, v,
-                       uniforms_.get(), {});
+  pp_update_bind_group(ctx_.device, pipeline_, &bind_group_, v, uniforms_.get(),
+                       {});
 }
diff --git a/src/gpu/effects/theme_modulation_effect.cc b/src/gpu/effects/theme_modulation_effect.cc
index f9ae636..b1eff90 100644
--- a/src/gpu/effects/theme_modulation_effect.cc
+++ b/src/gpu/effects/theme_modulation_effect.cc
@@ -6,6 +6,12 @@
 #include "gpu/effects/shaders.h"
 #include <cmath>
 
+struct ThemeModulationParams { // CPU-side copy of the WGSL struct at @binding(3)
+  float theme_brightness;      // recomputed and uploaded on every render() call
+  float _pad[3];               // filler so the struct is 16 bytes (see assert)
+};
+static_assert(sizeof(ThemeModulationParams) == 16, "ThemeModulationParams must be 16 bytes for WGSL alignment");
+
 ThemeModulationEffect::ThemeModulationEffect(const GpuContext& ctx)
     : PostProcessEffect(ctx) {
   const char* shader_code = R"(
@@ -24,7 +30,7 @@ ThemeModulationEffect::ThemeModulationEffect(const GpuContext& ctx)
       audio_intensity: f32,
     };
 
-    struct EffectParams {
+    struct ThemeModulationParams {
       theme_brightness: f32,
       _pad0: f32,
       _pad1: f32,
@@ -34,7 +40,7 @@ ThemeModulationEffect::ThemeModulationEffect(const GpuContext& ctx)
     @group(0) @binding(0) var inputSampler: sampler;
     @group(0) @binding(1) var inputTexture: texture_2d<f32>;
     @group(0) @binding(2) var<uniform> uniforms: CommonUniforms;
-    @group(0) @binding(3) var<uniform> params: EffectParams;
+    @group(0) @binding(3) var<uniform> params: ThemeModulationParams;
 
     @vertex
     fn vs_main(@builtin(vertex_index) vertexIndex: u32) -> VertexOutput {
@@ -61,14 +67,13 @@ ThemeModulationEffect::ThemeModulationEffect(const GpuContext& ctx)
   pipeline_ =
       create_post_process_pipeline(ctx_.device, ctx_.format, shader_code);
 
-  common_uniforms_.init(ctx_.device);
   params_buffer_ = gpu_create_buffer(
       ctx_.device, 16, WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst);
 }
 
 void ThemeModulationEffect::update_bind_group(WGPUTextureView input_view) {
   pp_update_bind_group(ctx_.device, pipeline_, &bind_group_, input_view,
-                       common_uniforms_.get(), params_buffer_);
+                       uniforms_.get(), params_buffer_);
 }
 
 void ThemeModulationEffect::render(WGPURenderPassEncoder pass, float time,
@@ -81,7 +86,7 @@ void ThemeModulationEffect::render(WGPURenderPassEncoder pass, float time,
       .beat = beat,
       .audio_intensity = intensity,
   };
-  common_uniforms_.update(ctx_.queue, u);
+  uniforms_.update(ctx_.queue, u);
 
   // Alternate between bright and dark every 4 seconds (2 pattern changes)
   // Music patterns change every 2 seconds at 120 BPM
@@ -97,8 +102,8 @@ void ThemeModulationEffect::render(WGPURenderPassEncoder pass, float time,
       bright_value + (dark_value - bright_value) * transition;
 
   // Update params buffer
-  float params[4] = {theme_brightness, 0.0f, 0.0f, 0.0f};
-  wgpuQueueWriteBuffer(ctx_.queue, params_buffer_.buffer, 0, params,
+  ThemeModulationParams params = {theme_brightness, {0.0f, 0.0f, 0.0f}};
+  wgpuQueueWriteBuffer(ctx_.queue, params_buffer_.buffer, 0, &params,
                        sizeof(params));
 
   // Render
diff --git a/src/gpu/effects/theme_modulation_effect.h b/src/gpu/effects/theme_modulation_effect.h
index 107529b..713347b 100644
--- a/src/gpu/effects/theme_modulation_effect.h
+++ b/src/gpu/effects/theme_modulation_effect.h
@@ -5,8 +5,8 @@
 
 #pragma once
 #include "gpu/effect.h"
-#include "gpu/uniform_helper.h"
 #include "gpu/effects/post_process_helper.h"
+#include "gpu/uniform_helper.h"
 
 class ThemeModulationEffect : public PostProcessEffect {
  public:
@@ -16,6 +16,5 @@ class ThemeModulationEffect : public PostProcessEffect {
   void update_bind_group(WGPUTextureView input_view) override;
 
  private:
-  UniformBuffer<CommonPostProcessUniforms> common_uniforms_;
   GpuBuffer params_buffer_;
 };
diff --git a/src/gpu/effects/vignette_effect.cc b/src/gpu/effects/vignette_effect.cc
index a4967dd..bba0372 100644
--- a/src/gpu/effects/vignette_effect.cc
+++ b/src/gpu/effects/vignette_effect.cc
@@ -12,7 +12,6 @@ VignetteEffect::VignetteEffect(const GpuContext& ctx)
 VignetteEffect::VignetteEffect(const GpuContext& ctx,
                                const VignetteParams& params)
     : PostProcessEffect(ctx), params_(params) {
-  uniforms_.init(ctx_.device);
   params_buffer_.init(ctx_.device);
   pipeline_ = create_post_process_pipeline(ctx_.device, ctx_.format,
                                            vignette_shader_wgsl);
@@ -33,6 +32,6 @@ void VignetteEffect::render(WGPURenderPassEncoder pass, float t, float b,
 }
 
 void VignetteEffect::update_bind_group(WGPUTextureView v) {
-  pp_update_bind_group(ctx_.device, pipeline_, &bind_group_, v,
-                       uniforms_.get(), params_buffer_.get());
+  pp_update_bind_group(ctx_.device, pipeline_, &bind_group_, v, uniforms_.get(),
+                       params_buffer_.get());
 }
diff --git a/src/gpu/gpu.cc b/src/gpu/gpu.cc
index fde241d..e89a2f0 100644
--- a/src/gpu/gpu.cc
+++ b/src/gpu/gpu.cc
@@ -5,6 +5,7 @@
 #include "gpu.h"
 #include "effect.h"
 #include "gpu/effects/shaders.h"
+#include "gpu/effects/shader_composer.h"
 #include "platform/platform.h"
 
 #include <cassert>
@@ -55,10 +56,13 @@ RenderPass gpu_create_render_pass(WGPUDevice device, WGPUTextureFormat format,
                                   ResourceBinding* bindings, int num_bindings) {
   RenderPass pass = {};
 
+  // Compose shader to resolve #include directives
+  std::string composed_shader = ShaderComposer::Get().Compose({}, shader_code);
+
   // Create Shader Module
   WGPUShaderSourceWGSL wgsl_src = {};
   wgsl_src.chain.sType = WGPUSType_ShaderSourceWGSL;
-  wgsl_src.code = str_view(shader_code);
+  wgsl_src.code = str_view(composed_shader.c_str());
   WGPUShaderModuleDescriptor shader_desc = {};
   shader_desc.nextInChain = &wgsl_src.chain;
   WGPUShaderModule shader_module =
@@ -156,9 +160,12 @@ ComputePass gpu_create_compute_pass(WGPUDevice device, const char* shader_code,
                                     int num_bindings) {
   ComputePass pass = {};
 
+  // Compose shader to resolve #include directives
+  std::string composed_shader = ShaderComposer::Get().Compose({}, shader_code);
+
   WGPUShaderSourceWGSL wgsl_src = {};
   wgsl_src.chain.sType = WGPUSType_ShaderSourceWGSL;
-  wgsl_src.code = str_view(shader_code);
+  wgsl_src.code = str_view(composed_shader.c_str());
   WGPUShaderModuleDescriptor shader_desc = {};
   shader_desc.nextInChain = &wgsl_src.chain;
   WGPUShaderModule shader_module =
diff --git a/src/gpu/texture_manager.cc b/src/gpu/texture_manager.cc
index 0c30c94..dfa6315 100644
--- a/src/gpu/texture_manager.cc
+++ b/src/gpu/texture_manager.cc
@@ -2,7 +2,10 @@
 // It implements the TextureManager.
 
 #include "gpu/texture_manager.h"
+#include "gpu/effects/shader_composer.h"
+#include "platform/platform.h"
 #include <cstdio>
+#include <cstring>
 #include <vector>
 
 #if defined(DEMO_CROSS_COMPILE_WIN32)
@@ -26,6 +29,22 @@ void TextureManager::shutdown() {
     wgpuTextureRelease(pair.second.texture);
   }
   textures_.clear();
+
+  for (auto& pair : compute_pipelines_) {
+    if (pair.second.pipeline) {
+      wgpuComputePipelineRelease(pair.second.pipeline);
+    }
+  }
+  compute_pipelines_.clear();
+
+#if !defined(STRIP_GPU_COMPOSITE)
+  for (auto& pair : samplers_) {
+    if (pair.second) {
+      wgpuSamplerRelease(pair.second);
+    }
+  }
+  samplers_.clear();
+#endif
 }
 
 void TextureManager::create_procedural_texture(
@@ -112,3 +131,570 @@ WGPUTextureView TextureManager::get_texture_view(const std::string& name) {
   }
   return nullptr;
 }
+
+// Returns the compute pipeline cached under func_name; on a cache miss it is
+// built from shader_code (run through ShaderComposer first) and then cached.
+WGPUComputePipeline TextureManager::get_or_create_compute_pipeline(
+    const std::string& func_name, const char* shader_code,
+    size_t uniform_size, int num_input_textures) {
+  auto it = compute_pipelines_.find(func_name);
+  if (it != compute_pipelines_.end()) {
+    return it->second.pipeline; // cache hit: other arguments are not compared
+  }
+
+  // Create new pipeline
+  ShaderComposer& composer = ShaderComposer::Get();
+  std::string resolved_shader = composer.Compose({}, shader_code);
+
+  WGPUShaderSourceWGSL wgsl_src = {};
+  wgsl_src.chain.sType = WGPUSType_ShaderSourceWGSL;
+  wgsl_src.code = str_view(resolved_shader.c_str());
+  WGPUShaderModuleDescriptor shader_desc = {};
+  shader_desc.nextInChain = &wgsl_src.chain;
+  WGPUShaderModule shader_module =
+      wgpuDeviceCreateShaderModule(device_, &shader_desc);
+
+  // Dynamic bind group layout: binding 0 = output storage texture, binding 1 =
+  // uniform buffer, bindings 2..(2 + num_input_textures - 1) = input textures,
+  // binding (2 + num_input_textures) = sampler (only if inputs > 0).
+  const int max_entries = 2 + num_input_textures + (num_input_textures > 0 ? 1 : 0);
+  std::vector<WGPUBindGroupLayoutEntry> bgl_entries(max_entries);
+
+  // Binding 0: Output storage texture
+  bgl_entries[0].binding = 0;
+  bgl_entries[0].visibility = WGPUShaderStage_Compute;
+  bgl_entries[0].storageTexture.access = WGPUStorageTextureAccess_WriteOnly;
+  bgl_entries[0].storageTexture.format = WGPUTextureFormat_RGBA8Unorm;
+  bgl_entries[0].storageTexture.viewDimension = WGPUTextureViewDimension_2D;
+
+  // Binding 1: Uniform buffer
+  bgl_entries[1].binding = 1;
+  bgl_entries[1].visibility = WGPUShaderStage_Compute;
+  bgl_entries[1].buffer.type = WGPUBufferBindingType_Uniform;
+  bgl_entries[1].buffer.minBindingSize = uniform_size;
+
+  // Binding 2+: Input textures
+  for (int i = 0; i < num_input_textures; ++i) {
+    bgl_entries[2 + i].binding = 2 + i;
+    bgl_entries[2 + i].visibility = WGPUShaderStage_Compute;
+    bgl_entries[2 + i].texture.sampleType = WGPUTextureSampleType_Float;
+    bgl_entries[2 + i].texture.viewDimension = WGPUTextureViewDimension_2D;
+  }
+
+  // Binding N: Sampler (if inputs exist)
+  if (num_input_textures > 0) {
+    bgl_entries[2 + num_input_textures].binding = 2 + num_input_textures;
+    bgl_entries[2 + num_input_textures].visibility = WGPUShaderStage_Compute;
+    bgl_entries[2 + num_input_textures].sampler.type = WGPUSamplerBindingType_Filtering;
+  }
+
+  WGPUBindGroupLayoutDescriptor bgl_desc = {};
+  bgl_desc.entryCount = max_entries;
+  bgl_desc.entries = bgl_entries.data();
+  WGPUBindGroupLayout bind_group_layout =
+      wgpuDeviceCreateBindGroupLayout(device_, &bgl_desc);
+
+  WGPUPipelineLayoutDescriptor pl_desc = {};
+  pl_desc.bindGroupLayoutCount = 1;
+  pl_desc.bindGroupLayouts = &bind_group_layout;
+  WGPUPipelineLayout pipeline_layout =
+      wgpuDeviceCreatePipelineLayout(device_, &pl_desc);
+
+  WGPUComputePipelineDescriptor pipeline_desc = {};
+  pipeline_desc.layout = pipeline_layout;
+  pipeline_desc.compute.module = shader_module;
+  pipeline_desc.compute.entryPoint = str_view("main"); // shader must export "main"
+
+  WGPUComputePipeline pipeline =
+      wgpuDeviceCreateComputePipeline(device_, &pipeline_desc);
+
+  wgpuPipelineLayoutRelease(pipeline_layout);
+  wgpuBindGroupLayoutRelease(bind_group_layout);
+  wgpuShaderModuleRelease(shader_module);
+
+  // Cache pipeline (stored as-is; the handle is not checked for null here)
+  ComputePipelineInfo info = {pipeline, shader_code, uniform_size, num_input_textures};
+  compute_pipelines_[func_name] = info;
+
+  return pipeline;
+}
+
+// Runs the cached compute pipeline `func_name` once, writing into `target`.
+// Binding 0 is a view of `target` (storage texture) and binding 1 a temporary
+// uniform buffer initialised from `uniform_data`. The work is submitted on
+// queue_ as ceil(width / 8) x ceil(height / 8) workgroups. Returns silently
+// when no usable pipeline is cached or when `target` / `uniform_data` is null
+// or `uniform_size` is zero.
+void TextureManager::dispatch_compute(const std::string& func_name,
+                                      WGPUTexture target,
+                                      const GpuProceduralParams& params,
+                                      const void* uniform_data,
+                                      size_t uniform_size) {
+  auto it = compute_pipelines_.find(func_name);
+  if (it == compute_pipelines_.end() || !it->second.pipeline) {
+    return; // Pipeline not created yet
+  }
+  if (!target || !uniform_data || uniform_size == 0) {
+    return; // Nothing valid to bind; the memcpy below needs a real source
+  }
+
+  WGPUComputePipeline pipeline = it->second.pipeline;
+
+  // Create uniform buffer
+  WGPUBufferDescriptor buf_desc = {};
+  buf_desc.size = uniform_size;
+  buf_desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst;
+  buf_desc.mappedAtCreation = WGPUOptionalBool_True;
+  WGPUBuffer uniform_buf = wgpuDeviceCreateBuffer(device_, &buf_desc);
+  void* mapped = wgpuBufferGetMappedRange(uniform_buf, 0, uniform_size);
+  memcpy(mapped, uniform_data, uniform_size);
+  wgpuBufferUnmap(uniform_buf);
+
+  // Create storage texture view
+  WGPUTextureViewDescriptor view_desc = {};
+  view_desc.format = WGPUTextureFormat_RGBA8Unorm;
+  view_desc.dimension = WGPUTextureViewDimension_2D;
+  view_desc.mipLevelCount = 1;
+  view_desc.arrayLayerCount = 1;
+  WGPUTextureView target_view = wgpuTextureCreateView(target, &view_desc);
+
+  // Use the pipeline's own group-0 layout instead of rebuilding a copy by
+  // hand, so this bind group cannot drift out of sync with the layout that
+  // get_or_create_compute_pipeline() baked into the pipeline.
+  WGPUBindGroupLayout bind_group_layout =
+      wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
+
+  // Create bind group
+  WGPUBindGroupEntry bg_entries[2] = {};
+  bg_entries[0].binding = 0; // output storage texture
+  bg_entries[0].textureView = target_view;
+  bg_entries[1].binding = 1; // uniform buffer
+  bg_entries[1].buffer = uniform_buf;
+  bg_entries[1].size = uniform_size;
+
+  WGPUBindGroupDescriptor bg_desc = {};
+  bg_desc.layout = bind_group_layout;
+  bg_desc.entryCount = 2;
+  bg_desc.entries = bg_entries;
+  WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device_, &bg_desc);
+
+  // Dispatch compute
+  WGPUCommandEncoderDescriptor enc_desc = {};
+  WGPUCommandEncoder encoder =
+      wgpuDeviceCreateCommandEncoder(device_, &enc_desc);
+  WGPUComputePassEncoder pass =
+      wgpuCommandEncoderBeginComputePass(encoder, nullptr);
+  wgpuComputePassEncoderSetPipeline(pass, pipeline);
+  wgpuComputePassEncoderSetBindGroup(pass, 0, bind_group, 0, nullptr);
+  // One workgroup per 8x8 tile, rounded up to cover partial tiles.
+  // NOTE(review): assumes the shaders declare @workgroup_size(8, 8) - confirm.
+  wgpuComputePassEncoderDispatchWorkgroups(pass, (params.width + 7) / 8,
+                                           (params.height + 7) / 8, 1);
+  wgpuComputePassEncoderEnd(pass);
+
+  WGPUCommandBufferDescriptor cmd_desc = {};
+  WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, &cmd_desc);
+  wgpuQueueSubmit(queue_, 1, &cmd);
+
+  // Cleanup: drop every handle created in this call. The cached pipeline is
+  // owned by compute_pipelines_ and is left alone.
+  wgpuCommandBufferRelease(cmd);
+  wgpuCommandEncoderRelease(encoder);
+  wgpuComputePassEncoderRelease(pass);
+  wgpuBindGroupRelease(bind_group);
+  wgpuBindGroupLayoutRelease(bind_group_layout);
+  wgpuBufferRelease(uniform_buf);
+  wgpuTextureViewRelease(target_view);
+}
+
+// Builds an RGBA8 texture filled by the "gen_noise" compute shader and stores
+// it in textures_[name]. params.params[0] is the seed and [1] the frequency;
+// NOTE(review): the 0.0 / 1.0 fallbacks for missing entries are new - confirm.
+void TextureManager::create_gpu_noise_texture(
+    const std::string& name, const GpuProceduralParams& params) {
+  extern const char* gen_noise_compute_wgsl;
+  get_or_create_compute_pipeline("gen_noise", gen_noise_compute_wgsl, 16);
+
+  WGPUTextureDescriptor tex_desc = {};
+  tex_desc.usage =
+      WGPUTextureUsage_StorageBinding | WGPUTextureUsage_TextureBinding;
+  tex_desc.dimension = WGPUTextureDimension_2D;
+  tex_desc.size = {(uint32_t)params.width, (uint32_t)params.height, 1};
+  tex_desc.format = WGPUTextureFormat_RGBA8Unorm;
+  tex_desc.mipLevelCount = 1;
+  tex_desc.sampleCount = 1;
+  WGPUTexture texture = wgpuDeviceCreateTexture(device_, &tex_desc);
+
+  struct NoiseParams { // 16 bytes: the uniform size requested above
+    uint32_t width, height;
+    float seed, frequency;
+  };
+  NoiseParams uniforms = {(uint32_t)params.width, (uint32_t)params.height,
+                          params.num_params > 0 ? params.params[0] : 0.0f,
+                          params.num_params > 1 ? params.params[1] : 1.0f};
+  dispatch_compute("gen_noise", texture, params, &uniforms, sizeof(NoiseParams));
+
+  WGPUTextureViewDescriptor view_desc = {};
+  view_desc.format = WGPUTextureFormat_RGBA8Unorm;
+  view_desc.dimension = WGPUTextureViewDimension_2D;
+  view_desc.mipLevelCount = 1;
+  view_desc.arrayLayerCount = 1;
+  WGPUTextureView view = wgpuTextureCreateView(texture, &view_desc);
+  GpuTexture gpu_tex;
+  gpu_tex.texture = texture;
+  gpu_tex.view = view;
+  gpu_tex.width = params.width;
+  gpu_tex.height = params.height;
+  textures_[name] = gpu_tex;
+#if !defined(STRIP_ALL)
+  printf("Generated GPU noise texture: %s (%dx%d)\n", name.c_str(),
+         params.width, params.height);
+#endif
+}
+
+// Builds an RGBA8 texture filled by the "gen_perlin" compute shader and stores
+// it in textures_[name]. params.params = {seed, frequency, amplitude, decay,
+// octaves}; NOTE(review): the seed/frequency fallbacks are new - confirm.
+void TextureManager::create_gpu_perlin_texture(
+    const std::string& name, const GpuProceduralParams& params) {
+  extern const char* gen_perlin_compute_wgsl;
+  get_or_create_compute_pipeline("gen_perlin", gen_perlin_compute_wgsl, 32);
+
+  WGPUTextureDescriptor tex_desc = {};
+  tex_desc.usage =
+      WGPUTextureUsage_StorageBinding | WGPUTextureUsage_TextureBinding;
+  tex_desc.dimension = WGPUTextureDimension_2D;
+  tex_desc.size = {(uint32_t)params.width, (uint32_t)params.height, 1};
+  tex_desc.format = WGPUTextureFormat_RGBA8Unorm;
+  tex_desc.mipLevelCount = 1;
+  tex_desc.sampleCount = 1;
+  WGPUTexture texture = wgpuDeviceCreateTexture(device_, &tex_desc);
+
+  struct PerlinParams { // 32 bytes: the uniform size requested above
+    uint32_t width, height;
+    float seed, frequency;
+    float amplitude, amplitude_decay;
+    uint32_t octaves;
+    float _pad0;
+  };
+  PerlinParams uniforms = {
+      (uint32_t)params.width,
+      (uint32_t)params.height,
+      params.num_params > 0 ? params.params[0] : 0.0f,
+      params.num_params > 1 ? params.params[1] : 1.0f,
+      params.num_params > 2 ? params.params[2] : 1.0f,
+      params.num_params > 3 ? params.params[3] : 0.5f,
+      params.num_params > 4 ? (uint32_t)params.params[4] : 4u,
+      0.0f};
+  dispatch_compute("gen_perlin", texture, params, &uniforms,
+                   sizeof(PerlinParams));
+
+  WGPUTextureViewDescriptor view_desc = {};
+  view_desc.format = WGPUTextureFormat_RGBA8Unorm;
+  view_desc.dimension = WGPUTextureViewDimension_2D;
+  view_desc.mipLevelCount = 1;
+  view_desc.arrayLayerCount = 1;
+  WGPUTextureView view = wgpuTextureCreateView(texture, &view_desc);
+
+  GpuTexture gpu_tex;
+  gpu_tex.texture = texture;
+  gpu_tex.view = view;
+  gpu_tex.width = params.width;
+  gpu_tex.height = params.height;
+  textures_[name] = gpu_tex;
+
+#if !defined(STRIP_ALL)
+  printf("Generated GPU perlin texture: %s (%dx%d)\n", name.c_str(),
+         params.width, params.height);
+#endif
+}
+
+// Builds an RGBA8 texture filled by the "gen_grid" compute shader and stores
+// it in textures_[name] (an existing entry is overwritten, not released here).
+void TextureManager::create_gpu_grid_texture(
+    const std::string& name, const GpuProceduralParams& params) {
+  extern const char* gen_grid_compute_wgsl;
+  get_or_create_compute_pipeline("gen_grid", gen_grid_compute_wgsl, 16);
+
+  WGPUTextureDescriptor tex_desc = {};
+  tex_desc.usage =
+      WGPUTextureUsage_StorageBinding | WGPUTextureUsage_TextureBinding;
+  tex_desc.dimension = WGPUTextureDimension_2D;
+  tex_desc.size = {(uint32_t)params.width, (uint32_t)params.height, 1};
+  tex_desc.format = WGPUTextureFormat_RGBA8Unorm;
+  tex_desc.mipLevelCount = 1;
+  tex_desc.sampleCount = 1;
+  WGPUTexture texture = wgpuDeviceCreateTexture(device_, &tex_desc);
+
+  struct GridParams {
+    uint32_t width;
+    uint32_t height;
+    uint32_t grid_size;
+    uint32_t thickness;
+  };
+  GridParams uniforms = {
+      (uint32_t)params.width, (uint32_t)params.height,
+      params.num_params > 0 ? (uint32_t)params.params[0] : 32u, // grid_size
+      params.num_params > 1 ? (uint32_t)params.params[1] : 2u}; // thickness
+  dispatch_compute("gen_grid", texture, params, &uniforms, sizeof(GridParams));
+
+  WGPUTextureViewDescriptor view_desc = {};
+  view_desc.format = WGPUTextureFormat_RGBA8Unorm;
+  view_desc.dimension = WGPUTextureViewDimension_2D;
+  view_desc.mipLevelCount = 1;
+  view_desc.arrayLayerCount = 1;
+  WGPUTextureView view = wgpuTextureCreateView(texture, &view_desc);
+  GpuTexture gpu_tex;
+  gpu_tex.texture = texture;
+  gpu_tex.view = view;
+  gpu_tex.width = params.width;
+  gpu_tex.height = params.height;
+  textures_[name] = gpu_tex;
+#if !defined(STRIP_ALL)
+  printf("Generated GPU grid texture: %s (%dx%d)\n", name.c_str(),
+         params.width, params.height);
+#endif
+}
+
+#if !defined(STRIP_GPU_COMPOSITE)
+// Returns the sampler cached for `type`, creating and caching it on first
+// use. Cached samplers stay in samplers_ until shutdown() releases them.
+WGPUSampler TextureManager::get_or_create_sampler(SamplerType type) {
+  auto it = samplers_.find(type);
+  if (it != samplers_.end()) {
+    return it->second;
+  }
+
+  WGPUSamplerDescriptor desc = {};
+  desc.lodMinClamp = 0.0f;
+  desc.lodMaxClamp = 1.0f;
+  desc.maxAnisotropy = 1;
+  switch (type) { // no default: other values keep the zero-initialised modes
+  case SamplerType::LinearClamp:
+    desc.addressModeU = WGPUAddressMode_ClampToEdge;
+    desc.addressModeV = WGPUAddressMode_ClampToEdge;
+    desc.magFilter = WGPUFilterMode_Linear;
+    desc.minFilter = WGPUFilterMode_Linear;
+    desc.mipmapFilter = WGPUMipmapFilterMode_Linear;
+    break;
+  case SamplerType::LinearRepeat:
+    desc.addressModeU = WGPUAddressMode_Repeat;
+    desc.addressModeV = WGPUAddressMode_Repeat;
+    desc.magFilter = WGPUFilterMode_Linear;
+    desc.minFilter = WGPUFilterMode_Linear;
+    desc.mipmapFilter = WGPUMipmapFilterMode_Linear;
+    break;
+  case SamplerType::NearestClamp:
+    desc.addressModeU = WGPUAddressMode_ClampToEdge;
+    desc.addressModeV = WGPUAddressMode_ClampToEdge;
+    desc.magFilter = WGPUFilterMode_Nearest;
+    desc.minFilter = WGPUFilterMode_Nearest;
+    desc.mipmapFilter = WGPUMipmapFilterMode_Nearest;
+    break;
+  case SamplerType::NearestRepeat:
+    desc.addressModeU = WGPUAddressMode_Repeat;
+    desc.addressModeV = WGPUAddressMode_Repeat;
+    desc.magFilter = WGPUFilterMode_Nearest;
+    desc.minFilter = WGPUFilterMode_Nearest;
+    desc.mipmapFilter = WGPUMipmapFilterMode_Nearest;
+    break;
+  }
+  WGPUSampler sampler = wgpuDeviceCreateSampler(device_, &desc);
+  samplers_[type] = sampler; // addressModeW is never set (stays zero)
+  return sampler;
+}
+
+// Multi-input variant of dispatch_compute(): runs the cached pipeline
+// `func_name` once over `target`, additionally binding `input_views` and one
+// sampler shared by all of them.
+//
+// Bind group (group 0), N = input_views.size():
+//   0               view of `target` (output storage texture)
+//   1               temporary uniform buffer copied from `uniform_data`
+//   2 .. 2 + N - 1  input_views[0 .. N - 1]
+//   2 + N           sampler for `sampler_type` (only when N > 0)
+//
+// Only params.width / params.height are read; they size the dispatch as
+// ceil(width / 8) x ceil(height / 8) workgroups submitted on queue_.
+// Returns silently when no usable pipeline is cached, when `target` or
+// `uniform_data` is null, when `uniform_size` is zero, or when any input
+// view is null.
+void TextureManager::dispatch_composite(
+    const std::string& func_name, WGPUTexture target,
+    const GpuProceduralParams& params, const void* uniform_data,
+    size_t uniform_size, const std::vector<WGPUTextureView>& input_views,
+    SamplerType sampler_type) {
+  auto it = compute_pipelines_.find(func_name);
+  if (it == compute_pipelines_.end() || !it->second.pipeline) {
+    return; // Pipeline not created yet
+  }
+  if (!target || !uniform_data || uniform_size == 0) {
+    return; // Nothing valid to bind; the memcpy below needs a real source
+  }
+  for (WGPUTextureView input : input_views) {
+    if (!input) {
+      return; // A null input view cannot be bound
+    }
+  }
+
+  WGPUComputePipeline pipeline = it->second.pipeline;
+  int num_inputs = (int)input_views.size();
+
+  // Create uniform buffer
+  WGPUBufferDescriptor buf_desc = {};
+  buf_desc.size = uniform_size;
+  buf_desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst;
+  buf_desc.mappedAtCreation = WGPUOptionalBool_True;
+  WGPUBuffer uniform_buf = wgpuDeviceCreateBuffer(device_, &buf_desc);
+  void* mapped = wgpuBufferGetMappedRange(uniform_buf, 0, uniform_size);
+  memcpy(mapped, uniform_data, uniform_size);
+  wgpuBufferUnmap(uniform_buf);
+
+  // Create storage texture view
+  WGPUTextureViewDescriptor view_desc = {};
+  view_desc.format = WGPUTextureFormat_RGBA8Unorm;
+  view_desc.dimension = WGPUTextureViewDimension_2D;
+  view_desc.mipLevelCount = 1;
+  view_desc.arrayLayerCount = 1;
+  WGPUTextureView target_view = wgpuTextureCreateView(target, &view_desc);
+
+  // Dynamic bind group
+  const int max_entries = 2 + num_inputs + (num_inputs > 0 ? 1 : 0);
+  std::vector<WGPUBindGroupEntry> bg_entries(max_entries);
+
+  // Binding 0: Output texture
+  bg_entries[0].binding = 0;
+  bg_entries[0].textureView = target_view;
+
+  // Binding 1: Uniform buffer
+  bg_entries[1].binding = 1;
+  bg_entries[1].buffer = uniform_buf;
+  bg_entries[1].size = uniform_size;
+
+  // Binding 2+: Input textures
+  for (int i = 0; i < num_inputs; ++i) {
+    bg_entries[2 + i].binding = 2 + i;
+    bg_entries[2 + i].textureView = input_views[i];
+  }
+
+  // Binding N: Sampler. A single sampler from the samplers_ cache serves
+  // every input texture; it is owned by the cache and not released here.
+  if (num_inputs > 0) {
+    bg_entries[2 + num_inputs].binding = 2 + num_inputs;
+    bg_entries[2 + num_inputs].sampler = get_or_create_sampler(sampler_type);
+  }
+
+  // Use the pipeline's own group-0 layout instead of rebuilding a copy by
+  // hand, so this bind group cannot drift out of sync with the layout that
+  // get_or_create_compute_pipeline() baked into the pipeline. The returned
+  // handle is a fresh reference and is released in the cleanup below.
+  WGPUBindGroupLayout bind_group_layout =
+      wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
+
+  WGPUBindGroupDescriptor bg_desc = {};
+  bg_desc.layout = bind_group_layout;
+  bg_desc.entryCount = max_entries;
+  bg_desc.entries = bg_entries.data();
+  WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device_, &bg_desc);
+
+  // Dispatch compute
+  WGPUCommandEncoderDescriptor enc_desc = {};
+  WGPUCommandEncoder encoder =
+      wgpuDeviceCreateCommandEncoder(device_, &enc_desc);
+  WGPUComputePassEncoder pass =
+      wgpuCommandEncoderBeginComputePass(encoder, nullptr);
+  wgpuComputePassEncoderSetPipeline(pass, pipeline);
+  wgpuComputePassEncoderSetBindGroup(pass, 0, bind_group, 0, nullptr);
+  // One workgroup per 8x8 tile, rounded up to cover partial tiles.
+  // NOTE(review): assumes the shaders declare @workgroup_size(8, 8) - confirm.
+  wgpuComputePassEncoderDispatchWorkgroups(pass, (params.width + 7) / 8,
+                                           (params.height + 7) / 8, 1);
+  wgpuComputePassEncoderEnd(pass);
+
+  WGPUCommandBufferDescriptor cmd_desc = {};
+  WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, &cmd_desc);
+  wgpuQueueSubmit(queue_, 1, &cmd);
+
+  // Cleanup: drop every handle created in this call. The cached pipeline and
+  // sampler are owned by the manager's caches and are left alone.
+  wgpuCommandBufferRelease(cmd);
+  wgpuCommandEncoderRelease(encoder);
+  wgpuComputePassEncoderRelease(pass);
+  wgpuBindGroupRelease(bind_group);
+  wgpuBindGroupLayoutRelease(bind_group_layout);
+  wgpuBufferRelease(uniform_buf);
+  wgpuTextureViewRelease(target_view);
+}
+
+// Builds an RGBA8 texture by running `shader_code` over the named input
+// textures and stores it in textures_[name]; aborts if an input is missing.
+void TextureManager::create_gpu_composite_texture(
+    const std::string& name, const std::string& shader_func,
+    const char* shader_code, const void* uniform_data, size_t uniform_size,
+    int width, int height, const std::vector<std::string>& input_names,
+    SamplerType sampler) {
+  // Create pipeline if needed
+  get_or_create_compute_pipeline(shader_func, shader_code, uniform_size,
+                                 (int)input_names.size());
+
+  // Resolve input texture views
+  std::vector<WGPUTextureView> input_views;
+  input_views.reserve(input_names.size());
+  for (const auto& input_name : input_names) {
+    WGPUTextureView view = get_texture_view(input_name);
+    if (!view) {
+      fprintf(stderr, "Error: Input texture not found: %s\n",
+              input_name.c_str());
+      return; // no output texture exists yet, so nothing to release
+    }
+    input_views.push_back(view);
+  }
+
+  // Create output texture
+  WGPUTextureDescriptor tex_desc = {};
+  tex_desc.usage =
+      WGPUTextureUsage_StorageBinding | WGPUTextureUsage_TextureBinding;
+  tex_desc.dimension = WGPUTextureDimension_2D;
+  tex_desc.size = {(uint32_t)width, (uint32_t)height, 1};
+  tex_desc.format = WGPUTextureFormat_RGBA8Unorm;
+  tex_desc.mipLevelCount = 1;
+  tex_desc.sampleCount = 1;
+  WGPUTexture texture = wgpuDeviceCreateTexture(device_, &tex_desc);
+
+  // Dispatch composite shader (only width/height of params are consumed)
+  GpuProceduralParams params = {width, height, nullptr, 0};
+  dispatch_composite(shader_func, texture, params, uniform_data, uniform_size,
+                    input_views, sampler);
+  // Create view
+  WGPUTextureViewDescriptor view_desc = {};
+  view_desc.format = WGPUTextureFormat_RGBA8Unorm;
+  view_desc.dimension = WGPUTextureViewDimension_2D;
+  view_desc.mipLevelCount = 1;
+  view_desc.arrayLayerCount = 1;
+  WGPUTextureView view = wgpuTextureCreateView(texture, &view_desc);
+  // Store (overwrites any entry already registered under `name`)
+  GpuTexture gpu_tex;
+  gpu_tex.texture = texture;
+  gpu_tex.view = view;
+  gpu_tex.width = width;
+  gpu_tex.height = height;
+  textures_[name] = gpu_tex;
+
+#if !defined(STRIP_ALL)
+  printf("Generated GPU composite texture: %s (%dx%d, %zu inputs)\n",
+         name.c_str(), width, height, input_names.size());
+#endif
+}
+#endif  // !defined(STRIP_GPU_COMPOSITE)
+
+#if !defined(STRIP_ALL) // lazy lookup-or-generate helper, noise generator only
+WGPUTextureView TextureManager::get_or_generate_gpu_texture(
+    const std::string& name, const GpuProceduralParams& params) {
+  const auto cached = textures_.find(name);
+  if (cached == textures_.end()) {
+    create_gpu_noise_texture(name, params); // registers textures_[name]
+    return textures_[name].view;
+  }
+  return cached->second.view;
+}
+#endif
diff --git a/src/gpu/texture_manager.h b/src/gpu/texture_manager.h
index 23fdbe8..5a2b9f8 100644
--- a/src/gpu/texture_manager.h
+++ b/src/gpu/texture_manager.h
@@ -23,6 +23,13 @@ struct GpuTexture {
   int height;
 };
 
+struct GpuProceduralParams {
+  int width;           // Output texture width in pixels
+  int height;          // Output texture height in pixels
+  const float* params; // Generator-specific float values (may be nullptr)
+  int num_params;      // Number of floats readable through `params`
+};
+
 class TextureManager {
  public:
   void init(WGPUDevice device, WGPUQueue queue);
@@ -36,11 +43,72 @@ class TextureManager {
   void create_texture(const std::string& name, int width, int height,
                       const uint8_t* data);
 
+  // GPU procedural generation
+  void create_gpu_noise_texture(const std::string& name,
+                                const GpuProceduralParams& params);
+  void create_gpu_perlin_texture(const std::string& name,
+                                 const GpuProceduralParams& params);
+  void create_gpu_grid_texture(const std::string& name,
+                               const GpuProceduralParams& params);
+
+#if !defined(STRIP_GPU_COMPOSITE)
+  enum class SamplerType {
+    LinearClamp,
+    LinearRepeat,
+    NearestClamp,
+    NearestRepeat
+  };
+
+  // GPU composite generation (multi-input textures)
+  void create_gpu_composite_texture(const std::string& name,
+                                    const std::string& shader_func,
+                                    const char* shader_code,
+                                    const void* uniform_data,
+                                    size_t uniform_size,
+                                    int width, int height,
+                                    const std::vector<std::string>& input_names,
+                                    SamplerType sampler = SamplerType::LinearClamp);
+#endif
+
+#if !defined(STRIP_ALL)
+  // On-demand lazy generation (stripped in final builds)
+  WGPUTextureView get_or_generate_gpu_texture(const std::string& name,
+                                              const GpuProceduralParams& params);
+#endif
+
   // Retrieves a texture view by name (returns nullptr if not found)
   WGPUTextureView get_texture_view(const std::string& name);
 
  private:
+  struct ComputePipelineInfo {
+    WGPUComputePipeline pipeline;
+    const char* shader_code;
+    size_t uniform_size;
+    int num_input_textures;
+  };
+
+  WGPUComputePipeline get_or_create_compute_pipeline(const std::string& func_name,
+                                                      const char* shader_code,
+                                                      size_t uniform_size,
+                                                      int num_input_textures = 0);
+  void dispatch_compute(const std::string& func_name, WGPUTexture target,
+                       const GpuProceduralParams& params, const void* uniform_data,
+                       size_t uniform_size);
+
+#if !defined(STRIP_GPU_COMPOSITE)
+  void dispatch_composite(const std::string& func_name, WGPUTexture target,
+                         const GpuProceduralParams& params,
+                         const void* uniform_data, size_t uniform_size,
+                         const std::vector<WGPUTextureView>& input_views,
+                         SamplerType sampler_type);
+#endif
+
   WGPUDevice device_;
   WGPUQueue queue_;
   std::map<std::string, GpuTexture> textures_;
+  std::map<std::string, ComputePipelineInfo> compute_pipelines_;
+#if !defined(STRIP_GPU_COMPOSITE)
+  WGPUSampler get_or_create_sampler(SamplerType type);
+  std::map<SamplerType, WGPUSampler> samplers_;
+#endif
 };
diff --git a/src/gpu/uniform_helper.h b/src/gpu/uniform_helper.h
index 151153f..8556c98 100644
--- a/src/gpu/uniform_helper.h
+++ b/src/gpu/uniform_helper.h
@@ -5,7 +5,6 @@
 #pragma once
 
 #include "gpu/gpu.h"
-#include <cstring>
 
 // Generic uniform buffer helper
 // Usage:
diff --git a/src/main.cc b/src/main.cc
index 4c44a78..59001fb 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -160,13 +160,9 @@ int main(int argc, char** argv) {
   }
 #endif /* !defined(STRIP_ALL) */
 
-  // PRE-FILL: Fill ring buffer with initial 200ms before starting audio device
-  // This prevents underrun on first callback
-  g_audio_engine.update(g_music_time, 1.0f / 60.0f);
-  audio_render_ahead(g_music_time,
-                     1.0f / 60.0f); // Fill buffer with lookahead
+  // Pre-fill using same pattern as main loop (100ms)
+  fill_audio_buffer(0.1f, 0.0);
 
-  // Start audio (or render to WAV file)
   audio_start();
   g_last_audio_time = audio_get_playback_time(); // Initialize after start
 
diff --git a/src/test_demo.cc b/src/test_demo.cc
index a438bbc..b8e9381 100644
--- a/src/test_demo.cc
+++ b/src/test_demo.cc
@@ -32,15 +32,23 @@ class PeakMeterEffect : public PostProcessEffect {
       };
 
       struct Uniforms {
-        peak_value: f32,
+        resolution: vec2<f32>,
         _pad0: f32,
         _pad1: f32,
-        _pad2: f32,
+        aspect_ratio: f32,
+        time: f32,
+        beat: f32,
+        audio_intensity: f32,
+      };
+
+      struct EffectParams {
+        unused: f32,
       };
 
       @group(0) @binding(0) var inputSampler: sampler;
       @group(0) @binding(1) var inputTexture: texture_2d<f32>;
       @group(0) @binding(2) var<uniform> uniforms: Uniforms;
+      @group(0) @binding(3) var<uniform> params: EffectParams;
 
       @vertex
       fn vs_main(@builtin(vertex_index) vertexIndex: u32) -> VertexOutput {
@@ -69,7 +77,7 @@ class PeakMeterEffect : public PostProcessEffect {
         // Optimization: Return bar color early (avoids texture sampling for ~5% of pixels)
         if (in_bar_y && in_bar_x) {
           let uv_x = (input.uv.x - bar_x_min) / (bar_x_max - bar_x_min);
-          let factor = step(uv_x, uniforms.peak_value);
+          let factor = step(uv_x, uniforms.audio_intensity);
           return mix(vec4<f32>(0.0, 0.0, 0.0, 1.0), vec4<f32>(1.0, 0.0, 0.0,1.0), factor);
         }
 
@@ -80,24 +88,26 @@ class PeakMeterEffect : public PostProcessEffect {
 
     pipeline_ =
         create_post_process_pipeline(ctx_.device, ctx_.format, shader_code);
-    uniforms_ = gpu_create_buffer(
-        ctx_.device, 16, WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst);
   }
 
   void update_bind_group(WGPUTextureView input_view) {
     pp_update_bind_group(ctx_.device, pipeline_, &bind_group_, input_view,
-                         uniforms_, {});
+                         uniforms_.get(), {});
   }
 
   void render(WGPURenderPassEncoder pass, float time, float beat,
               float peak_value, float aspect_ratio) {
     (void)time;
     (void)beat;
-    (void)aspect_ratio;
 
-    float uniforms[4] = {peak_value, 0.0f, 0.0f, 0.0f};
-    wgpuQueueWriteBuffer(ctx_.queue, uniforms_.buffer, 0, uniforms,
-                         sizeof(uniforms));
+    CommonPostProcessUniforms u = {
+        .resolution = {(float)width_, (float)height_},
+        .aspect_ratio = aspect_ratio,
+        .time = time,
+        .beat = beat,
+        .audio_intensity = peak_value,
+    };
+    uniforms_.update(ctx_.queue, u);
 
     wgpuRenderPassEncoderSetPipeline(pass, pipeline_);
     wgpuRenderPassEncoderSetBindGroup(pass, 0, bind_group_, 0, nullptr);
@@ -209,6 +219,9 @@ int main(int argc, char** argv) {
   platform_state = platform_init(fullscreen_enabled, width, height);
   gpu_init(&platform_state);
 
+  // Load timeline from test_demo.seq
+  LoadTimeline(*gpu_get_main_sequence(), *gpu_get_context());
+
   // Add peak meter visualization effect (renders as final post-process)
 #if !defined(STRIP_ALL)
   const GpuContext* gpu_ctx = gpu_get_context();
@@ -253,9 +266,9 @@ int main(int argc, char** argv) {
     audio_render_ahead(g_music_time, audio_dt * g_tempo_scale);
   };
 
-  // Pre-fill audio buffer
-  g_audio_engine.update(g_music_time, 1.0f / 60.0f);
-  audio_render_ahead(g_music_time, 1.0f / 60.0f);
+  // Pre-fill using same pattern as main loop (100ms)
+  fill_audio_buffer(0.1f, 0.0);
+
   audio_start();
   g_last_audio_time = audio_get_playback_time();
 
diff --git a/src/tests/test_3d_render.cc b/src/tests/test_3d_render.cc
index fa13a43..eee46ba 100644
--- a/src/tests/test_3d_render.cc
+++ b/src/tests/test_3d_render.cc
@@ -220,25 +220,36 @@ int main(int argc, char** argv) {
   g_renderer.resize(platform_state.width, platform_state.height);
 
   g_textures.init(g_device, g_queue);
-  ProceduralTextureDef noise_def;
-  noise_def.width = 256;
-  noise_def.height = 256;
-  noise_def.gen_func = gen_periodic_noise;
-  noise_def.params.push_back(1234.0f);
-  noise_def.params.push_back(16.0f);
-  g_textures.create_procedural_texture("noise", noise_def);
 
+  // GPU Noise texture (replaces CPU procedural)
+  GpuProceduralParams noise_params = {};
+  noise_params.width = 256;
+  noise_params.height = 256;
+  float noise_vals[2] = {1234.0f, 16.0f};
+  noise_params.params = noise_vals;
+  noise_params.num_params = 2;
+  g_textures.create_gpu_noise_texture("noise", noise_params);
   g_renderer.set_noise_texture(g_textures.get_texture_view("noise"));
 
-  ProceduralTextureDef sky_def;
-  sky_def.width = 512;
-  sky_def.height = 256;
-  sky_def.gen_func = procedural::gen_perlin;
-  sky_def.params = {42.0f, 4.0f, 1.0f, 0.5f, 6.0f};
-  g_textures.create_procedural_texture("sky", sky_def);
-
+  // GPU Perlin texture for sky (replaces CPU procedural)
+  GpuProceduralParams sky_params = {};
+  sky_params.width = 512;
+  sky_params.height = 256;
+  float sky_vals[5] = {42.0f, 4.0f, 1.0f, 0.5f, 6.0f};
+  sky_params.params = sky_vals;
+  sky_params.num_params = 5;
+  g_textures.create_gpu_perlin_texture("sky", sky_params);
   g_renderer.set_sky_texture(g_textures.get_texture_view("sky"));
 
+  // GPU Grid texture (new!)
+  GpuProceduralParams grid_params = {};
+  grid_params.width = 256;
+  grid_params.height = 256;
+  float grid_vals[2] = {32.0f, 2.0f}; // grid_size, thickness
+  grid_params.params = grid_vals;
+  grid_params.num_params = 2;
+  g_textures.create_gpu_grid_texture("grid", grid_params);
+
   setup_scene();
 
   g_camera.position = vec3(0, 5, 10);
diff --git a/src/tests/test_demo_effects.cc b/src/tests/test_demo_effects.cc
index d0163c2..0d2b09a 100644
--- a/src/tests/test_demo_effects.cc
+++ b/src/tests/test_demo_effects.cc
@@ -197,6 +197,9 @@ static void test_effect_type_classification() {
 int main() {
   fprintf(stdout, "=== Demo Effects Tests ===\n");
 
+  extern void InitShaderComposer();
+  InitShaderComposer();
+
   test_post_process_effects();
   test_scene_effects();
   test_effect_type_classification();
diff --git a/src/tests/test_effect_base.cc b/src/tests/test_effect_base.cc
index e280e05..612e9da 100644
--- a/src/tests/test_effect_base.cc
+++ b/src/tests/test_effect_base.cc
@@ -249,6 +249,9 @@ static void test_pixel_helpers() {
 int main() {
   fprintf(stdout, "=== Effect Base Tests ===\n");
 
+  extern void InitShaderComposer();
+  InitShaderComposer();
+
   test_webgpu_fixture();
   test_offscreen_render_target();
   test_effect_construction();
diff --git a/src/tests/test_gpu_composite.cc b/src/tests/test_gpu_composite.cc
new file mode 100644
index 0000000..e5ac788
--- /dev/null
+++ b/src/tests/test_gpu_composite.cc
@@ -0,0 +1,130 @@
+// This file is part of the 64k demo project.
+// Tests GPU composite texture generation (Phase 4).
+
+#include "gpu/gpu.h"
+#include "gpu/texture_manager.h"
+#include "platform/platform.h"
+#include <cstdint>
+#include <cstdio>
+#include <string>
+#include <vector>
+
+#if !defined(STRIP_GPU_COMPOSITE)
+
+// Creates three base textures, then blend, mask and two-stage composites.
+// Returns 0 if every composite view exists; each failure path tears down fully.
+int main() {
+  printf("GPU Composite Test: Starting...\n");
+
+  // Initialize GPU
+  PlatformState platform = platform_init(false, 256, 256);
+  if (!platform.window) {
+    fprintf(stderr, "Error: Failed to create window\n");
+    return 1;
+  }
+
+  gpu_init(&platform);
+  const GpuContext* ctx = gpu_get_context();
+
+  extern void InitShaderComposer();
+  InitShaderComposer();
+
+  TextureManager tex_mgr;
+  tex_mgr.init(ctx->device, ctx->queue);
+
+  // Create base textures
+  float noise_params_a[2] = {1234.0f, 4.0f};
+  GpuProceduralParams noise_a = {256, 256, noise_params_a, 2};
+  tex_mgr.create_gpu_noise_texture("noise_a", noise_a);
+
+  float noise_params_b[2] = {5678.0f, 8.0f};
+  GpuProceduralParams noise_b = {256, 256, noise_params_b, 2};
+  tex_mgr.create_gpu_noise_texture("noise_b", noise_b);
+
+  float grid_params[2] = {32.0f, 2.0f};
+  GpuProceduralParams grid = {256, 256, grid_params, 2};
+  tex_mgr.create_gpu_grid_texture("grid", grid);
+
+  printf("SUCCESS: Base textures created (noise_a, noise_b, grid)\n");
+
+  // Test blend composite
+  extern const char* gen_blend_compute_wgsl;
+  struct {
+    uint32_t width, height;
+    float blend_factor, _pad0;
+  } blend_uni = {256, 256, 0.5f, 0.0f};
+
+  std::vector<std::string> blend_inputs = {"noise_a", "noise_b"};
+  tex_mgr.create_gpu_composite_texture("blended", "gen_blend",
+                                       gen_blend_compute_wgsl, &blend_uni,
+                                       sizeof(blend_uni), 256, 256, blend_inputs);
+
+  WGPUTextureView blended_view = tex_mgr.get_texture_view("blended");
+  if (!blended_view) {
+    fprintf(stderr, "Error: Blended texture not created\n");
+    tex_mgr.shutdown();
+    gpu_shutdown();
+    platform_shutdown(&platform);
+    return 1;
+  }
+  printf("SUCCESS: Blend composite created (noise_a + noise_b)\n");
+
+  // Test mask composite
+  extern const char* gen_mask_compute_wgsl;
+  struct {
+    uint32_t width, height;
+  } mask_uni = {256, 256};
+
+  std::vector<std::string> mask_inputs = {"noise_a", "grid"};
+  tex_mgr.create_gpu_composite_texture("masked", "gen_mask", gen_mask_compute_wgsl,
+                                       &mask_uni, sizeof(mask_uni), 256, 256,
+                                       mask_inputs);
+
+  WGPUTextureView masked_view = tex_mgr.get_texture_view("masked");
+  if (!masked_view) {
+    fprintf(stderr, "Error: Masked texture not created\n");
+    tex_mgr.shutdown();
+    gpu_shutdown();
+    platform_shutdown(&platform);
+    return 1;
+  }
+  printf("SUCCESS: Mask composite created (noise_a * grid)\n");
+
+  // Test multi-stage composite (composite of composite)
+  struct {
+    uint32_t width, height;
+    float blend_factor, _pad0;
+  } blend2_uni = {256, 256, 0.7f, 0.0f};
+
+  std::vector<std::string> blend2_inputs = {"blended", "masked"};
+  tex_mgr.create_gpu_composite_texture("final", "gen_blend",
+                                       gen_blend_compute_wgsl, &blend2_uni,
+                                       sizeof(blend2_uni), 256, 256, blend2_inputs);
+
+  WGPUTextureView final_view = tex_mgr.get_texture_view("final");
+  if (!final_view) {
+    fprintf(stderr, "Error: Multi-stage composite not created\n");
+    tex_mgr.shutdown();
+    gpu_shutdown();
+    platform_shutdown(&platform);
+    return 1;
+  }
+  printf("SUCCESS: Multi-stage composite (composite of composites)\n");
+
+  // Cleanup
+  tex_mgr.shutdown();
+  gpu_shutdown();
+  platform_shutdown(&platform);
+
+  printf("All GPU composite tests passed!\n");
+  return 0;
+}
+
+#else
+
+int main() {
+  printf("GPU Composite Test: SKIPPED (STRIP_GPU_COMPOSITE defined)\n");
+  return 0;
+}
+
+#endif
diff --git a/src/tests/test_gpu_procedural.cc b/src/tests/test_gpu_procedural.cc
new file mode 100644
index 0000000..f1bade0
--- /dev/null
+++ b/src/tests/test_gpu_procedural.cc
@@ -0,0 +1,125 @@
+// This file is part of the 64k demo project.
+// Tests GPU procedural texture generation.
+
+#include "gpu/gpu.h"
+#include "gpu/texture_manager.h"
+#include "platform/platform.h"
+#include <cstdio>
+
+// Creates noise, perlin and grid textures at several sizes via TextureManager.
+// Returns 0 if every texture view exists; each failure path tears down fully.
+int main() {
+  printf("GPU Procedural Test: Starting...\n");
+
+  // Minimal GPU initialization for testing
+  PlatformState platform = platform_init(false, 256, 256);
+  if (!platform.window) {
+    fprintf(stderr, "Error: Failed to create window\n");
+    return 1;
+  }
+
+  gpu_init(&platform);
+  const GpuContext* ctx = gpu_get_context();
+
+  // Initialize shader composer (needed for #include resolution)
+  extern void InitShaderComposer();
+  InitShaderComposer();
+
+  // Create TextureManager
+  TextureManager tex_mgr;
+  tex_mgr.init(ctx->device, ctx->queue);
+
+  // Test GPU noise generation
+  GpuProceduralParams params = {};
+  params.width = 256;
+  params.height = 256;
+  float proc_params[2] = {0.0f, 4.0f}; // seed, frequency
+  params.params = proc_params;
+  params.num_params = 2;
+
+  tex_mgr.create_gpu_noise_texture("test_noise", params);
+
+  // Verify texture exists
+  WGPUTextureView view = tex_mgr.get_texture_view("test_noise");
+  if (!view) {
+    fprintf(stderr, "Error: GPU noise texture not created\n");
+    tex_mgr.shutdown();
+    gpu_shutdown();
+    platform_shutdown(&platform);
+    return 1;
+  }
+  printf("SUCCESS: GPU noise texture created (256x256)\n");
+
+  // Test pipeline caching (create second noise texture)
+  tex_mgr.create_gpu_noise_texture("test_noise_2", params);
+  WGPUTextureView view2 = tex_mgr.get_texture_view("test_noise_2");
+  if (!view2) {
+    fprintf(stderr, "Error: Second GPU noise texture not created\n");
+    tex_mgr.shutdown();
+    gpu_shutdown();
+    platform_shutdown(&platform);
+    return 1;
+  }
+  printf("SUCCESS: Pipeline caching works (second noise texture)\n");
+
+  // Test GPU perlin generation
+  float perlin_params[5] = {42.0f, 4.0f, 1.0f, 0.5f, 6.0f};
+  GpuProceduralParams perlin = {512, 256, perlin_params, 5};
+  tex_mgr.create_gpu_perlin_texture("test_perlin", perlin);
+  WGPUTextureView perlin_view = tex_mgr.get_texture_view("test_perlin");
+  if (!perlin_view) {
+    fprintf(stderr, "Error: GPU perlin texture not created\n");
+    tex_mgr.shutdown();
+    gpu_shutdown();
+    platform_shutdown(&platform);
+    return 1;
+  }
+  printf("SUCCESS: GPU perlin texture created (512x256)\n");
+
+  // Test GPU grid generation
+  float grid_params[2] = {32.0f, 2.0f};
+  GpuProceduralParams grid = {256, 256, grid_params, 2};
+  tex_mgr.create_gpu_grid_texture("test_grid", grid);
+  WGPUTextureView grid_view = tex_mgr.get_texture_view("test_grid");
+  if (!grid_view) {
+    fprintf(stderr, "Error: GPU grid texture not created\n");
+    tex_mgr.shutdown();
+    gpu_shutdown();
+    platform_shutdown(&platform);
+    return 1;
+  }
+  printf("SUCCESS: GPU grid texture created (256x256)\n");
+
+  // Test multiple pipelines coexist
+  printf("SUCCESS: All three GPU generators work (unified pipeline system)\n");
+
+  // Test variable-size textures
+  float noise_small[2] = {999.0f, 8.0f};
+  GpuProceduralParams small = {128, 64, noise_small, 2};
+  tex_mgr.create_gpu_noise_texture("noise_128x64", small);
+  if (!tex_mgr.get_texture_view("noise_128x64")) {
+    fprintf(stderr, "Error: Variable-size texture (128x64) not created\n");
+    tex_mgr.shutdown();
+    gpu_shutdown();
+    platform_shutdown(&platform);
+    return 1;
+  }
+
+  float noise_large[2] = {777.0f, 2.0f};
+  GpuProceduralParams large = {1024, 512, noise_large, 2};
+  tex_mgr.create_gpu_noise_texture("noise_1024x512", large);
+  if (!tex_mgr.get_texture_view("noise_1024x512")) {
+    fprintf(stderr, "Error: Variable-size texture (1024x512) not created\n");
+    tex_mgr.shutdown();
+    gpu_shutdown();
+    platform_shutdown(&platform);
+    return 1;
+  }
+  printf("SUCCESS: Variable-size textures work (128x64, 1024x512)\n");
+
+  // Cleanup
+  tex_mgr.shutdown();
+  gpu_shutdown();
+  platform_shutdown(&platform);
+  return 0;
+}
diff --git a/src/tests/test_post_process_helper.cc b/src/tests/test_post_process_helper.cc
index 104bbc3..36d193e 100644
--- a/src/tests/test_post_process_helper.cc
+++ b/src/tests/test_post_process_helper.cc
@@ -182,14 +182,14 @@ static void test_bind_group_update() {
 
   // Create initial bind group
   WGPUBindGroup bind_group = nullptr;
-  pp_update_bind_group(fixture.device(), pipeline, &bind_group, view1,
-                       uniforms, dummy_effect_params_buffer);
+  pp_update_bind_group(fixture.device(), pipeline, &bind_group, view1, uniforms,
+                       dummy_effect_params_buffer);
   assert(bind_group != nullptr && "Initial bind group should be created");
   fprintf(stdout, "  ✓ Initial bind group created\n");
 
   // Update bind group (should release old and create new)
-  pp_update_bind_group(fixture.device(), pipeline, &bind_group, view2,
-                       uniforms, dummy_effect_params_buffer);
+  pp_update_bind_group(fixture.device(), pipeline, &bind_group, view2, uniforms,
+                       dummy_effect_params_buffer);
   assert(bind_group != nullptr && "Updated bind group should be created");
   fprintf(stdout, "  ✓ Bind group updated successfully\n");
 
diff --git a/src/tests/test_shader_compilation.cc b/src/tests/test_shader_compilation.cc
index e2c0adc..a322e8a 100644
--- a/src/tests/test_shader_compilation.cc
+++ b/src/tests/test_shader_compilation.cc
@@ -115,16 +115,19 @@ static bool test_shader_compilation(const char* name, const char* shader_code) {
     return true; // Not a failure, just skipped
   }
 
+  // Compose shader to resolve #include directives
+  std::string composed_shader = ShaderComposer::Get().Compose({}, shader_code);
+
 #if defined(DEMO_CROSS_COMPILE_WIN32)
   WGPUShaderModuleWGSLDescriptor wgsl_desc = {};
   wgsl_desc.chain.sType = WGPUSType_ShaderModuleWGSLDescriptor;
-  wgsl_desc.code = shader_code;
+  wgsl_desc.code = composed_shader.c_str();
   WGPUShaderModuleDescriptor shader_desc = {};
   shader_desc.nextInChain = (const WGPUChainedStruct*)&wgsl_desc.chain;
 #else
   WGPUShaderSourceWGSL wgsl_desc = {};
   wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL;
-  wgsl_desc.code = str_view(shader_code);
+  wgsl_desc.code = str_view(composed_shader.c_str());
   WGPUShaderModuleDescriptor shader_desc = {};
   shader_desc.nextInChain = (const WGPUChainedStruct*)&wgsl_desc.chain;
 #endif
diff --git a/src/util/asset_manager.h b/src/util/asset_manager.h
index 1e0638c..168bfca 100644
--- a/src/util/asset_manager.h
+++ b/src/util/asset_manager.h
@@ -10,6 +10,7 @@ struct AssetRecord {
   size_t size;         // Size of the asset data
   bool is_procedural;  // True if data was dynamically allocated by a procedural
                        // generator
+  bool is_gpu_procedural; // True if GPU compute shader generates texture
   const char* proc_func_name_str; // Name of procedural generation function
                                   // (string literal)
   const float* proc_params; // Parameters for procedural generation (static,
diff --git a/tools/asset_packer.cc b/tools/asset_packer.cc
index 0d26cf6..4aaa0e7 100644
--- a/tools/asset_packer.cc
+++ b/tools/asset_packer.cc
@@ -52,6 +52,7 @@ struct AssetBuildInfo {
   std::string name;
   std::string filename; // Original filename for static assets
   bool is_procedural;
+  bool is_gpu_procedural;
   std::string proc_func_name;     // Function name string
   std::vector<float> proc_params; // Parameters for procedural function
 
@@ -182,9 +183,64 @@ int main(int argc, char* argv[]) {
       info.params_array_name = "ASSET_PROC_PARAMS_" + info.name;
       info.func_name_str_name = "ASSET_PROC_FUNC_STR_" + info.name;
       info.is_procedural = false;
+      info.is_gpu_procedural = false;
 
-      if (compression_type_str.rfind("PROC(", 0) == 0) {
+      if (compression_type_str.rfind("PROC_GPU(", 0) == 0) {
         info.is_procedural = true;
+        info.is_gpu_procedural = true;
+        size_t open_paren = compression_type_str.find('(');
+        size_t close_paren = compression_type_str.rfind(')');
+        if (open_paren == std::string::npos ||
+            close_paren == std::string::npos) {
+          fprintf(stderr,
+                  "Error: Invalid PROC_GPU() syntax for asset: %s, string: [%s]\n",
+                  info.name.c_str(), compression_type_str.c_str());
+          return 1;
+        }
+        std::string func_and_params_str = compression_type_str.substr(
+            open_paren + 1, close_paren - open_paren - 1);
+
+        size_t params_start = func_and_params_str.find(',');
+        if (params_start != std::string::npos) {
+          std::string params_str = func_and_params_str.substr(params_start + 1);
+          info.proc_func_name = func_and_params_str.substr(0, params_start);
+
+          size_t current_pos = 0;
+          while (current_pos < params_str.length()) {
+            size_t comma_pos = params_str.find(',', current_pos);
+            std::string param_val_str =
+                (comma_pos == std::string::npos)
+                    ? params_str.substr(current_pos)
+                    : params_str.substr(current_pos, comma_pos - current_pos);
+            param_val_str.erase(0, param_val_str.find_first_not_of(" \t\r\n"));
+            param_val_str.erase(param_val_str.find_last_not_of(" \t\r\n") + 1);
+            try {
+              info.proc_params.push_back(std::stof(param_val_str));
+            } catch (...) {
+              fprintf(stderr, "Error: Invalid proc param for %s: %s\n",
+                      info.name.c_str(), param_val_str.c_str());
+              return 1;
+            }
+            if (comma_pos == std::string::npos)
+              break;
+            current_pos = comma_pos + 1;
+          }
+        } else {
+          info.proc_func_name = func_and_params_str;
+        }
+
+        // Validate GPU procedural function name
+        if (info.proc_func_name != "gen_noise" &&
+            info.proc_func_name != "gen_perlin" &&
+            info.proc_func_name != "gen_grid") {
+          fprintf(stderr,
+                  "Error: PROC_GPU only supports gen_noise, gen_perlin, gen_grid, got: %s for asset: %s\n",
+                  info.proc_func_name.c_str(), info.name.c_str());
+          return 1;
+        }
+      } else if (compression_type_str.rfind("PROC(", 0) == 0) {
+        info.is_procedural = true;
+        info.is_gpu_procedural = false;
         size_t open_paren = compression_type_str.find('(');
         size_t close_paren = compression_type_str.rfind(')');
         if (open_paren == std::string::npos ||
@@ -500,12 +556,13 @@ int main(int argc, char* argv[]) {
   for (const auto& info : asset_build_infos) {
     fprintf(assets_data_cc_file, "    { ");
     if (info.is_procedural) {
-      fprintf(assets_data_cc_file, "nullptr, 0, true, %s, %s, %zu",
+      fprintf(assets_data_cc_file, "nullptr, 0, true, %s, %s, %s, %zu",
+              info.is_gpu_procedural ? "true" : "false",
               info.func_name_str_name.c_str(), info.params_array_name.c_str(),
               info.proc_params.size());
     } else {
       fprintf(assets_data_cc_file,
-              "%s, ASSET_SIZE_%s, false, nullptr, nullptr, 0",
+              "%s, ASSET_SIZE_%s, false, false, nullptr, nullptr, 0",
               info.data_array_name.c_str(), info.name.c_str());
     }
     fprintf(assets_data_cc_file, " },\n");
diff --git a/tools/timeline_editor/index.html b/tools/timeline_editor/index.html
index 074b711..db71beb 100644
--- a/tools/timeline_editor/index.html
+++ b/tools/timeline_editor/index.html
@@ -601,7 +601,11 @@
                     const modifier = effect.priorityModifier || '+';
                     output += `  EFFECT ${modifier} ${effect.className} ${effect.startTime.toFixed(2)} ${effect.endTime.toFixed(2)}`;
                     if (effect.args) {
-                        output += ` ${effect.args}`;
+                        // Strip priority comments from args
+                        const cleanArgs = effect.args.replace(/\s*#\s*Priority:\s*\d+/i, '').trim();
+                        if (cleanArgs) {
+                            output += ` ${cleanArgs}`;
+                        }
                     }
                     output += '\n';
                 }
diff --git a/tools/validate_uniforms.py b/tools/validate_uniforms.py
new file mode 100644
index 0000000..40d1b0f
--- /dev/null
+++ b/tools/validate_uniforms.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+
+import sys
+import re
+import os
+
+# WGSL alignment in bytes for the supported types (simplified; common types only).
+WGSL_ALIGNMENT = {
+    "f32": 4,
+    "vec2<f32>": 8,
+    "vec3<f32>": 16,
+    "vec4<f32>": 16,
+    # Add other types as needed (e.g., u32, i32, mat4x4<f32>)
+}
+
+def get_wgsl_type_size_and_alignment(type_name):
+    """Return (size, alignment) in bytes for a WGSL type, or (0, 0) if the type is unknown."""
+    type_name = type_name.strip()
+    if type_name in WGSL_ALIGNMENT:
+        alignment = WGSL_ALIGNMENT[type_name]
+        # Size equals alignment except where overridden here: vec3<f32> is 12 bytes, 16-byte aligned.
+        return {"vec3<f32>": 12}.get(type_name, alignment), alignment
+    # Arrays, e.g. array<f32, 5>: simplified to the element type (count and stride are ignored).
+    if type_name.startswith("array"):
+        match = re.search(r"array<([\w<>, ]+)>", type_name)
+        if match:
+            return get_wgsl_type_size_and_alignment(match.group(1).split(",")[0])
+    return 0, 0 # Unknown or complex type
+
+def parse_wgsl_struct(wgsl_content):
+    """Parse WGSL struct definitions from a source string.
+
+    Returns a dict: struct name -> list of (member_name, member_type) in declaration order.
+    """
+    structs = {}
+    struct_matches = re.finditer(r"struct\s+(\w+)\s*\{\s*(.*?)\s*\}", wgsl_content, re.DOTALL)
+    for struct_match in struct_matches:
+        struct_name = struct_match.group(1)
+        # Strip // comments up front so text such as "// scale: meters" is not read as a member.
+        members_content = re.sub(r"//[^\n]*", "", struct_match.group(2))
+        members = []
+        # Members: member_name: member_type. The type class uses [ \t], not \s, so it never spans lines.
+        member_matches = re.finditer(r"(\w+)\s*:\s*([\w<>,\[\] \t]+)", members_content)
+        for member_match in member_matches:
+            member_type = member_match.group(2).strip()
+            if member_type.endswith(','):
+                member_type = member_type[:-1].strip()  # drop the trailing separator comma
+            members.append((member_match.group(1), member_type))
+        structs[struct_name] = members
+    return structs
+
+def find_embedded_wgsl_in_cpp(cpp_content):
+    """Return the bodies of C++ raw string literals, which often contain WGSL."""
+    # Matches both R"(...)" and delimited R"tag(...)tag" literals: group 1 is the
+    # optional delimiter (at most 16 chars, no parens/backslash/whitespace/quote) and
+    # the backreference requires the closing delimiter to equal the opening one.
+    raw_literal = re.compile(r'R"([^()\\\s"]{0,16})\((.*?)\)\1"', re.DOTALL)
+    return [match.group(2) for match in raw_literal.finditer(cpp_content)]
+
+def calculate_wgsl_struct_size(struct_name, struct_members):
+    """Compute (total_size, max_alignment) for a list of (member_name, member_type) pairs.
+
+    Returns (0, 0) as soon as any member cannot be sized; struct_name is not used.
+    """
+    def align_up(offset, alignment):
+        # Round offset up to a multiple of alignment (mask trick; needs a power-of-two alignment).
+        return (offset + alignment - 1) & ~(alignment - 1)
+
+    layout = []
+    for _, member_type in struct_members:
+        size, alignment = get_wgsl_type_size_and_alignment(member_type)
+        if size == 0:
+            return 0, 0
+        layout.append((size, alignment))
+
+    # Place each member at its aligned offset; the cursor ends just past the last member.
+    cursor = 0
+    for size, alignment in layout:
+        cursor = align_up(cursor, alignment) + size
+
+    # Pad the struct itself out to its strictest member alignment, if it has one.
+    max_alignment = max((alignment for _, alignment in layout), default=0)
+    if max_alignment > 0:
+        return align_up(cursor, max_alignment), max_alignment
+    return cursor, max_alignment
+
+def parse_cpp_static_asserts(cpp_content):
+    """Find C++ structs that are followed by a static_assert on their sizeof.
+
+    Returns {struct_name: {"members": [(member_name, member_type), ...], "expected_size": int}}.
+    The patterns are simplified and may need adjusting for more complex C++ code.
+    """
+    # struct Name { body }; ... static_assert(sizeof(Name) == N, ...);
+    struct_pattern = re.compile(r"struct\s+(\w+)\s*\{\s*(.*?)\s*\}\s*;.*?static_assert\(sizeof\(\1\)\s*==\s*(\d+)\s*,.*?\);", re.DOTALL | re.MULTILINE)
+    # type member_name;  (optionally with an "= value" or "{...}" initializer)
+    member_pattern = re.compile(r"(.*?)\s+(\w+)\s*(?:=\s*.*?|\s*\{.*?\})?;")
+
+    cpp_structs = {}
+    for name, body, size_text in struct_pattern.findall(cpp_content):
+        # Each member is stored as (member_name, member_type).
+        members = [(m.group(2).strip(), m.group(1).strip()) for m in member_pattern.finditer(body)]
+        cpp_structs[name] = {"members": members, "expected_size": int(size_text)}
+
+    return cpp_structs
+
+def validate_uniforms(wgsl_files, cpp_files):
+    """Compare WGSL struct sizes with the sizes asserted in C++ static_asserts.
+
+    WGSL structs come from the given .wgsl files and from raw string literals in
+    the C++ files; C++ structs are matched to them by name. A size mismatch is
+    printed to stderr and ends the process with exit status 1. Files that fail
+    to load or parse are reported to stderr and skipped.
+    """
+    all_wgsl_structs = {}
+
+    # Stand-alone WGSL files.
+    for file_path in wgsl_files:
+        try:
+            with open(file_path, 'r') as f:
+                all_wgsl_structs.update(parse_wgsl_struct(f.read()))
+        except Exception as e:
+            print(f"Error parsing WGSL file {file_path}: {e}", file=sys.stderr)
+
+    # C++ files: embedded WGSL first, then the structs guarded by static_asserts.
+    for cpp_file_path in cpp_files:
+        try:
+            with open(cpp_file_path, 'r') as f:
+                cpp_content = f.read()
+
+            # Same-named structs parsed later replace earlier entries.
+            for block in find_embedded_wgsl_in_cpp(cpp_content):
+                all_wgsl_structs.update(parse_wgsl_struct(block))
+
+            for struct_name, data in parse_cpp_static_asserts(cpp_content).items():
+                expected_size = data["expected_size"]
+                wgsl_members = all_wgsl_structs.get(struct_name)
+                if wgsl_members is None:
+                    print(f"Validation Warning for '{struct_name}': Matching WGSL struct not found.")
+                    continue
+
+                calculated_wgsl_size, wgsl_max_alignment = calculate_wgsl_struct_size(struct_name, wgsl_members)
+                if calculated_wgsl_size == 0:
+                    # The WGSL size could not be calculated; skip without reporting.
+                    continue
+
+                if calculated_wgsl_size == expected_size:
+                    print(f"Validation OK for '{struct_name}': Size {calculated_wgsl_size} matches C++ expected size.")
+                    continue
+
+                # SystemExit is not an Exception subclass, so the handler below lets it through.
+                print(f"Validation Mismatch for '{struct_name}':\n  WGSL Calculated Size: {calculated_wgsl_size}\n  C++ Expected Size: {expected_size}\n  Max WGSL Alignment: {wgsl_max_alignment}", file=sys.stderr)
+                sys.exit(1)
+        except Exception as e:
+            print(f"Error processing C++ file {cpp_file_path}: {e}", file=sys.stderr)
+
+def main():
+    """Entry point: argv[1] is a .wgsl file or a directory walked for .wgsl files; argv[2:] are C++ files."""
+    if len(sys.argv) < 3:
+        print("Usage: validate_uniforms.py <wgsl_dir_or_file> <cpp_file1> [<cpp_file2> ...]", file=sys.stderr)
+        sys.exit(1)
+
+    wgsl_input = sys.argv[1]
+    cpp_files = sys.argv[2:]
+    wgsl_files = []
+    if os.path.isfile(wgsl_input):
+        wgsl_files.append(wgsl_input)
+    elif os.path.isdir(wgsl_input):
+        for root, _, files in os.walk(wgsl_input):
+            wgsl_files.extend(os.path.join(root, name) for name in files if name.endswith(".wgsl"))
+    else:
+        # Still proceed (C++ files may embed WGSL), but do not hide a mistyped path.
+        print(f"Warning: WGSL input '{wgsl_input}' is not a file or directory.", file=sys.stderr)
+
+    # An empty wgsl_files list is acceptable: the C++ files may contain embedded WGSL.
+    validate_uniforms(wgsl_files, cpp_files)
+
+if __name__ == "__main__":
+    main()
+\ No newline at end of file