38 files changed, 2595 insertions, 578 deletions
diff --git a/CLAUDE.md b/CLAUDE.md
index a52dfce..714339b 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -11,6 +11,7 @@
 @doc/HOWTO.md
 @doc/CONTRIBUTING.md
 @doc/AI_RULES.md
+@doc/EFFECT_WORKFLOW.md
 
 # ============================================
 # TIER 3: DESIGN DOCS (Load On-Demand)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 97b371a..48a46e4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -149,6 +149,8 @@ if (DEMO_HEADLESS)
         src/gpu/effects/particle_spray_effect.cc
         src/gpu/effects/gaussian_blur_effect.cc
         src/gpu/effects/solarize_effect.cc
+# Disabled: src/gpu/effects/cube_sphere_effect.cc (incomplete conversion)
+        src/gpu/effects/scene1_effect.cc
         src/gpu/effects/chroma_aberration_effect.cc
         src/gpu/effects/vignette_effect.cc
         src/gpu/effects/cnn_effect.cc
@@ -179,6 +181,8 @@ else()
         src/gpu/effects/particle_spray_effect.cc
         src/gpu/effects/gaussian_blur_effect.cc
         src/gpu/effects/solarize_effect.cc
+# Disabled: src/gpu/effects/cube_sphere_effect.cc (incomplete conversion)
+        src/gpu/effects/scene1_effect.cc
         src/gpu/effects/chroma_aberration_effect.cc
         src/gpu/effects/vignette_effect.cc
         src/gpu/effects/cnn_effect.cc
diff --git a/README.md b/README.md
index f7d78de..ac723db 100644
--- a/README.md
+++ b/README.md
@@ -16,5 +16,6 @@ cmake --build build -j4
 - **TODO.md** - Active tasks and priorities
 - **doc/HOWTO.md** - Common operations (building, testing, assets)
 - **doc/CONTRIBUTING.md** - Development guidelines and protocols
+- **doc/EFFECT_WORKFLOW.md** - Step-by-step guide for adding visual effects
 
 See `doc/` for detailed technical documentation.
diff --git a/doc/AI_RULES.md b/doc/AI_RULES.md
index d18a0cc..1a4ee78 100644
--- a/doc/AI_RULES.md
+++ b/doc/AI_RULES.md
@@ -5,3 +5,22 @@
 - Prefer small, reviewable commits
 - All `cmake --build` commands must use the `-j4` option for parallel building.
 - after a task, a 'big' final commit should contain a short handoff tag like "handoff(Gemini):..." if you're gemini-cli, or "handoff(Claude): ..." if you're claude-code.
+
+## Adding Visual Effects
+
+**IMPORTANT:** When adding new visual effects, follow the complete workflow in `doc/EFFECT_WORKFLOW.md`.
+
+**Required steps (must complete ALL):**
+1. Create effect files (.h, .cc, .wgsl)
+2. Add shader to `workspaces/main/assets.txt`
+3. Add `.cc` to CMakeLists.txt GPU_SOURCES (BOTH sections: headless and normal)
+4. Include header in `src/gpu/demo_effects.h`
+5. Add to timeline with `EFFECT +` (priority modifier is REQUIRED)
+6. Add to test list in `src/tests/gpu/test_demo_effects.cc`
+7. Build and verify: `cmake --build build -j4 && cd build && ./test_demo_effects`
+
+**Common mistakes to avoid:**
+- Missing priority modifier in timeline (`EFFECT` must be `EFFECT +`, `EFFECT =`, or `EFFECT -`)
+- Adding `.cc` to only one CMakeLists.txt section (need BOTH headless and normal)
+- Wrong asset ID (check assets.txt entry name → `ASSET_SHADER_<NAME>`)
+- Forgetting to add to test file
diff --git a/doc/CNN_EFFECT.md b/doc/CNN_EFFECT.md
index ae0f38a..4659fd3 100644
--- a/doc/CNN_EFFECT.md
+++ b/doc/CNN_EFFECT.md
@@ -21,27 +21,46 @@ Trainable convolutional neural network layers for artistic stylization (painterl
 
 ## Architecture
 
-### Coordinate-Aware Layer 0
+### RGBD → Grayscale Pipeline
 
-Layer 0 accepts normalized (x,y) patch center coordinates alongside RGBA samples:
+**Input:** RGBD (RGB + inverse depth D=1/z)
+**Output:** Grayscale (1 channel)
+**Layer Input:** 7 channels = [RGBD, UV coords, grayscale] all normalized to [-1,1]
+
+**Architecture:**
+- **Inner layers (0..N-2):** Conv2d(7→4) - output RGBD
+- **Final layer (N-1):** Conv2d(7→1) - output grayscale
 
 ```wgsl
-fn cnn_conv3x3_with_coord(
+// Inner layers: 7→4 (RGBD output)
+fn cnn_conv3x3_7to4(
   tex: texture_2d<f32>,
   samp: sampler,
-  uv: vec2<f32>,                          # Center position [0,1]
+  uv: vec2<f32>,
   resolution: vec2<f32>,
-  rgba_weights: array<mat4x4<f32>, 9>,    # 9 samples × 4×4 matrix
-  coord_weights: mat2x4<f32>,             # 2 coords → 4 outputs
-  bias: vec4<f32>
+  original: vec4<f32>,                     // Original RGBD [-1,1]
+  weights: array<array<f32, 8>, 36>       // 9 pos × 4 out × (7 weights + bias)
 ) -> vec4<f32>
-```
 
-**Input structure:** 9 RGBA samples (36 values) + 1 xy coordinate (2 values) = 38 inputs → 4 outputs
+// Final layer: 7→1 (grayscale output)
+fn cnn_conv3x3_7to1(
+  tex: texture_2d<f32>,
+  samp: sampler,
+  uv: vec2<f32>,
+  resolution: vec2<f32>,
+  original: vec4<f32>,
+  weights: array<array<f32, 8>, 9>        // 9 pos × (7 weights + bias)
+) -> f32
+```
 
-**Size impact:** +32B coord weights, kernel-agnostic
+**Input normalization:**
+- **fs_main** normalizes textures once: `(tex - 0.5) * 2` → [-1,1]
+- **Conv functions** normalize UV coords: `(uv - 0.5) * 2` → [-1,1]
+- **Grayscale** computed from normalized RGBD: `0.2126*R + 0.7152*G + 0.0722*B`
+- **Inter-layer data** stays in [-1,1] (no denormalization)
+- **Final output** denormalized for display: `(result + 1.0) * 0.5` → [0,1]
 
-**Use cases:** Position-dependent stylization (vignettes, corner darkening, radial gradients)
+**Activation:** tanh for inner layers (output stays [-1,1]), none for final layer
 
 ### Multi-Layer Architecture
 
@@ -80,18 +99,15 @@ workspaces/main/shaders/cnn/
 ### 1. Prepare Training Data
 
 Collect input/target image pairs:
-- **Input:** Raw 3D render
-- **Target:** Artistic style (hand-painted, filtered, stylized)
+- **Input:** RGBA (RGB + depth as alpha channel, D=1/z)
+- **Target:** Grayscale stylized output
 
 ```bash
-training/input/img_000.png   # Raw render
-training/output/img_000.png  # Stylized target
+training/input/img_000.png   # RGBA render (RGB + depth)
+training/output/img_000.png  # Grayscale target
 ```
 
-Use `image_style_processor.py` to generate targets:
-```bash
-python3 training/image_style_processor.py input/ output/ pencil_sketch
-```
+**Note:** Input images must be RGBA where alpha = inverse depth (1/z)
 
 ### 2. Train Network
 
@@ -135,6 +151,14 @@ python3 training/train_cnn.py \
   --output workspaces/main/shaders/cnn/cnn_weights_generated.wgsl
 ```
 
+**Generate ground truth (for shader validation):**
+```bash
+python3 training/train_cnn.py \
+  --infer training/input/img_000.png \
+  --export-only training/checkpoints/checkpoint_epoch_200.pth \
+  --output training/ground_truth.png
+```
+
 ### 3. Rebuild Demo
 
 Training script auto-generates both `cnn_weights_generated.wgsl` and `cnn_layer.wgsl`:
@@ -245,20 +269,25 @@ Expands to:
 
 **Weight Storage:**
 
-**Layer 0 (coordinate-aware):**
+**Inner layers (7→4 RGBD output):**
 ```wgsl
-const rgba_weights_layer0: array<mat4x4<f32>, 9> = array(...);
-const coord_weights_layer0 = mat2x4<f32>(
-  0.1, -0.2, 0.0, 0.0,  # x-coord weights
-  -0.1, 0.0, 0.2, 0.0   # y-coord weights
+// Structure: array<array<f32, 8>, 36>
+// 9 positions × 4 output channels, each with 7 weights + bias
+const weights_layer0: array<array<f32, 8>, 36> = array(
+  array<f32, 8>(w0_r, w0_g, w0_b, w0_d, w0_u, w0_v, w0_gray, bias0),  // pos0_ch0
+  array<f32, 8>(w1_r, w1_g, w1_b, w1_d, w1_u, w1_v, w1_gray, bias1),  // pos0_ch1
+  // ... 34 more entries
 );
-const bias_layer0 = vec4<f32>(0.0, 0.0, 0.0, 0.0);
 ```
 
-**Layers 1+ (standard):**
+**Final layer (7→1 grayscale output):**
 ```wgsl
-const weights_layer1: array<mat4x4<f32>, 9> = array(...);
-const bias_layer1 = vec4<f32>(0.0, 0.0, 0.0, 0.0);
+// Structure: array<array<f32, 8>, 9>
+// 9 positions, each with 7 weights + bias
+const weights_layerN: array<array<f32, 8>, 9> = array(
+  array<f32, 8>(w0_r, w0_g, w0_b, w0_d, w0_u, w0_v, w0_gray, bias0),  // pos0
+  // ... 8 more entries
+);
 ```
 
 ---
diff --git a/doc/CNN_RGBD_GRAYSCALE_SUMMARY.md b/doc/CNN_RGBD_GRAYSCALE_SUMMARY.md
new file mode 100644
index 0000000..4c13693
--- /dev/null
+++ b/doc/CNN_RGBD_GRAYSCALE_SUMMARY.md
@@ -0,0 +1,134 @@
+# CNN RGBD→Grayscale Architecture Implementation
+
+## Summary
+
+Implemented CNN architecture upgrade: RGBD input → grayscale output with 7-channel augmented input.
+
+## Changes Made
+
+### Architecture
+
+**Input:** RGBD (4 channels: RGB + inverse depth D=1/z)
+**Output:** Grayscale (1 channel)
+**Layer Input:** 7 channels = [RGBD, UV coords, grayscale] all normalized to [-1,1]
+
+**Layer Configuration:**
+- Inner layers (0..N-2): Conv2d(7→4) - output RGBD with tanh activation
+- Final layer (N-1): Conv2d(7→1) - output grayscale, no activation
+
+### Input Normalization (all to [-1,1])
+
+- **RGBD:** `(rgbd - 0.5) * 2`
+- **UV coords:** `(uv - 0.5) * 2`
+- **Grayscale:** `(0.2126*R + 0.7152*G + 0.0722*B - 0.5) * 2`
+
+**Rationale:** Zero-centered inputs for tanh activation, better gradient flow.
+
+### Modified Files
+
+**Training (`training/train_cnn.py`):**
+1. Removed `CoordConv2d` class
+2. Updated `SimpleCNN`:
+   - Inner layers: `Conv2d(7, 4)` - RGBD output
+   - Final layer: `Conv2d(7, 1)` - grayscale output
+3. Updated `forward()`:
+   - Normalize RGBD/coords/gray to [-1,1]
+   - Concatenate 7-channel input for each layer
+   - Apply tanh (inner) or none (final)
+   - Denormalize final output
+4. Updated `export_weights_to_wgsl()`:
+   - Inner: `array<array<f32, 8>, 36>` (9 pos × 4 ch × 8 values)
+   - Final: `array<array<f32, 8>, 9>` (9 pos × 8 values)
+5. Updated `generate_layer_shader()`:
+   - Use `cnn_conv3x3_7to4` for inner layers
+   - Use `cnn_conv3x3_7to1` for final layer
+   - Denormalize outputs from [-1,1] to [0,1]
+6. Updated `ImagePairDataset`:
+   - Load RGBA input (was RGB)
+
+**Shaders (`workspaces/main/shaders/cnn/cnn_conv3x3.wgsl`):**
+1. Added `cnn_conv3x3_7to4()`:
+   - 7-channel input: [RGBD, uv_x, uv_y, gray]
+   - 4-channel output: RGBD
+   - Weights: `array<array<f32, 8>, 36>`
+2. Added `cnn_conv3x3_7to1()`:
+   - 7-channel input: [RGBD, uv_x, uv_y, gray]
+   - 1-channel output: grayscale
+   - Weights: `array<array<f32, 8>, 9>`
+
+**Documentation (`doc/CNN_EFFECT.md`):**
+1. Updated architecture section with RGBD→grayscale pipeline
+2. Updated training data requirements (RGBA input)
+3. Updated weight storage format
+
+### No C++ Changes
+
+CNNLayerParams and bind groups remain unchanged.
+
+## Data Flow
+
+1. Layer 0 captures original RGBD to `captured_frame`
+2. Each layer:
+   - Samples previous layer output (RGBD in [0,1])
+   - Normalizes RGBD to [-1,1]
+   - Computes UV coords and grayscale, normalizes to [-1,1]
+   - Concatenates 7-channel input
+   - Applies convolution with layer-specific weights
+   - Outputs RGBD (inner) or grayscale (final) in [-1,1]
+   - Applies tanh (inner only)
+   - Denormalizes to [0,1] for texture storage
+   - Blends with original
+
+## Next Steps
+
+1. **Prepare RGBD training data:**
+   - Input: RGBA images (RGB + depth in alpha)
+   - Target: Grayscale stylized output
+
+2. **Train network:**
+   ```bash
+   python3 training/train_cnn.py \
+     --input training/input \
+     --target training/output \
+     --layers 3 \
+     --epochs 1000
+   ```
+
+3. **Verify generated shaders:**
+   - Check `cnn_weights_generated.wgsl` structure
+   - Check `cnn_layer.wgsl` uses new conv functions
+
+4. **Test in demo:**
+   ```bash
+   cmake --build build -j4
+   ./build/demo64k
+   ```
+
+## Design Rationale
+
+**Why [-1,1] normalization?**
+- Centered inputs for tanh (operates best around 0)
+- Better gradient flow
+- Standard ML practice for normalized data
+
+**Why RGBD throughout vs RGB?**
+- Depth information propagates through network
+- Enables depth-aware stylization
+- Consistent 4-channel processing
+
+**Why 7-channel input?**
+- Coordinates: position-dependent effects (vignettes)
+- Grayscale: luminance-aware processing
+- RGBD: full color+depth information
+- Enables richer feature learning
+
+## Testing Checklist
+
+- [ ] Train network with RGBD input data
+- [ ] Verify `cnn_weights_generated.wgsl` structure
+- [ ] Verify `cnn_layer.wgsl` uses `7to4`/`7to1` functions
+- [ ] Build demo without errors
+- [ ] Visual test: inner layers show RGBD evolution
+- [ ] Visual test: final layer produces grayscale
+- [ ] Visual test: blending works correctly
+- [ ] Compare quality with previous RGB→RGB architecture
diff --git a/doc/COMPLETED.md b/doc/COMPLETED.md
index d1c89af..2336f62 100644
--- a/doc/COMPLETED.md
+++ b/doc/COMPLETED.md
@@ -29,6 +29,22 @@ Detailed historical documents have been moved to `doc/archive/` for reference:
 
 Use `read @doc/archive/FILENAME.md` to access archived documents.
 
+## Recently Completed (February 10, 2026)
+
+- [x] **WGPU Boilerplate Factorization**
+    - **Goal**: Reduce repetitive WGPU code via builder pattern helpers
+    - **Implementation**:
+      - Created `BindGroupLayoutBuilder` and `BindGroupBuilder` for declarative bind group creation
+      - Created `RenderPipelineBuilder` to simplify pipeline setup with ShaderComposer integration
+      - Created `SamplerCache` singleton to deduplicate sampler instances
+      - Refactored `post_process_helper.cc`, `cnn_effect.cc`, `rotating_cube_effect.cc`
+    - **Result**:
+      - Bind group creation: 19 instances reduced from 14→4 lines each
+      - Pipeline creation: 30-50 lines reduced to 8 lines
+      - Sampler deduplication: 6 instances → cached
+      - Total: -122 lines boilerplate, binary size unchanged (6.3M debug)
+      - Tests pass, prevents binding index errors
+
 ## Recently Completed (February 9, 2026)
 
 - [x] **External Library Size Measurement (Task #76)**
diff --git a/doc/CONTRIBUTING.md b/doc/CONTRIBUTING.md
index 9cd785b..98df873 100644
--- a/doc/CONTRIBUTING.md
+++ b/doc/CONTRIBUTING.md
@@ -65,12 +65,15 @@ See `doc/CODING_STYLE.md` for detailed examples.
 ## Development Protocols
 
 ### Adding Visual Effect
-1. Implement `Effect` subclass in `src/gpu/demo_effects.cc`
-2. Add to workspace `timeline.seq` (e.g., `workspaces/main/timeline.seq`)
-3. **Update `test_demo_effects.cc`**:
-   - Add to test list
-   - Increment `EXPECTED_*_COUNT`
-4. Verify:
+1. Create effect class files (use `tools/shadertoy/convert_shadertoy.py` or templates)
+2. Add shader to `workspaces/main/assets.txt`
+3. Add effect `.cc` file to `CMakeLists.txt` GPU_SOURCES (both sections)
+4. Include header in `src/gpu/demo_effects.h`
+5. Add to workspace `timeline.seq` (e.g., `workspaces/main/timeline.seq`)
+6. **Update `src/tests/gpu/test_demo_effects.cc`**:
+   - Add to `post_process_effects` list (lines 80-93) or `scene_effects` list (lines 125-137)
+   - Example: `{"MyEffect", std::make_shared<MyEffect>(fixture.ctx())},`
+7. Verify:
 ```bash
 cmake -S . -B build -DDEMO_BUILD_TESTS=ON
 cmake --build build -j4 --target test_demo_effects
diff --git a/doc/EFFECT_WORKFLOW.md b/doc/EFFECT_WORKFLOW.md
new file mode 100644
index 0000000..45c47b7
--- /dev/null
+++ b/doc/EFFECT_WORKFLOW.md
@@ -0,0 +1,228 @@
+# Effect Creation Workflow
+
+**Target Audience:** AI coding agents and developers
+
+Automated checklist for adding new visual effects to the demo.
+
+---
+
+## Quick Reference
+
+**For ShaderToy conversions:** Use `tools/shadertoy/convert_shadertoy.py` to generate the effect files (step 1), then follow steps 2-8 below.
+
+**For custom effects:** Follow all steps 1-8.
+
+---
+
+## Step-by-Step Workflow
+
+### 1. Create Effect Files
+
+**Location:**
+- Header: `src/gpu/effects/<effect_name>_effect.h`
+- Implementation: `src/gpu/effects/<effect_name>_effect.cc`
+- Shader: `workspaces/main/shaders/<effect_name>.wgsl`
+
+**Naming Convention:**
+- Class name: `<EffectName>Effect` (e.g., `TunnelEffect`, `PlasmaEffect`)
+- Files: `<effect_name>_effect.*` (snake_case)
+
+**Base Class:**
+- Post-process effects: inherit from `PostProcessEffect`
+- Scene effects: inherit from `Effect`
+
+**Template:** See `tools/shadertoy/template.*` or use `convert_shadertoy.py`
+
+### 2. Add Shader to Assets
+
+**File:** `workspaces/main/assets.txt`
+
+**Format:**
+```
+SHADER_<UPPER_SNAKE_NAME>, NONE, shaders/<effect_name>.wgsl, "Effect description"
+```
+
+**Example:**
+```
+SHADER_TUNNEL, NONE, shaders/tunnel.wgsl, "Tunnel effect shader"
+```
+
+**Asset ID:** Will be `AssetId::ASSET_SHADER_<UPPER_SNAKE_NAME>` in C++
+
+### 3. Add to CMakeLists.txt
+
+**File:** `CMakeLists.txt`
+
+**Action:** Add `src/gpu/effects/<effect_name>_effect.cc` to **BOTH** GPU_SOURCES sections:
+- Headless mode section (around line 141-167)
+- Normal mode section (around line 171-197)
+
+**Location:** After similar effects (post-process with post-process, scene with scene)
+
+**Example:**
+```cmake
+# In headless section (line ~152):
+        src/gpu/effects/solarize_effect.cc
+        src/gpu/effects/tunnel_effect.cc        # <-- Add here
+        src/gpu/effects/chroma_aberration_effect.cc
+
+# In normal section (line ~183):
+        src/gpu/effects/solarize_effect.cc
+        src/gpu/effects/tunnel_effect.cc        # <-- Add here
+        src/gpu/effects/chroma_aberration_effect.cc
+```
+
+### 4. Include in demo_effects.h
+
+**File:** `src/gpu/demo_effects.h`
+
+**Action:** Add include directive:
+```cpp
+#include "gpu/effects/<effect_name>_effect.h"
+```
+
+**Location:** Alphabetically with other effect includes
+
+### 5. Add to Timeline
+
+**File:** `workspaces/main/timeline.seq`
+
+**Format:**
+```
+SEQUENCE <start_time> <priority>
+  EFFECT <+|=|-> <EffectName>Effect <local_start> <local_end> [params...]
+```
+
+**Priority Modifiers (REQUIRED):**
+- `+` : Increment priority
+- `=` : Same priority as previous effect
+- `-` : Decrement priority (for backgrounds)
+
+**Example:**
+```
+SEQUENCE 0.0 0
+  EFFECT + TunnelEffect 0.0 10.0
+```
+
+**Common Mistake:** Missing priority modifier (`+`, `=`, `-`) after EFFECT keyword
+
+### 6. Update Tests
+
+**File:** `src/tests/gpu/test_demo_effects.cc`
+
+**Action:** Add effect to appropriate list:
+
+**Post-Process Effects (lines 80-93):**
+```cpp
+{"TunnelEffect", std::make_shared<TunnelEffect>(fixture.ctx())},
+```
+
+**Scene Effects (lines 125-137):**
+```cpp
+{"TunnelEffect", std::make_shared<TunnelEffect>(fixture.ctx())},
+```
+
+**3D Effects:** If the effect requires Renderer3D, add it to the `requires_3d` check (lines 148-151)
+
+### 7. Build and Test
+
+```bash
+# Full build
+cmake --build build -j4
+
+# Run effect tests
+cmake -S . -B build -DDEMO_BUILD_TESTS=ON
+cmake --build build -j4 --target test_demo_effects
+(cd build && ./test_demo_effects)   # subshell: stay in the repo root
+
+# Run all tests
+(cd build && ctest)
+```
+
+### 8. Verify
+
+**Checklist:**
+- [ ] Effect compiles without errors
+- [ ] Effect appears in timeline
+- [ ] test_demo_effects passes
+- [ ] Effect renders correctly: `./build/demo64k`
+- [ ] No shader compilation errors
+- [ ] Follows naming conventions
+
+---
+
+## Common Issues
+
+### Build Error: "no member named 'ASSET_SHADER_...'"
+
+**Cause:** Shader not in assets.txt or wrong asset ID name
+
+**Fix:**
+1. Check `workspaces/main/assets.txt` has shader entry
+2. Asset ID is `ASSET_` + uppercase entry name (e.g., `SHADER_TUNNEL` → `ASSET_SHADER_TUNNEL`)
+
+### Build Error: "undefined symbol for architecture"
+
+**Cause:** Effect not in CMakeLists.txt GPU_SOURCES
+
+**Fix:** Add `.cc` file to BOTH sections (headless and normal mode)
+
+### Timeline Parse Error: "Expected '+', '=', or '-'"
+
+**Cause:** Missing priority modifier after EFFECT keyword
+
+**Fix:** Use `EFFECT +`, `EFFECT =`, or `EFFECT -` (never just `EFFECT`)
+
+### Test Failure: Effect not in test list
+
+**Cause:** Effect not added to test_demo_effects.cc
+
+**Fix:** Add to `post_process_effects` or `scene_effects` list
+
+---
+
+## Automation Script Example
+
+```bash
+#!/bin/bash
+# Example automation for AI agents
+
+EFFECT_NAME="$1"  # CamelCase (e.g., "Tunnel")
+SNAKE_NAME=$(echo "$EFFECT_NAME" | sed 's/\([A-Z]\)/_\1/g' | sed 's/^_//' | tr '[:upper:]' '[:lower:]')
+UPPER_NAME=$(echo "$SNAKE_NAME" | tr '[:lower:]' '[:upper:]')
+
+echo "Creating effect: $EFFECT_NAME"
+echo "  Snake case: $SNAKE_NAME"
+echo "  Upper case: $UPPER_NAME"
+
+# 1. Generate files (if using ShaderToy)
+# ./tools/shadertoy/convert_shadertoy.py shader.txt "$EFFECT_NAME"
+
+# 2. Add to assets.txt
+echo "SHADER_${UPPER_NAME}, NONE, shaders/${SNAKE_NAME}.wgsl, \"${EFFECT_NAME} effect\"" \
+    >> workspaces/main/assets.txt
+
+# 3. Add to CMakeLists.txt (both sections)
+# Use Edit tool to add to both GPU_SOURCES sections
+
+# 4. Add include to demo_effects.h
+# Use Edit tool to add #include line
+
+# 5. Add to timeline.seq
+# Use Edit tool to add EFFECT line with priority modifier
+
+# 6. Add to test file
+# Use Edit tool to add to appropriate test list
+
+# 7. Build
+cmake --build build -j4
+```
+
+---
+
+## See Also
+
+- `tools/shadertoy/README.md` - ShaderToy conversion guide
+- `doc/SEQUENCE.md` - Timeline format documentation
+- `doc/CONTRIBUTING.md` - General contribution guidelines
+- `src/gpu/effects/` - Existing effect examples
diff --git a/doc/HOWTO.md b/doc/HOWTO.md
index bdc0214..5ea6afd 100644
--- a/doc/HOWTO.md
+++ b/doc/HOWTO.md
@@ -86,12 +86,34 @@ make run_util_tests     # Utility tests
 
 ---
 
+## Training
+
+```bash
+./training/train_cnn.py --layers 3 --kernel_sizes 3,5,3 --epochs 10000 --batch_size 8 --input training/input/ --target training/output/ --checkpoint-every 1000
+```
+
+Generate shaders from checkpoint:
+```bash
+./training/train_cnn.py --export-only training/checkpoints/checkpoint_epoch_7000.pth
+```
+
+Generate ground truth (for shader validation):
+```bash
+./training/train_cnn.py --infer input.png --export-only checkpoints/checkpoint_epoch_7000.pth --output ground_truth.png
+```
+
+**Note:** Kernel sizes must match shader functions:
+- 3×3 kernel → `cnn_conv3x3_7to4` (36 weight entries of 8 floats each: 9 pos × 4 channels)
+- 5×5 kernel → `cnn_conv5x5_7to4` (100 weight entries: 25 pos × 4 channels)
+
+---
+
 ## Timeline
 
 Edit `workspaces/main/timeline.seq`:
 ```text
 SEQUENCE 0.0 0
-  EFFECT HeptagonEffect 0.0 60.0 0
+  EFFECT + HeptagonEffect 0.0 60.0 0
 ```
 Rebuild to apply. See `doc/SEQUENCE.md`.
 
diff --git a/doc/RECIPE.md b/doc/RECIPE.md
index 6404391..d563027 100644
--- a/doc/RECIPE.md
+++ b/doc/RECIPE.md
@@ -157,8 +157,8 @@ void MyEffect::render(WGPUTextureView prev, WGPUTextureView target,
 
 **.seq syntax:**
 ```
-EFFECT MyEffect 0.0 10.0 strength=0.5 speed=3.0
-EFFECT MyEffect 10.0 20.0 strength=2.0  # speed keeps previous value
+EFFECT + MyEffect 0.0 10.0 strength=0.5 speed=3.0
+EFFECT = MyEffect 10.0 20.0 strength=2.0  # speed keeps previous value
 ```
 
 **Example:** `src/gpu/effects/flash_effect.cc`, `src/gpu/effects/chroma_aberration_effect.cc`
diff --git a/src/gpu/bind_group_builder.h b/src/gpu/bind_group_builder.h
new file mode 100644
index 0000000..d63f6e2
--- /dev/null
+++ b/src/gpu/bind_group_builder.h
@@ -0,0 +1,111 @@
+// WGPU bind group builder - reduces boilerplate for bind group creation
+#pragma once
+#include <vector>
+
+// Forward declarations (users must include gpu.h)
+struct WGPUBindGroupLayoutEntry;
+struct WGPUBindGroupEntry;
+struct WGPUDeviceImpl;
+typedef struct WGPUDeviceImpl* WGPUDevice;
+struct WGPUBindGroupLayoutImpl;
+typedef struct WGPUBindGroupLayoutImpl* WGPUBindGroupLayout;
+struct WGPUBindGroupImpl;
+typedef struct WGPUBindGroupImpl* WGPUBindGroup;
+struct WGPUBufferImpl;
+typedef struct WGPUBufferImpl* WGPUBuffer;
+struct WGPUTextureViewImpl;
+typedef struct WGPUTextureViewImpl* WGPUTextureView;
+struct WGPUSamplerImpl;
+typedef struct WGPUSamplerImpl* WGPUSampler;
+typedef uint32_t WGPUShaderStageFlags;
+
+#include "platform/platform.h"
+
+class BindGroupLayoutBuilder {  // Fluent builder: each call appends one layout entry, build() creates the WGPUBindGroupLayout
+  std::vector<WGPUBindGroupLayoutEntry> entries_;  // entries in the order the builder methods were called
+
+public:
+  BindGroupLayoutBuilder& uniform(uint32_t binding, WGPUShaderStageFlags vis, size_t min_size = 0) {  // uniform-buffer entry
+    WGPUBindGroupLayoutEntry e{};  // value-initialized: every field not assigned below stays zero
+    e.binding = binding;
+    e.visibility = vis;
+    e.buffer.type = WGPUBufferBindingType_Uniform;
+    if (min_size) e.buffer.minBindingSize = min_size;  // min_size == 0 leaves minBindingSize at its zero-initialized value
+    entries_.push_back(e);
+    return *this;  // returned by reference so calls can be chained
+  }
+
+  BindGroupLayoutBuilder& storage(uint32_t binding, WGPUShaderStageFlags vis, size_t min_size = 0) {  // storage-buffer entry
+    WGPUBindGroupLayoutEntry e{};
+    e.binding = binding;
+    e.visibility = vis;
+    e.buffer.type = WGPUBufferBindingType_ReadOnlyStorage;  // always read-only; this builder has no writable-storage variant
+    if (min_size) e.buffer.minBindingSize = min_size;
+    entries_.push_back(e);
+    return *this;
+  }
+
+  BindGroupLayoutBuilder& texture(uint32_t binding, WGPUShaderStageFlags vis) {  // sampled-texture entry
+    WGPUBindGroupLayoutEntry e{};
+    e.binding = binding;
+    e.visibility = vis;
+    e.texture.sampleType = WGPUTextureSampleType_Float;  // sample type and view dimension are fixed: float, 2D
+    e.texture.viewDimension = WGPUTextureViewDimension_2D;
+    entries_.push_back(e);
+    return *this;
+  }
+
+  BindGroupLayoutBuilder& sampler(uint32_t binding, WGPUShaderStageFlags vis) {  // sampler entry
+    WGPUBindGroupLayoutEntry e{};
+    e.binding = binding;
+    e.visibility = vis;
+    e.sampler.type = WGPUSamplerBindingType_Filtering;  // sampler binding type is fixed to filtering
+    entries_.push_back(e);
+    return *this;
+  }
+
+  WGPUBindGroupLayout build(WGPUDevice device) {  // creates the layout on `device` from all entries added so far
+    WGPUBindGroupLayoutDescriptor desc{};
+    desc.entryCount = entries_.size();
+    desc.entries = entries_.data();  // points at the vector's storage; entries_ is not cleared afterwards
+    return wgpuDeviceCreateBindGroupLayout(device, &desc);  // returned handle is not released by this class
+  }
+};
+
+class BindGroupBuilder {  // Fluent builder: each call appends one resource entry, build() creates the WGPUBindGroup
+  std::vector<WGPUBindGroupEntry> entries_;  // entries in the order the builder methods were called
+
+public:
+  BindGroupBuilder& buffer(uint32_t binding, WGPUBuffer buf, size_t size) {  // binds `size` bytes of `buf` at `binding`
+    WGPUBindGroupEntry e{};  // value-initialized: every field not assigned below stays zero
+    e.binding = binding;
+    e.buffer = buf;
+    e.size = size;  // only buffer and size are set here; no other field is assigned
+    entries_.push_back(e);
+    return *this;  // returned by reference so calls can be chained
+  }
+
+  BindGroupBuilder& texture(uint32_t binding, WGPUTextureView view) {  // binds a texture view at `binding`
+    WGPUBindGroupEntry e{};
+    e.binding = binding;
+    e.textureView = view;
+    entries_.push_back(e);
+    return *this;
+  }
+
+  BindGroupBuilder& sampler(uint32_t binding, WGPUSampler samp) {  // binds a sampler at `binding`
+    WGPUBindGroupEntry e{};
+    e.binding = binding;
+    e.sampler = samp;
+    entries_.push_back(e);
+    return *this;
+  }
+
+  WGPUBindGroup build(WGPUDevice device, WGPUBindGroupLayout layout) {  // creates the bind group for `layout` from all entries added so far
+    WGPUBindGroupDescriptor desc{};
+    desc.layout = layout;  // `layout` is used as passed in; it is not released here
+    desc.entryCount = entries_.size();
+    desc.entries = entries_.data();  // points at the vector's storage; entries_ is not cleared afterwards
+    return wgpuDeviceCreateBindGroup(device, &desc);  // returned handle is not released by this class
+  }
+};
diff --git a/src/gpu/demo_effects.h b/src/gpu/demo_effects.h
index 72b3f65..1ccf930 100644
--- a/src/gpu/demo_effects.h
+++ b/src/gpu/demo_effects.h
@@ -15,6 +15,7 @@
 #include "gpu/effects/theme_modulation_effect.h" // ThemeModulationEffect with full definition
 #include "gpu/effects/hybrid_3d_effect.h"
 #include "gpu/effects/flash_cube_effect.h"
+#include "gpu/effects/scene1_effect.h"
 #include "gpu/gpu.h"
 #include "gpu/texture_manager.h"
 #include "gpu/uniform_helper.h"
diff --git a/src/gpu/effects/cnn_effect.cc b/src/gpu/effects/cnn_effect.cc
index 7107bea..d74187c 100644
--- a/src/gpu/effects/cnn_effect.cc
+++ b/src/gpu/effects/cnn_effect.cc
@@ -6,70 +6,30 @@
 #include "gpu/effects/shaders.h"
 #include "gpu/effects/shader_composer.h"
 #include "gpu/effect.h"
+#include "gpu/bind_group_builder.h"
+#include "gpu/sampler_cache.h"
+#include "gpu/pipeline_builder.h"
 
 // Create custom pipeline with 5 bindings (includes original texture)
 static WGPURenderPipeline create_cnn_pipeline(WGPUDevice device,
                                                WGPUTextureFormat format,
                                                const char* shader_code) {
-  std::string composed_shader = ShaderComposer::Get().Compose({}, shader_code);
+  WGPUBindGroupLayout bgl = BindGroupLayoutBuilder()
+    .sampler(0, WGPUShaderStage_Fragment)
+    .texture(1, WGPUShaderStage_Fragment)
+    .uniform(2, WGPUShaderStage_Vertex | WGPUShaderStage_Fragment)
+    .uniform(3, WGPUShaderStage_Fragment)
+    .texture(4, WGPUShaderStage_Fragment)
+    .build(device);
 
-  WGPUShaderModuleDescriptor shader_desc = {};
-  WGPUShaderSourceWGSL wgsl_src = {};
-  wgsl_src.chain.sType = WGPUSType_ShaderSourceWGSL;
-  wgsl_src.code = str_view(composed_shader.c_str());
-  shader_desc.nextInChain = &wgsl_src.chain;
-  WGPUShaderModule shader_module =
-      wgpuDeviceCreateShaderModule(device, &shader_desc);
+  WGPURenderPipeline pipeline = RenderPipelineBuilder(device)
+    .shader(shader_code)
+    .bind_group_layout(bgl)
+    .format(format)
+    .build();
 
-  WGPUBindGroupLayoutEntry bgl_entries[5] = {};
-  bgl_entries[0].binding = 0; // sampler
-  bgl_entries[0].visibility = WGPUShaderStage_Fragment;
-  bgl_entries[0].sampler.type = WGPUSamplerBindingType_Filtering;
-  bgl_entries[1].binding = 1; // input texture
-  bgl_entries[1].visibility = WGPUShaderStage_Fragment;
-  bgl_entries[1].texture.sampleType = WGPUTextureSampleType_Float;
-  bgl_entries[1].texture.viewDimension = WGPUTextureViewDimension_2D;
-  bgl_entries[2].binding = 2; // uniforms
-  bgl_entries[2].visibility = WGPUShaderStage_Vertex | WGPUShaderStage_Fragment;
-  bgl_entries[2].buffer.type = WGPUBufferBindingType_Uniform;
-  bgl_entries[3].binding = 3; // effect params
-  bgl_entries[3].visibility = WGPUShaderStage_Fragment;
-  bgl_entries[3].buffer.type = WGPUBufferBindingType_Uniform;
-  bgl_entries[4].binding = 4; // original texture
-  bgl_entries[4].visibility = WGPUShaderStage_Fragment;
-  bgl_entries[4].texture.sampleType = WGPUTextureSampleType_Float;
-  bgl_entries[4].texture.viewDimension = WGPUTextureViewDimension_2D;
-
-  WGPUBindGroupLayoutDescriptor bgl_desc = {};
-  bgl_desc.entryCount = 5;
-  bgl_desc.entries = bgl_entries;
-  WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc);
-
-  WGPUPipelineLayoutDescriptor pl_desc = {};
-  pl_desc.bindGroupLayoutCount = 1;
-  pl_desc.bindGroupLayouts = &bgl;
-  WGPUPipelineLayout pl = wgpuDeviceCreatePipelineLayout(device, &pl_desc);
-
-  WGPUColorTargetState color_target = {};
-  color_target.format = format;
-  color_target.writeMask = WGPUColorWriteMask_All;
-
-  WGPUFragmentState fragment_state = {};
-  fragment_state.module = shader_module;
-  fragment_state.entryPoint = str_view("fs_main");
-  fragment_state.targetCount = 1;
-  fragment_state.targets = &color_target;
-
-  WGPURenderPipelineDescriptor pipeline_desc = {};
-  pipeline_desc.layout = pl;
-  pipeline_desc.vertex.module = shader_module;
-  pipeline_desc.vertex.entryPoint = str_view("vs_main");
-  pipeline_desc.fragment = &fragment_state;
-  pipeline_desc.primitive.topology = WGPUPrimitiveTopology_TriangleList;
-  pipeline_desc.multisample.count = 1;
-  pipeline_desc.multisample.mask = 0xFFFFFFFF;
-
-  return wgpuDeviceCreateRenderPipeline(device, &pipeline_desc);
+  wgpuBindGroupLayoutRelease(bgl);
+  return pipeline;
 }
 
 CNNEffect::CNNEffect(const GpuContext& ctx)
@@ -137,29 +97,13 @@ void CNNEffect::update_bind_group(WGPUTextureView input_view) {
     wgpuBindGroupRelease(bind_group_);
 
   WGPUBindGroupLayout bgl = wgpuRenderPipelineGetBindGroupLayout(pipeline_, 0);
-  WGPUSamplerDescriptor sd = {};
-  sd.magFilter = WGPUFilterMode_Linear;
-  sd.minFilter = WGPUFilterMode_Linear;
-  sd.maxAnisotropy = 1;
-  WGPUSampler sampler = wgpuDeviceCreateSampler(ctx_.device, &sd);
-
-  WGPUBindGroupEntry bge[5] = {};
-  bge[0].binding = 0;
-  bge[0].sampler = sampler;
-  bge[1].binding = 1;
-  bge[1].textureView = input_view_;
-  bge[2].binding = 2;
-  bge[2].buffer = uniforms_.get().buffer;
-  bge[2].size = uniforms_.get().size;
-  bge[3].binding = 3;
-  bge[3].buffer = params_buffer_.get().buffer;
-  bge[3].size = params_buffer_.get().size;
-  bge[4].binding = 4;
-  bge[4].textureView = original_view_ ? original_view_ : input_view_;
+  WGPUSampler sampler = SamplerCache::Get().get_or_create(ctx_.device, SamplerCache::linear());
 
-  WGPUBindGroupDescriptor bgd = {};
-  bgd.layout = bgl;
-  bgd.entryCount = 5;
-  bgd.entries = bge;
-  bind_group_ = wgpuDeviceCreateBindGroup(ctx_.device, &bgd);
+  bind_group_ = BindGroupBuilder()
+    .sampler(0, sampler)
+    .texture(1, input_view_)
+    .buffer(2, uniforms_.get().buffer, uniforms_.get().size)
+    .buffer(3, params_buffer_.get().buffer, params_buffer_.get().size)
+    .texture(4, original_view_ ? original_view_ : input_view_)
+    .build(ctx_.device, bgl);
 }
diff --git a/src/gpu/effects/post_process_helper.cc b/src/gpu/effects/post_process_helper.cc
index e99467f..0c339c7 100644
--- a/src/gpu/effects/post_process_helper.cc
+++ b/src/gpu/effects/post_process_helper.cc
@@ -5,69 +5,30 @@
 #include "../demo_effects.h"
 #include "gpu/gpu.h"
 #include "gpu/effects/shader_composer.h"
+#include "gpu/bind_group_builder.h"
+#include "gpu/sampler_cache.h"
+#include "gpu/pipeline_builder.h"
 #include <cstring>
 
 // Helper to create a standard post-processing pipeline
 WGPURenderPipeline create_post_process_pipeline(WGPUDevice device,
                                                 WGPUTextureFormat format,
                                                 const char* shader_code) {
-  std::string composed_shader = ShaderComposer::Get().Compose({}, shader_code);
+  WGPUBindGroupLayout bgl = BindGroupLayoutBuilder()
+    .sampler(PP_BINDING_SAMPLER, WGPUShaderStage_Fragment)
+    .texture(PP_BINDING_TEXTURE, WGPUShaderStage_Fragment)
+    .uniform(PP_BINDING_UNIFORMS, WGPUShaderStage_Vertex | WGPUShaderStage_Fragment)
+    .uniform(PP_BINDING_EFFECT_PARAMS, WGPUShaderStage_Fragment)
+    .build(device);
 
-  WGPUShaderModuleDescriptor shader_desc = {};
-  WGPUShaderSourceWGSL wgsl_src = {};
-  wgsl_src.chain.sType = WGPUSType_ShaderSourceWGSL;
-  wgsl_src.code = str_view(composed_shader.c_str());
-  shader_desc.nextInChain = &wgsl_src.chain;
-  WGPUShaderModule shader_module =
-      wgpuDeviceCreateShaderModule(device, &shader_desc);
+  WGPURenderPipeline pipeline = RenderPipelineBuilder(device)
+    .shader(shader_code)
+    .bind_group_layout(bgl)
+    .format(format)
+    .build();
 
-  WGPUBindGroupLayoutEntry bgl_entries[4] = {};
-  bgl_entries[0].binding = PP_BINDING_SAMPLER;
-  bgl_entries[0].visibility = WGPUShaderStage_Fragment;
-  bgl_entries[0].sampler.type = WGPUSamplerBindingType_Filtering;
-  bgl_entries[1].binding = PP_BINDING_TEXTURE;
-  bgl_entries[1].visibility = WGPUShaderStage_Fragment;
-  bgl_entries[1].texture.sampleType = WGPUTextureSampleType_Float;
-  bgl_entries[1].texture.viewDimension = WGPUTextureViewDimension_2D;
-  bgl_entries[2].binding = PP_BINDING_UNIFORMS;
-  bgl_entries[2].visibility = WGPUShaderStage_Vertex | WGPUShaderStage_Fragment;
-  bgl_entries[2].buffer.type = WGPUBufferBindingType_Uniform;
-
-  // Add an entry for effect-specific parameters
-  bgl_entries[3].binding = PP_BINDING_EFFECT_PARAMS;
-  bgl_entries[3].visibility = WGPUShaderStage_Fragment;
-  bgl_entries[3].buffer.type = WGPUBufferBindingType_Uniform;
-
-  WGPUBindGroupLayoutDescriptor bgl_desc = {};
-  bgl_desc.entryCount = 4;
-  bgl_desc.entries = bgl_entries;
-  WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc);
-
-  WGPUPipelineLayoutDescriptor pl_desc = {};
-  pl_desc.bindGroupLayoutCount = 1;
-  pl_desc.bindGroupLayouts = &bgl;
-  WGPUPipelineLayout pl = wgpuDeviceCreatePipelineLayout(device, &pl_desc);
-
-  WGPUColorTargetState color_target = {};
-  color_target.format = format;
-  color_target.writeMask = WGPUColorWriteMask_All;
-
-  WGPUFragmentState fragment_state = {};
-  fragment_state.module = shader_module;
-  fragment_state.entryPoint = str_view("fs_main");
-  fragment_state.targetCount = 1;
-  fragment_state.targets = &color_target;
-
-  WGPURenderPipelineDescriptor pipeline_desc = {};
-  pipeline_desc.layout = pl;
-  pipeline_desc.vertex.module = shader_module;
-  pipeline_desc.vertex.entryPoint = str_view("vs_main");
-  pipeline_desc.fragment = &fragment_state;
-  pipeline_desc.primitive.topology = WGPUPrimitiveTopology_TriangleList;
-  pipeline_desc.multisample.count = 1;
-  pipeline_desc.multisample.mask = 0xFFFFFFFF;
-
-  return wgpuDeviceCreateRenderPipeline(device, &pipeline_desc);
+  wgpuBindGroupLayoutRelease(bgl);
+  return pipeline;
 }
 
 // --- PostProcess Implementation Helper ---
@@ -82,25 +43,16 @@ void pp_update_bind_group(WGPUDevice device, WGPURenderPipeline pipeline,
 
   if (*bind_group)
     wgpuBindGroupRelease(*bind_group);
+
   WGPUBindGroupLayout bgl = wgpuRenderPipelineGetBindGroupLayout(pipeline, 0);
-  WGPUSamplerDescriptor sd = {};
-  sd.magFilter = WGPUFilterMode_Linear;
-  sd.minFilter = WGPUFilterMode_Linear;
-  sd.maxAnisotropy = 1;
-  WGPUSampler sampler = wgpuDeviceCreateSampler(device, &sd);
-  WGPUBindGroupEntry bge[4] = {};
-  bge[0].binding = PP_BINDING_SAMPLER;
-  bge[0].sampler = sampler;
-  bge[1].binding = PP_BINDING_TEXTURE;
-  bge[1].textureView = input_view;
-  bge[2].binding = PP_BINDING_UNIFORMS;
-  bge[2].buffer = uniforms.buffer;
-  bge[2].size = uniforms.size;
-  bge[3].binding = PP_BINDING_EFFECT_PARAMS;
-  bge[3].buffer =
-      effect_params.buffer ? effect_params.buffer : g_dummy_buffer.buffer;
-  bge[3].size = effect_params.buffer ? effect_params.size : g_dummy_buffer.size;
-  WGPUBindGroupDescriptor bgd = {
-      .layout = bgl, .entryCount = 4, .entries = bge};
-  *bind_group = wgpuDeviceCreateBindGroup(device, &bgd);
+  // Clamp at the edges: linear() uses Repeat, which would wrap the screen borders.
+  WGPUSampler sampler = SamplerCache::Get().get_or_create(device, SamplerCache::clamp());
+  *bind_group = BindGroupBuilder()
+    .sampler(PP_BINDING_SAMPLER, sampler)
+    .texture(PP_BINDING_TEXTURE, input_view)
+    .buffer(PP_BINDING_UNIFORMS, uniforms.buffer, uniforms.size)
+    .buffer(PP_BINDING_EFFECT_PARAMS,
+            effect_params.buffer ? effect_params.buffer : g_dummy_buffer.buffer,
+            effect_params.buffer ? effect_params.size : g_dummy_buffer.size)
+    .build(device, bgl);
 }
diff --git a/src/gpu/effects/rotating_cube_effect.cc b/src/gpu/effects/rotating_cube_effect.cc
index 8d1f05a..da973e5 100644
--- a/src/gpu/effects/rotating_cube_effect.cc
+++ b/src/gpu/effects/rotating_cube_effect.cc
@@ -5,16 +5,14 @@
 #include "gpu/effects/rotating_cube_effect.h"
 #include "generated/assets.h"
 #include "gpu/effects/shader_composer.h"
+#include "gpu/sampler_cache.h"
 #include "util/asset_manager_utils.h"
 
 RotatingCubeEffect::RotatingCubeEffect(const GpuContext& ctx) : Effect(ctx) {
 }
 
 RotatingCubeEffect::~RotatingCubeEffect() {
-  if (mask_sampler_)
-    wgpuSamplerRelease(mask_sampler_);
-  if (noise_sampler_)
-    wgpuSamplerRelease(noise_sampler_);
+  // Samplers owned by SamplerCache - don't release
   if (noise_view_)
     wgpuTextureViewRelease(noise_view_);
   if (noise_texture_)
@@ -49,21 +47,8 @@ void RotatingCubeEffect::init(MainSequence* demo) {
   noise_texture_ = wgpuDeviceCreateTexture(ctx_.device, &tex_desc);
   noise_view_ = wgpuTextureCreateView(noise_texture_, nullptr);
 
-  WGPUSamplerDescriptor sampler_desc = {};
-  sampler_desc.addressModeU = WGPUAddressMode_Repeat;
-  sampler_desc.addressModeV = WGPUAddressMode_Repeat;
-  sampler_desc.magFilter = WGPUFilterMode_Linear;
-  sampler_desc.minFilter = WGPUFilterMode_Linear;
-  sampler_desc.maxAnisotropy = 1;
-  noise_sampler_ = wgpuDeviceCreateSampler(ctx_.device, &sampler_desc);
-
-  WGPUSamplerDescriptor mask_sampler_desc = {};
-  mask_sampler_desc.addressModeU = WGPUAddressMode_ClampToEdge;
-  mask_sampler_desc.addressModeV = WGPUAddressMode_ClampToEdge;
-  mask_sampler_desc.magFilter = WGPUFilterMode_Linear;
-  mask_sampler_desc.minFilter = WGPUFilterMode_Linear;
-  mask_sampler_desc.maxAnisotropy = 1;
-  mask_sampler_ = wgpuDeviceCreateSampler(ctx_.device, &mask_sampler_desc);
+  noise_sampler_ = SamplerCache::Get().get_or_create(ctx_.device, SamplerCache::linear());
+  mask_sampler_ = SamplerCache::Get().get_or_create(ctx_.device, SamplerCache::clamp());
 
   size_t shader_size;
   const char* shader_code =
diff --git a/src/gpu/effects/scene1_effect.cc b/src/gpu/effects/scene1_effect.cc
new file mode 100644
index 0000000..a6733b7
--- /dev/null
+++ b/src/gpu/effects/scene1_effect.cc
@@ -0,0 +1,28 @@
+// This file is part of the 64k demo project.
+// Scene1 effect - ShaderToy conversion (raymarching scene)
+
+#include "gpu/demo_effects.h"
+#include "gpu/gpu.h"
+
+Scene1Effect::Scene1Effect(const GpuContext& ctx) : Effect(ctx) {
+  ResourceBinding bindings[] = {{uniforms_.get(), WGPUBufferBindingType_Uniform}};
+  pass_ = gpu_create_render_pass(ctx_.device, ctx_.format, scene1_shader_wgsl,
+                                 bindings, 1);
+  pass_.vertex_count = 3;
+}
+
+void Scene1Effect::render(WGPURenderPassEncoder pass, float t, float b,
+                          float i, float a) {
+  CommonPostProcessUniforms u = {
+      .resolution = {(float)width_, (float)height_},
+      ._pad = {0.0f, 0.0f},
+      .aspect_ratio = a,
+      .time = t,
+      .beat = b,
+      .audio_intensity = i,
+  };
+  uniforms_.update(ctx_.queue, u);
+  wgpuRenderPassEncoderSetPipeline(pass, pass_.pipeline);
+  wgpuRenderPassEncoderSetBindGroup(pass, 0, pass_.bind_group, 0, nullptr);
+  wgpuRenderPassEncoderDraw(pass, pass_.vertex_count, 1, 0, 0);
+}
diff --git a/src/gpu/effects/scene1_effect.h b/src/gpu/effects/scene1_effect.h
new file mode 100644
index 0000000..dc5c747
--- /dev/null
+++ b/src/gpu/effects/scene1_effect.h
@@ -0,0 +1,19 @@
+// This file is part of the 64k demo project.
+// Scene1 effect - ShaderToy conversion (raymarching scene)
+
+#ifndef SCENE1_EFFECT_H_
+#define SCENE1_EFFECT_H_
+
+#include "gpu/effect.h"
+
+class Scene1Effect : public Effect {
+ public:
+  Scene1Effect(const GpuContext& ctx);
+  void render(WGPURenderPassEncoder pass, float time, float beat,
+              float intensity, float aspect_ratio) override;
+
+ private:
+  RenderPass pass_;
+};
+
+#endif /* SCENE1_EFFECT_H_ */
diff --git a/src/gpu/effects/shaders.cc b/src/gpu/effects/shaders.cc
index 6559bf5..5f78298 100644
--- a/src/gpu/effects/shaders.cc
+++ b/src/gpu/effects/shaders.cc
@@ -98,6 +98,10 @@ const char* solarize_shader_wgsl =
 
     SafeGetAsset(AssetId::ASSET_SHADER_SOLARIZE);
 
+const char* scene1_shader_wgsl =
+
+    SafeGetAsset(AssetId::ASSET_SHADER_SCENE1);
+
 const char* distort_shader_wgsl =
 
     SafeGetAsset(AssetId::ASSET_SHADER_DISTORT);
diff --git a/src/gpu/effects/shaders.h b/src/gpu/effects/shaders.h
index 7acc2a6..03fa48c 100644
--- a/src/gpu/effects/shaders.h
+++ b/src/gpu/effects/shaders.h
@@ -15,6 +15,7 @@ extern const char* ellipse_shader_wgsl;
 extern const char* particle_spray_compute_wgsl;
 extern const char* gaussian_blur_shader_wgsl;
 extern const char* solarize_shader_wgsl;
+extern const char* scene1_shader_wgsl;
 extern const char* distort_shader_wgsl;
 extern const char* chroma_aberration_shader_wgsl;
 extern const char* vignette_shader_wgsl;
diff --git a/src/gpu/pipeline_builder.h b/src/gpu/pipeline_builder.h
new file mode 100644
index 0000000..06b4ceb
--- /dev/null
+++ b/src/gpu/pipeline_builder.h
@@ -0,0 +1,109 @@
+// WGPU render pipeline builder - reduces pipeline creation boilerplate
+#pragma once
+#include <vector>
+#include <string>
+
+// Forward declarations (users must include gpu.h and shader_composer.h)
+struct WGPUDeviceImpl;
+typedef struct WGPUDeviceImpl* WGPUDevice;
+struct WGPUBindGroupLayoutImpl;
+typedef struct WGPUBindGroupLayoutImpl* WGPUBindGroupLayout;
+struct WGPURenderPipelineImpl;
+typedef struct WGPURenderPipelineImpl* WGPURenderPipeline;
+struct WGPUShaderModuleImpl;
+typedef struct WGPUShaderModuleImpl* WGPUShaderModule;
+
+#include "platform/platform.h"
+#include "gpu/effects/shader_composer.h"
+
+class RenderPipelineBuilder {
+  WGPUDevice device_;
+  WGPURenderPipelineDescriptor desc_{};
+  WGPUColorTargetState color_{};
+  WGPUBlendState blend_{};
+  WGPUDepthStencilState depth_{};
+  std::vector<WGPUBindGroupLayout> layouts_;
+  std::string shader_text_;
+  WGPUShaderModule shader_module_ = nullptr;
+  bool has_blend_ = false;
+  bool has_depth_ = false;
+
+public:
+  explicit RenderPipelineBuilder(WGPUDevice device) : device_(device) {
+    desc_.primitive.topology = WGPUPrimitiveTopology_TriangleList;
+    desc_.primitive.cullMode = WGPUCullMode_None;
+    desc_.multisample.count = 1;
+    desc_.multisample.mask = 0xFFFFFFFF;
+  }
+
+  RenderPipelineBuilder& shader(const char* wgsl, bool compose = true) {
+    shader_text_ = compose ? ShaderComposer::Get().Compose({}, wgsl) : wgsl;
+    WGPUShaderSourceWGSL wgsl_src{};
+    wgsl_src.chain.sType = WGPUSType_ShaderSourceWGSL;
+    wgsl_src.code = str_view(shader_text_.c_str());
+    WGPUShaderModuleDescriptor shader_desc{};
+    shader_desc.nextInChain = &wgsl_src.chain;
+    shader_module_ = wgpuDeviceCreateShaderModule(device_, &shader_desc);
+    desc_.vertex.module = shader_module_;
+    desc_.vertex.entryPoint = str_view("vs_main");
+    return *this;
+  }
+
+  RenderPipelineBuilder& bind_group_layout(WGPUBindGroupLayout layout) {
+    layouts_.push_back(layout);
+    return *this;
+  }
+
+  RenderPipelineBuilder& format(WGPUTextureFormat fmt) {
+    color_.format = fmt;
+    return *this;
+  }
+
+  RenderPipelineBuilder& blend_alpha() {
+    has_blend_ = true;
+    blend_.color.operation = WGPUBlendOperation_Add;
+    blend_.color.srcFactor = WGPUBlendFactor_SrcAlpha;
+    blend_.color.dstFactor = WGPUBlendFactor_OneMinusSrcAlpha;
+    blend_.alpha.operation = WGPUBlendOperation_Add;
+    blend_.alpha.srcFactor = WGPUBlendFactor_One;
+    blend_.alpha.dstFactor = WGPUBlendFactor_OneMinusSrcAlpha;
+    return *this;
+  }
+
+  RenderPipelineBuilder& depth(WGPUTextureFormat depth_fmt = WGPUTextureFormat_Depth24Plus) {
+    has_depth_ = true;
+    depth_.format = depth_fmt;
+    depth_.depthWriteEnabled = WGPUOptionalBool_True;
+    depth_.depthCompare = WGPUCompareFunction_Less;
+    return *this;
+  }
+
+  RenderPipelineBuilder& cull_back() {
+    desc_.primitive.cullMode = WGPUCullMode_Back;
+    return *this;
+  }
+
+  // Builds the pipeline; single-use, because the shader module is released here.
+  WGPURenderPipeline build() {
+    color_.writeMask = WGPUColorWriteMask_All;
+    if (has_blend_) color_.blend = &blend_;
+    WGPUFragmentState fragment{};
+    fragment.module = shader_module_;
+    fragment.entryPoint = str_view("fs_main");
+    fragment.targetCount = 1;
+    fragment.targets = &color_;
+    WGPUPipelineLayoutDescriptor pl_desc{};
+    pl_desc.bindGroupLayoutCount = layouts_.size();
+    pl_desc.bindGroupLayouts = layouts_.data();
+    WGPUPipelineLayout layout = wgpuDeviceCreatePipelineLayout(device_, &pl_desc);
+    desc_.layout = layout;
+    desc_.fragment = &fragment;
+    if (has_depth_) desc_.depthStencil = &depth_;
+    WGPURenderPipeline pipeline = wgpuDeviceCreateRenderPipeline(device_, &desc_);
+    wgpuPipelineLayoutRelease(layout);
+    // The pipeline keeps its own reference; drop ours so the module is not leaked.
+    if (shader_module_) wgpuShaderModuleRelease(shader_module_);
+    shader_module_ = desc_.vertex.module = nullptr;
+    return pipeline;
+  }
+};
diff --git a/src/gpu/sampler_cache.h b/src/gpu/sampler_cache.h
new file mode 100644
index 0000000..0f012a8
--- /dev/null
+++ b/src/gpu/sampler_cache.h
@@ -0,0 +1,61 @@
+// Sampler cache - deduplicates samplers across effects
+#pragma once
+#include <map>
+
+// Forward declarations (users must include gpu.h)
+struct WGPUDeviceImpl;
+typedef struct WGPUDeviceImpl* WGPUDevice;
+struct WGPUSamplerImpl;
+typedef struct WGPUSamplerImpl* WGPUSampler;
+
+#include "platform/platform.h"
+
+struct SamplerSpec {
+  WGPUAddressMode u, v;
+  WGPUFilterMode mag, min;
+  uint16_t anisotropy;
+
+  bool operator<(const SamplerSpec& o) const {
+    if (u != o.u) return u < o.u;
+    if (v != o.v) return v < o.v;
+    if (mag != o.mag) return mag < o.mag;
+    if (min != o.min) return min < o.min;
+    return anisotropy < o.anisotropy;
+  }
+};
+
+class SamplerCache {
+  std::map<SamplerSpec, WGPUSampler> cache_;
+  SamplerCache() = default;
+
+public:
+  static SamplerCache& Get() {
+    static SamplerCache instance;
+    return instance;
+  }
+
+  WGPUSampler get_or_create(WGPUDevice device, const SamplerSpec& spec) {
+    auto it = cache_.find(spec);
+    if (it != cache_.end()) return it->second;
+
+    WGPUSamplerDescriptor desc{};
+    desc.addressModeU = spec.u;
+    desc.addressModeV = spec.v;
+    desc.magFilter = spec.mag;
+    desc.minFilter = spec.min;
+    desc.maxAnisotropy = spec.anisotropy;
+    WGPUSampler sampler = wgpuDeviceCreateSampler(device, &desc);
+    cache_[spec] = sampler;
+    return sampler;
+  }
+
+  // Common presets
+  static SamplerSpec linear() {
+    return {WGPUAddressMode_Repeat, WGPUAddressMode_Repeat,
+            WGPUFilterMode_Linear, WGPUFilterMode_Linear, 1};
+  }
+  static SamplerSpec clamp() {
+    return {WGPUAddressMode_ClampToEdge, WGPUAddressMode_ClampToEdge,
+            WGPUFilterMode_Linear, WGPUFilterMode_Linear, 1};
+  }
+};
diff --git a/src/tests/gpu/test_demo_effects.cc b/src/tests/gpu/test_demo_effects.cc
index 619b9c9..01e6678 100644
--- a/src/tests/gpu/test_demo_effects.cc
+++ b/src/tests/gpu/test_demo_effects.cc
@@ -134,6 +134,7 @@ static void test_scene_effects() {
       {"CircleMaskEffect", std::make_shared<CircleMaskEffect>(fixture.ctx())},
       {"RotatingCubeEffect",
        std::make_shared<RotatingCubeEffect>(fixture.ctx())},
+      {"Scene1Effect", std::make_shared<Scene1Effect>(fixture.ctx())},
   };
 
   int passed = 0;
diff --git a/tools/shadertoy/README.md b/tools/shadertoy/README.md
new file mode 100644
index 0000000..283a65f
--- /dev/null
+++ b/tools/shadertoy/README.md
@@ -0,0 +1,204 @@
+# ShaderToy Conversion Guide
+
+Quick guide to convert ShaderToy shaders to demo effects.
+
+**For complete workflow:** See `doc/EFFECT_WORKFLOW.md` for full integration checklist.
+
+## Quick Start (Automated)
+
+```bash
+# Save ShaderToy code to a file
+cat > tunnel.txt << 'EOF'
+void mainImage(out vec4 fragColor, in vec2 fragCoord) {
+    vec2 uv = fragCoord / iResolution.xy;
+    vec3 col = 0.5 + 0.5 * cos(iTime + uv.xyx + vec3(0,2,4));
+    fragColor = vec4(col, 1.0);
+}
+EOF
+
+# Generate effect files
+./tools/shadertoy/convert_shadertoy.py tunnel.txt Tunnel
+
+# Regenerate only shader (if .h/.cc already exist)
+./tools/shadertoy/convert_shadertoy.py tunnel.txt Tunnel --shader-only
+
+# Follow printed instructions to integrate
+```
+
+## Files
+
+**Automated Script:**
+- `convert_shadertoy.py` - Generates all files from ShaderToy code
+- `example.txt` - Example ShaderToy shader for testing
+
+**Manual Templates:**
+- `template.h` - Header boilerplate
+- `template.cc` - Implementation boilerplate
+- `template.wgsl` - Shader boilerplate with conversion notes
+
+## Manual Steps
+
+### 1. Copy Templates
+
+```bash
+# Choose effect name (e.g., "tunnel", "plasma", "warp")
+EFFECT_NAME="myeffect"
+
+cp tools/shadertoy/template.h src/gpu/effects/${EFFECT_NAME}_effect.h
+cp tools/shadertoy/template.cc src/gpu/effects/${EFFECT_NAME}_effect.cc
+cp tools/shadertoy/template.wgsl workspaces/main/shaders/${EFFECT_NAME}.wgsl
+```
+
+### 2. Rename Class
+
+In both `.h` and `.cc`:
+- `ShaderToyEffect` → `MyEffectEffect`
+- `SHADERTOY_EFFECT_H_` → `MYEFFECT_EFFECT_H_`
+- `shadertoy_effect.h` → `myeffect_effect.h`
+
+### 3. Convert Shader
+
+In `.wgsl`, paste ShaderToy `mainImage()` into `fs_main()`:
+
+**ShaderToy:**
+```glsl
+void mainImage(out vec4 fragColor, in vec2 fragCoord) {
+    vec2 uv = fragCoord / iResolution.xy;
+    fragColor = vec4(uv, 0.5, 1.0);
+}
+```
+
+**WGSL:**
+```wgsl
+@fragment fn fs_main(@builtin(position) p: vec4<f32>) -> @location(0) vec4<f32> {
+    let uv = p.xy / uniforms.resolution;
+    return vec4<f32>(uv, 0.5, 1.0);
+}
+```
+
+### 4. Update Asset Name
+
+In `.cc`, update `AssetId::ASSET_SHADERTOY_SHADER` to match your shader filename:
+```cpp
+AssetId::ASSET_SHADER_MYEFFECT
+```
+
+### 5. Add to Assets
+
+In `workspaces/main/assets.txt`:
+```
+shaders/myeffect.wgsl
+```
+
+### 6. Register Effect
+
+In `src/gpu/demo_effects.h`:
+```cpp
+#include "gpu/effects/myeffect_effect.h"
+```
+
+In `workspaces/main/timeline.seq`:
+```
+SEQUENCE 0.0 0
+  EFFECT + MyEffectEffect 0.0 10.0
+```
+
+### 7. Update CMakeLists.txt
+
+Add effect source to `CMakeLists.txt` GPU_SOURCES (both headless and normal mode sections):
+```cmake
+src/gpu/effects/myeffect_effect.cc
+```
+
+### 8. Update Tests
+
+In `src/tests/gpu/test_demo_effects.cc`:
+- Add to `post_process_effects` list (lines 80-93) if it's a post-process effect
+- OR add to `scene_effects` list (lines 125-137) if it's a scene effect
+- Example: `{"MyEffectEffect", std::make_shared<MyEffectEffect>(fixture.ctx())},`
+
+### 9. Build & Test
+
+```bash
+cmake --build build -j4
+./build/demo64k
+
+# Run tests
+cmake -S . -B build -DDEMO_BUILD_TESTS=ON
+cmake --build build -j4
+cd build && ctest
+```
+
+## Example Conversion
+
+**Input ShaderToy:**
+```glsl
+void mainImage(out vec4 fragColor, in vec2 fragCoord) {
+    vec2 uv = fragCoord / iResolution.xy;
+    vec3 col = 0.5 + 0.5 * cos(iTime + uv.xyx + vec3(0,2,4));
+    fragColor = vec4(col, 1.0);
+}
+```
+
+**Generated WGSL (after script + manual fixes):**
+```wgsl
+@fragment fn fs_main(@builtin(position) p: vec4<f32>) -> @location(0) vec4<f32> {
+    let uv = p.xy / uniforms.resolution;
+    let col = vec3<f32>(0.5) + 0.5 * cos(uniforms.time + uv.xyx + vec3<f32>(0.0, 2.0, 4.0));
+    return vec4<f32>(col, 1.0);
+}
+```
+
+## Common Conversions
+
+| ShaderToy | WGSL |
+|-----------|------|
+| `iResolution.xy` | `uniforms.resolution` |
+| `iTime` | `uniforms.time` |
+| `fragCoord` | `p.xy` |
+| `float` | `f32` |
+| `vec2` | `vec2<f32>` |
+| `mod(x, y)` | `x % y` (truncates toward zero; use `x - y * floor(x / y)` when `x` can be negative) |
+| `texture(iChannel0, uv)` | `textureSample(txt, smplr, uv)` |
+| `fragColor = ...` | `return ...` |
+| `vec2 p = ...` | `let p = vec2<f32>(...)` or `var p: vec2<f32> = ...` |
+
+## Custom Parameters
+
+For tunable values:
+
+**C++ (`.h`):**
+```cpp
+struct MyEffectParams {
+    float speed;
+    float scale;
+    float _pad[2];
+};
+static_assert(sizeof(MyEffectParams) == 16, "...");
+```
+
+**WGSL:**
+```wgsl
+struct MyEffectParams {
+    speed: f32,
+    scale: f32,
+    _pad0: f32,
+    _pad1: f32,
+}
+@group(0) @binding(3) var<uniform> params: MyEffectParams;
+```
+
+## Available Uniforms
+
+Always available in `uniforms: CommonUniforms`:
+- `resolution: vec2<f32>` - Screen resolution
+- `aspect_ratio: f32` - Width/height
+- `time: f32` - Demo time (seconds)
+- `beat: f32` - Music beat sync (0-1)
+- `audio_intensity: f32` - Audio reactivity
+
+## Next Steps
+
+- See `doc/CONTRIBUTING.md` for commit policy
+- See `doc/SEQUENCE.md` for timeline syntax
+- See existing effects in `src/gpu/effects/` for examples
diff --git a/tools/shadertoy/convert_shadertoy.py b/tools/shadertoy/convert_shadertoy.py
new file mode 100755
index 0000000..e85f384
--- /dev/null
+++ b/tools/shadertoy/convert_shadertoy.py
@@ -0,0 +1,399 @@
+#!/usr/bin/env python3
+# This file is part of the 64k demo project.
+# Converts ShaderToy shader to demo effect boilerplate.
+#
+# Usage:
+#   ./tools/shadertoy/convert_shadertoy.py <shader.txt> <EffectName> [--post-process] [--shader-only]
+#
+# Example:
+#   ./tools/shadertoy/convert_shadertoy.py tunnel.txt Tunnel
+#   ./tools/shadertoy/convert_shadertoy.py tools/shadertoy/example.txt Rainbow
+#
+# Generates:
+#   - src/gpu/effects/<effect>_effect.h
+#   - src/gpu/effects/<effect>_effect.cc
+#   - workspaces/main/shaders/<effect>.wgsl
+#
+# The script performs basic ShaderToy→WGSL conversion:
+#   - Converts types (float→f32, vec2→vec2<f32>, etc.)
+#   - Converts uniforms (iTime→uniforms.time, etc.)
+#   - Extracts mainImage() body into fs_main()
+#   - Generates boilerplate C++ effect class
+#
+# Manual fixes usually needed:
+#   - fragColor assignments → return statements
+#   - Variable name conflicts (e.g., shadowing 'p')
+#   - Complex type inference
+#   - Texture channel mappings
+#   - Helper function signatures
+
+import sys
+import os
+import re
+from pathlib import Path
+
+def to_snake_case(name):
+    """Convert CamelCase to snake_case."""
+    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
+    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
+
+def to_upper_snake_case(name):
+    """Convert CamelCase to UPPER_SNAKE_CASE."""
+    return to_snake_case(name).upper()
+
+def to_camel_case(name):
+    """Convert snake_case to CamelCase."""
+    return ''.join(word.capitalize() for word in name.split('_'))
+
+def convert_shadertoy_to_wgsl(shader_code):
+    """Basic ShaderToy to WGSL conversion."""
+    # Extract mainImage first
+    main_match = re.search(r'void\s+mainImage\s*\([^)]+\)\s*\{(.*)\}', shader_code, re.DOTALL)
+    if main_match:
+        main_body = main_match.group(1).strip()
+        helpers = shader_code[:main_match.start()]
+    else:
+        main_body = ""
+        helpers = shader_code
+
+    # Replace common ShaderToy defines
+    conversions = [
+        (r'#define\s+TIME\s+iTime', ''),
+        (r'#define\s+RESOLUTION\s+iResolution', ''),
+        (r'#define\s+PI\s+[\d.]+', 'const PI: f32 = 3.141592654;'),
+        (r'#define\s+TAU\s+\([^)]+\)', 'const TAU: f32 = 6.283185307;'),
+        (r'#define\s+ROT\(a\)\s+mat2\([^)]+\)', ''),  # Will be converted to function
+
+        # Common ShaderToy uniforms
+        (r'\bTIME\b', 'uniforms.time'),
+        (r'\biTime\b', 'uniforms.time'),
+        (r'\bRESOLUTION\b', 'uniforms.resolution'),
+        (r'\biResolution\b', 'uniforms.resolution'),
+        (r'\bfragCoord\b', 'p.xy'),
+
+        # Type conversions
+        (r'\bfloat\b', 'f32'),
+        (r'\bvec2\b', 'vec2<f32>'),
+        (r'\bvec3\b', 'vec3<f32>'),
+        (r'\bvec4\b', 'vec4<f32>'),
+        (r'\bmat2\b', 'mat2x2<f32>'),
+        (r'\bmat3\b', 'mat3x3<f32>'),
+        (r'\bmat4\b', 'mat4x4<f32>'),
+
+        # Function declarations (the return type is dropped; add '-> T' by hand)
+        (r'\bf32\s+(\w+)\s*\(', r'fn \1('),
+        (r'\bvec2<f32>\s+(\w+)\s*\(', r'fn \1('),
+        (r'\bvec3<f32>\s+(\w+)\s*\(', r'fn \1('),
+        (r'\bvec4<f32>\s+(\w+)\s*\(', r'fn \1('),
+        (r'\bvoid\s+(\w+)\s*\(', r'fn \1('),
+
+        # Const declarations
+        (r'\bconst\s+f32\s+(\w+)\s*=', r'const \1: f32 ='),
+        (r'\bconst\s+vec2<f32>\s+(\w+)\s*=', r'const \1 ='),
+        (r'\bconst\s+vec3<f32>\s+(\w+)\s*=', r'const \1 ='),
+        (r'\bconst\s+vec4<f32>\s+(\w+)\s*=', r'const \1 ='),
+
+        # Function calls that need fixing
+        (r'\bfract\s*\(', 'fract('),
+        (r'\bmod\s*\(([^,]+),\s*([^)]+)\)', r'(\1 % \2)'),
+    ]
+
+    converted_helpers = helpers
+    for pattern, replacement in conversions:
+        converted_helpers = re.sub(pattern, replacement, converted_helpers)
+
+    # Convert mainImage body
+    converted_main = main_body
+    for pattern, replacement in conversions:
+        converted_main = re.sub(pattern, replacement, converted_main)
+
+    # Fix fragColor assignments -> returns
+    converted_main = re.sub(r'\bfragColor\s*=\s*([^;]+);', r'return \1;', converted_main)
+
+    # Indent main body
+    indented_main = '\n'.join('  ' + line if line.strip() else '' for line in converted_main.split('\n'))
+
+    # Build fragment function with Y-flip for ShaderToy convention
+    fragment = f"""@fragment fn fs_main(@builtin(position) p: vec4<f32>) -> @location(0) vec4<f32> {{
+  // Flip Y to match ShaderToy convention (origin at bottom-left)
+  let flipped = vec2<f32>(p.x, uniforms.resolution.y - p.y);
+  let q = flipped / uniforms.resolution;
+  var coord = -1.0 + 2.0 * q;
+  coord.x *= uniforms.resolution.x / uniforms.resolution.y;
+
+{indented_main}
+}}"""
+
+    return converted_helpers + '\n\n' + fragment
+
+def extract_main_image(shader_code):
+    """Extract mainImage function body from ShaderToy code."""
+    # Try to find mainImage function
+    match = re.search(r'void\s+mainImage\s*\([^)]+\)\s*\{(.*)\}', shader_code, re.DOTALL)
+    if match:
+        return match.group(1).strip()
+
+    # If no mainImage found, return whole shader
+    return shader_code
+
+def generate_header(effect_name, is_post_process=False):
+    """Generate .h file content."""
+    class_name = f"{effect_name}Effect"
+    upper_name = to_upper_snake_case(effect_name)
+
+    if is_post_process:
+        return f"""// This file is part of the 64k demo project.
+// {effect_name} effect - ShaderToy conversion (post-process)
+// Generated by convert_shadertoy.py
+
+#ifndef {upper_name}_EFFECT_H_
+#define {upper_name}_EFFECT_H_
+
+#include "gpu/effect.h"
+#include "gpu/effects/post_process_helper.h"
+
+class {class_name} : public PostProcessEffect {{
+ public:
+  {class_name}(const GpuContext& ctx);
+  void render(WGPURenderPassEncoder pass, float time, float beat,
+              float intensity, float aspect_ratio) override;
+  void update_bind_group(WGPUTextureView input_view) override;
+}};
+
+#endif /* {upper_name}_EFFECT_H_ */
+"""
+    else:
+        # Scene effect (simpler, like HeptagonEffect)
+        return f"""// This file is part of the 64k demo project.
+// {effect_name} effect - ShaderToy conversion (scene)
+// Generated by convert_shadertoy.py
+
+#ifndef {upper_name}_EFFECT_H_
+#define {upper_name}_EFFECT_H_
+
+#include "gpu/effect.h"
+
+class {class_name} : public Effect {{
+ public:
+  {class_name}(const GpuContext& ctx);
+  void render(WGPURenderPassEncoder pass, float time, float beat,
+              float intensity, float aspect_ratio) override;
+
+ private:
+  RenderPass pass_;
+}};
+
+#endif /* {upper_name}_EFFECT_H_ */
+"""
+
+def generate_implementation(effect_name, is_post_process=False):
+    """Generate .cc file content."""
+    class_name = f"{effect_name}Effect"
+    snake_name = to_snake_case(effect_name)
+
+    if is_post_process:
+        return f"""// This file is part of the 64k demo project.
+// {effect_name} effect - ShaderToy conversion (post-process)
+// Generated by convert_shadertoy.py
+
+#include "gpu/demo_effects.h"
+#include "gpu/effects/post_process_helper.h"
+#include "gpu/gpu.h"
+
+{class_name}::{class_name}(const GpuContext& ctx) : PostProcessEffect(ctx) {{
+  pipeline_ = create_post_process_pipeline(ctx_.device, ctx_.format, {snake_name}_shader_wgsl);
+}}
+
+void {class_name}::render(WGPURenderPassEncoder pass, float time, float beat,
+                          float intensity, float aspect_ratio) {{
+  const CommonPostProcessUniforms u = {{
+      .resolution = {{(float)width_, (float)height_}},
+      ._pad = {{0.0f, 0.0f}},
+      .aspect_ratio = aspect_ratio,
+      .time = time,
+      .beat = beat,
+      .audio_intensity = intensity,
+  }};
+  uniforms_.update(ctx_.queue, u);
+
+  wgpuRenderPassEncoderSetPipeline(pass, pipeline_);
+  wgpuRenderPassEncoderSetBindGroup(pass, 0, bind_group_, 0, nullptr);
+  wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0);
+}}
+
+void {class_name}::update_bind_group(WGPUTextureView input_view) {{
+  pp_update_bind_group(ctx_.device, pipeline_, &bind_group_, input_view, uniforms_.get());
+}}
+"""
+    else:
+        # Scene effect (simpler pattern like HeptagonEffect)
+        return f"""// This file is part of the 64k demo project.
+// {effect_name} effect - ShaderToy conversion (scene)
+// Generated by convert_shadertoy.py
+
+#include "gpu/demo_effects.h"
+#include "gpu/gpu.h"
+
+{class_name}::{class_name}(const GpuContext& ctx) : Effect(ctx) {{
+  ResourceBinding bindings[] = {{{{uniforms_.get(), WGPUBufferBindingType_Uniform}}}};
+  pass_ = gpu_create_render_pass(ctx_.device, ctx_.format, {snake_name}_shader_wgsl,
+                                 bindings, 1);
+  pass_.vertex_count = 3;
+}}
+
+void {class_name}::render(WGPURenderPassEncoder pass, float t, float b,
+                          float i, float a) {{
+  CommonPostProcessUniforms u = {{
+      .resolution = {{(float)width_, (float)height_}},
+      ._pad = {{0.0f, 0.0f}},
+      .aspect_ratio = a,
+      .time = t,
+      .beat = b,
+      .audio_intensity = i,
+  }};
+  uniforms_.update(ctx_.queue, u);
+  wgpuRenderPassEncoderSetPipeline(pass, pass_.pipeline);
+  wgpuRenderPassEncoderSetBindGroup(pass, 0, pass_.bind_group, 0, nullptr);
+  wgpuRenderPassEncoderDraw(pass, pass_.vertex_count, 1, 0, 0);
+}}
+"""
+
+def generate_shader(effect_name, shadertoy_code, is_post_process=False):
+    """Assemble the .wgsl text: header, bindings, fullscreen-triangle VS, converted code."""
+    binding_blocks = {
+        True: """@group(0) @binding(0) var smplr: sampler;
+@group(0) @binding(1) var txt: texture_2d<f32>;
+
+#include "common_uniforms"
+
+@group(0) @binding(2) var<uniform> uniforms: CommonUniforms;""",
+        # Scene effects take no texture input, only the uniform block
+        False: """#include "common_uniforms"
+
+@group(0) @binding(0) var<uniform> uniforms: CommonUniforms;""",
+    }
+    # The whole ShaderToy source is converted, not just mainImage
+    body_wgsl = convert_shadertoy_to_wgsl(shadertoy_code)
+    uniforms_decl = binding_blocks[bool(is_post_process)]
+
+    return f"""// {effect_name} effect shader - ShaderToy conversion
+// Generated by convert_shadertoy.py
+// NOTE: Manual review recommended - conversion is basic
+
+{uniforms_decl}
+
+@vertex fn vs_main(@builtin(vertex_index) i: u32) -> @builtin(position) vec4<f32> {{
+    var pos = array<vec2<f32>, 3>(
+        vec2<f32>(-1.0, -1.0),
+        vec2<f32>(3.0, -1.0),
+        vec2<f32>(-1.0, 3.0)
+    );
+    return vec4<f32>(pos[i], 0.0, 1.0);
+}}
+
+{body_wgsl}
+"""
+
+def main():
+    """CLI entry point: convert a ShaderToy file into effect sources and print wiring steps."""
+    if len(sys.argv) < 3:
+        print("Usage: convert_shadertoy.py <shader.txt> <EffectName> [--post-process] [--shader-only]")
+        print()
+        print("Examples:")
+        print("  ./tools/shadertoy/convert_shadertoy.py tunnel.txt Tunnel")
+        print("  ./tools/shadertoy/convert_shadertoy.py blur.txt Blur --post-process")
+        print("  ./tools/shadertoy/convert_shadertoy.py tunnel.txt Tunnel --shader-only")
+        print()
+        print("Options:")
+        print("  --post-process  Generate post-process effect (operates on previous frame)")
+        print("                  Default: scene effect (renders geometry)")
+        print("  --shader-only   Only regenerate .wgsl shader (skip .h/.cc files)")
+        print()
+        print("This will generate:")
+        print("  src/gpu/effects/<effect>_effect.h")
+        print("  src/gpu/effects/<effect>_effect.cc")
+        print("  workspaces/main/shaders/<effect>.wgsl")
+        sys.exit(1)
+
+    shader_file = sys.argv[1]
+    effect_name = sys.argv[2]
+    is_post_process = '--post-process' in sys.argv
+    shader_only = '--shader-only' in sys.argv
+
+    # snake_case names are converted to CamelCase; any other spelling is used as given
+    if '_' in effect_name:
+        effect_name = to_camel_case(effect_name)
+
+    # Read shader code
+    if not os.path.exists(shader_file):
+        print(f"Error: {shader_file} not found")
+        sys.exit(1)
+
+    with open(shader_file, 'r') as f:
+        shadertoy_code = f.read()
+
+    # Generate file names
+    snake_name = to_snake_case(effect_name)
+    upper_name = to_upper_snake_case(effect_name)
+
+    # Script is in tools/shadertoy/, so go up two levels to repo root
+    repo_root = Path(__file__).parent.parent.parent
+    header_path = repo_root / "src" / "gpu" / "effects" / f"{snake_name}_effect.h"
+    impl_path = repo_root / "src" / "gpu" / "effects" / f"{snake_name}_effect.cc"
+    shader_path = repo_root / "workspaces" / "main" / "shaders" / f"{snake_name}.wgsl"
+
+    # Generate files
+    if shader_only:
+        print(f"Regenerating shader only: {effect_name}")
+        print(f"  Shader:     {shader_path}")
+        print()
+        shader_path.write_text(generate_shader(effect_name, shadertoy_code, is_post_process))
+        print(f"✓ Shader regenerated")
+        return
+
+    print(f"Generating effect: {effect_name}")
+    print(f"  Header:     {header_path}")
+    print(f"  Impl:       {impl_path}")
+    print(f"  Shader:     {shader_path}")
+    print()
+
+    # Write files
+    header_path.write_text(generate_header(effect_name, is_post_process))
+    impl_path.write_text(generate_implementation(effect_name, is_post_process))
+    shader_path.write_text(generate_shader(effect_name, shadertoy_code, is_post_process))
+
+    effect_type = "post-process" if is_post_process else "scene"
+    print(f"✓ Files generated ({effect_type} effect)")
+    print()
+    print("Next steps (see doc/EFFECT_WORKFLOW.md for details):")
+    print()
+    print("1. Add shader to workspaces/main/assets.txt:")
+    print(f"   SHADER_{upper_name}, NONE, shaders/{snake_name}.wgsl, \"{effect_name} effect\"")
+    print()
+    print("2. Add shader declaration to src/gpu/effects/shaders.h:")
+    print(f"   extern const char* {snake_name}_shader_wgsl;")
+    print()
+    print("3. Add shader definition to src/gpu/effects/shaders.cc:")
+    print(f"   const char* {snake_name}_shader_wgsl = SafeGetAsset(AssetId::ASSET_SHADER_{upper_name});")
+    print()
+    print("4. Include header in src/gpu/demo_effects.h:")
+    print(f'   #include "gpu/effects/{snake_name}_effect.h"')
+    print()
+    print("5. Add to timeline in workspaces/main/timeline.seq:")
+    print(f"   EFFECT + {effect_name}Effect 0.0 10.0")
+    print()
+    print("6. Add to CMakeLists.txt GPU_SOURCES (both headless and normal mode):")
+    print(f"   src/gpu/effects/{snake_name}_effect.cc")
+    print()
+    print("7. Update src/tests/gpu/test_demo_effects.cc:")
+    test_list = "post_process_effects" if is_post_process else "scene_effects"
+    # Emit a paste-ready C++ initializer: {"NameEffect", std::make_shared<NameEffect>(...)}
+    print(f'   - Add {{"{effect_name}Effect", std::make_shared<{effect_name}Effect>(fixture.ctx())}} to {test_list} list')
+    print()
+    print("8. Build and test:")
+    print("   cmake --build build -j4")
+    print("   ./build/demo64k")
+    print()
+    print("Note: Review generated shader for const expression issues (normalize, etc)")
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/shadertoy/example.txt b/tools/shadertoy/example.txt
new file mode 100644
index 0000000..e0287de
--- /dev/null
+++ b/tools/shadertoy/example.txt
@@ -0,0 +1,25 @@
+// Example ShaderToy shader for testing convert_shadertoy.py
+// Simple animated gradient effect
+//
+// Test with:
+//   ./tools/shadertoy/convert_shadertoy.py tools/shadertoy/example.txt Rainbow
+
+void mainImage(out vec4 fragColor, in vec2 fragCoord) {
+    // Normalized pixel coordinates (from 0 to 1)
+    vec2 uv = fragCoord / iResolution.xy;
+
+    // Shift the origin to the screen center: each axis now spans [-0.5, 0.5]
+    vec2 center = uv - 0.5;
+
+    // Distance from center
+    float dist = length(center);
+
+    // Animated rainbow: a cosine of time + radius with per-channel phases 0, 2, 4
+    vec3 col = 0.5 + 0.5 * cos(iTime + dist * 10.0 + vec3(0.0, 2.0, 4.0));
+
+    // Pulsing effect: scales brightness between 0.8x and 1.2x
+    col *= 1.0 + 0.2 * sin(iTime * 2.0);
+
+    // Output to screen
+    fragColor = vec4(col, 1.0);
+}
diff --git a/tools/shadertoy/template.cc b/tools/shadertoy/template.cc
new file mode 100644
index 0000000..288283d
--- /dev/null
+++ b/tools/shadertoy/template.cc
@@ -0,0 +1,120 @@
+// This file is part of the 64k demo project.
+// ShaderToy effect implementation - REPLACE THIS LINE
+// TODO: Update description, rename class
+
+#include "gpu/effects/shadertoy_effect.h"
+#include "gpu/effects/shader_composer.h"
+#include "generated/assets.h"
+
+// TODO: Rename class and adjust constructor parameters (GPU objects are built in init())
+ShaderToyEffect::ShaderToyEffect(const GpuContext& ctx) : Effect(ctx) {
+}
+
+ShaderToyEffect::~ShaderToyEffect() {
+  // Release only what init() actually created: each handle is declared as
+  // nullptr in the header, so destroying a never-initialized effect is a
+  // no-op. Order: sampler, bind group, pipeline.
+  if (sampler_) wgpuSamplerRelease(sampler_);
+  if (bind_group_) wgpuBindGroupRelease(bind_group_);
+  if (pipeline_) wgpuRenderPipelineRelease(pipeline_);
+}
+
+void ShaderToyEffect::init(MainSequence* demo) {
+  // Builds the sampler, pipeline and bind group; the destructor releases them.
+  demo_ = demo;
+  params_.init(ctx_.device);
+
+  WGPUSamplerDescriptor sampler_desc = {};
+  sampler_desc.addressModeU = WGPUAddressMode_ClampToEdge;
+  sampler_desc.addressModeV = WGPUAddressMode_ClampToEdge;
+  sampler_desc.magFilter = WGPUFilterMode_Linear;
+  sampler_desc.minFilter = WGPUFilterMode_Linear;
+  sampler_desc.mipmapFilter = WGPUMipmapFilterMode_Linear;
+  sampler_desc.maxAnisotropy = 1;
+  sampler_ = wgpuDeviceCreateSampler(ctx_.device, &sampler_desc);
+
+  // TODO: Update asset name to match your shader file
+  size_t shader_size;
+  const char* shader_code = (const char*)GetAsset(
+      AssetId::ASSET_SHADERTOY_SHADER, &shader_size);
+
+  std::string composed = ShaderComposer::Get().Compose({}, shader_code);
+  WGPUShaderSourceWGSL wgsl = {};
+  wgsl.chain.sType = WGPUSType_ShaderSourceWGSL;
+  wgsl.code = str_view(composed.c_str());
+  WGPUShaderModuleDescriptor desc = {};
+  desc.nextInChain = &wgsl.chain;
+  WGPUShaderModule module = wgpuDeviceCreateShaderModule(ctx_.device, &desc);
+
+  const WGPUColorTargetState target = {
+      .format = ctx_.format,
+      .writeMask = WGPUColorWriteMask_All,
+  };
+  WGPUFragmentState frag = {};
+  frag.module = module;
+  frag.entryPoint = str_view("fs_main");
+  frag.targetCount = 1;
+  frag.targets = &target;
+
+  const WGPUDepthStencilState depth_stencil = {
+      .format = WGPUTextureFormat_Depth24Plus,
+      .depthWriteEnabled = WGPUOptionalBool_False,
+      .depthCompare = WGPUCompareFunction_Always,
+  };
+
+  WGPURenderPipelineDescriptor pipeline_desc = {};
+  pipeline_desc.label = label_view("ShaderToyEffect");
+  pipeline_desc.vertex.module = module;
+  pipeline_desc.vertex.entryPoint = str_view("vs_main");
+  pipeline_desc.primitive.topology = WGPUPrimitiveTopology_TriangleList;
+  pipeline_desc.primitive.cullMode = WGPUCullMode_None;
+  pipeline_desc.depthStencil = &depth_stencil;
+  pipeline_desc.multisample.count = 1;
+  pipeline_desc.multisample.mask = 0xFFFFFFFF;
+  pipeline_desc.fragment = &frag;
+
+  pipeline_ = wgpuDeviceCreateRenderPipeline(ctx_.device, &pipeline_desc);
+  wgpuShaderModuleRelease(module);
+
+  WGPUTextureView prev_view = demo_->get_prev_texture_view();
+  const WGPUBindGroupEntry entries[] = {
+      {.binding = 0, .sampler = sampler_},
+      {.binding = 1, .textureView = prev_view},
+      {.binding = 2, .buffer = uniforms_.get().buffer,
+       .size = sizeof(CommonPostProcessUniforms)},
+      {.binding = 3, .buffer = params_.get().buffer,
+       .size = sizeof(ShaderToyParams)},
+  };
+  // GetBindGroupLayout hands back an owned reference; release it after use.
+  WGPUBindGroupLayout bgl = wgpuRenderPipelineGetBindGroupLayout(pipeline_, 0);
+  const WGPUBindGroupDescriptor bg_desc = {
+      .layout = bgl,
+      .entryCount = 4,
+      .entries = entries,
+  };
+  bind_group_ = wgpuDeviceCreateBindGroup(ctx_.device, &bg_desc);
+  wgpuBindGroupLayoutRelease(bgl);
+}
+
+void ShaderToyEffect::render(WGPURenderPassEncoder pass, float time,
+                              float beat, float intensity, float aspect_ratio) {
+  const float w = static_cast<float>(width_);
+  const float h = static_cast<float>(height_);
+  const CommonPostProcessUniforms frame_uniforms = {
+      .resolution = {w, h},
+      .aspect_ratio = aspect_ratio,
+      .time = time,
+      .beat = beat,
+      .audio_intensity = intensity,
+  };
+  uniforms_.update(ctx_.queue, frame_uniforms);
+
+  // TODO: Update parameters based on your effect
+  const ShaderToyParams frame_params = {.param1 = 1.0f, .param2 = beat};
+  params_.update(ctx_.queue, frame_params);
+
+  // Draw 3 vertices, 1 instance, using the pipeline/bind group from init()
+  wgpuRenderPassEncoderSetPipeline(pass, pipeline_);
+  wgpuRenderPassEncoderSetBindGroup(pass, 0, bind_group_, 0, nullptr);
+  wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0);
+}
diff --git a/tools/shadertoy/template.h b/tools/shadertoy/template.h
new file mode 100644
index 0000000..2e4af5f
--- /dev/null
+++ b/tools/shadertoy/template.h
@@ -0,0 +1,41 @@
+// This file is part of the 64k demo project.
+// ShaderToy effect boilerplate - REPLACE THIS LINE WITH DESCRIPTION
+// TODO: Update description, rename class, adjust parameters
+
+#ifndef SHADERTOY_EFFECT_H_
+#define SHADERTOY_EFFECT_H_
+
+#include "gpu/effect.h"
+#include "gpu/effects/post_process_helper.h"
+#include "gpu/uniform_helper.h"
+
+// TODO: Rename class to match your effect (e.g., TunnelEffect, PlasmaEffect)
+class ShaderToyEffect : public Effect {
+ public:
+  // TODO: Add constructor parameters for tunable values
+  explicit ShaderToyEffect(const GpuContext& ctx);  // no implicit conversion
+  ~ShaderToyEffect() override;
+
+  void init(MainSequence* demo) override;
+  void render(WGPURenderPassEncoder pass, float time, float beat,
+              float intensity, float aspect_ratio) override;
+
+ private:
+  // TODO: Add effect-specific parameters here
+  // Must match WGSL struct exactly - use padding for 16-byte alignment
+  struct ShaderToyParams {
+    float param1;
+    float param2;
+    float _pad[2];  // Padding to 16 bytes
+  };
+  static_assert(sizeof(ShaderToyParams) == 16,
+                "ShaderToyParams must be 16 bytes for WGSL alignment");
+
+  MainSequence* demo_ = nullptr;           // set in init()
+  WGPURenderPipeline pipeline_ = nullptr;  // created in init()
+  WGPUBindGroup bind_group_ = nullptr;     // created in init()
+  WGPUSampler sampler_ = nullptr;          // created in init()
+  UniformBuffer<ShaderToyParams> params_;  // bound at @binding(3)
+};
+
+#endif /* SHADERTOY_EFFECT_H_ */
diff --git a/tools/shadertoy/template.wgsl b/tools/shadertoy/template.wgsl
new file mode 100644
index 0000000..37e7def
--- /dev/null
+++ b/tools/shadertoy/template.wgsl
@@ -0,0 +1,90 @@
+// ShaderToy conversion template for 64k demo project
+// TODO: Paste ShaderToy mainImage() function below and adapt
+
+@group(0) @binding(0) var smplr: sampler;
+@group(0) @binding(1) var txt: texture_2d<f32>;
+
+#include "common_uniforms"
+
+@group(0) @binding(2) var<uniform> uniforms: CommonUniforms;
+
+// TODO: Define your effect parameters (must match C++ struct)
+struct ShaderToyParams {
+    param1: f32,
+    param2: f32,
+    _pad0: f32,  // _pad0/_pad1 round the struct up to 16 bytes
+    _pad1: f32,
+}
+
+@group(0) @binding(3) var<uniform> params: ShaderToyParams;
+
+// Standard fullscreen triangle vertex shader
+@vertex fn vs_main(@builtin(vertex_index) i: u32) -> @builtin(position) vec4<f32> {
+    // Single oversized triangle: the two corners at 3.0 fall outside clip
+    // space, so after clipping it covers the full [-1, 1] viewport.
+    var corners = array<vec2<f32>, 3>(
+        vec2<f32>(-1.0, -1.0), vec2<f32>(3.0, -1.0), vec2<f32>(-1.0, 3.0)
+    );
+    return vec4<f32>(corners[i], 0.0, 1.0);
+}
+
+// ============================================================================
+// PASTE SHADERTOY CODE HERE
+// ============================================================================
+// ShaderToy → WGSL conversion notes:
+//
+// 1. Replace ShaderToy uniforms:
+//    iResolution.xy        → uniforms.resolution
+//    iTime                 → uniforms.time
+//    fragCoord             → p.xy (from @builtin(position); y runs top-down here, bottom-up in ShaderToy)
+//    fragColor             → return value
+//
+// 2. Coordinate conversion:
+//    vec2 uv = fragCoord / iResolution.xy;
+//    becomes:
+//    let uv = p.xy / uniforms.resolution;
+//
+// 3. Type syntax changes:
+//    float                 → f32
+//    vec2/vec3/vec4        → vec2<f32>, vec3<f32>, vec4<f32>
+//    mat2/mat3/mat4        → mat2x2<f32>, mat3x3<f32>, mat4x4<f32>
+//
+// 4. Function syntax:
+//    float foo(vec2 p)     → fn foo(p: vec2<f32>) -> f32
+//
+// 5. Common functions (mostly same):
+//    mix, sin, cos, length, normalize, dot, cross, etc.
+//    fract()               → fract()
+//    mod(x, y)             → x - y * floor(x / y)  (x % y truncates instead; differs when x / y < 0)
+//
+// 6. Texture sampling:
+//    texture(iChannel0, uv) → textureSample(txt, smplr, uv)
+//
+// 7. Variable declarations:
+//    float x = 1.0;        → var x: f32 = 1.0;  OR  let x = 1.0;
+//    const float x = 1.0;  → const x: f32 = 1.0;
+//
+// 8. Swizzling is the same: col.rgb, uv.xy, etc.
+//
+// ============================================================================
+
+@fragment fn fs_main(@builtin(position) p: vec4<f32>) -> @location(0) vec4<f32> {
+    // TODO: Paste and adapt ShaderToy mainImage() body here
+
+    // Example coordinate setup (typical ShaderToy pattern): pixel position / resolution
+    let uv = p.xy / uniforms.resolution;
+
+    // TODO: Your effect code here (placeholder: uv gradient in red/green, constant blue)
+    var col = vec3<f32>(uv.x, uv.y, 0.5);
+
+    // Optional: Sample previous frame
+    // var prev_col = textureSample(txt, smplr, uv);
+
+    // Optional: Audio reactivity
+    // col *= 1.0 + uniforms.audio_intensity * 0.2;
+
+    // Optional: Beat sync
+    // col *= 1.0 + uniforms.beat * 0.1;
+
+    return vec4<f32>(col, 1.0);  // alpha fixed at 1.0
+}
diff --git a/training/ground_truth.png b/training/ground_truth.png
new file mode 100644
index 0000000..6e1f2aa
--- /dev/null
+++ b/training/ground_truth.png
diff --git a/training/train_cnn.py b/training/train_cnn.py
index 1cd6579..16f8e7a 100755
--- a/training/train_cnn.py
+++ b/training/train_cnn.py
@@ -5,10 +5,15 @@ CNN Training Script for Image-to-Image Transformation
 Trains a convolutional neural network on multiple input/target image pairs.
 
 Usage:
+    # Training
     python3 train_cnn.py --input input_dir/ --target target_dir/ [options]
 
+    # Inference (generate ground truth)
+    python3 train_cnn.py --infer image.png --export-only checkpoint.pth --output result.png
+
 Example:
     python3 train_cnn.py --input ./input --target ./output --layers 3 --epochs 100
+    python3 train_cnn.py --infer input.png --export-only checkpoints/checkpoint_epoch_10000.pth
 """
 
 import torch
@@ -62,7 +67,8 @@ class ImagePairDataset(Dataset):
     def __getitem__(self, idx):
         input_path, target_path = self.image_pairs[idx]
 
-        input_img = Image.open(input_path).convert('RGB')
+        # Load RGBD input (4 channels: RGB + Depth)
+        input_img = Image.open(input_path).convert('RGBA')
         target_img = Image.open(target_path).convert('RGB')
 
         if self.transform:
@@ -72,27 +78,8 @@ class ImagePairDataset(Dataset):
         return input_img, target_img
 
 
-class CoordConv2d(nn.Module):
-    """Conv2d that accepts coordinate input separate from spatial patches"""
-
-    def __init__(self, in_channels, out_channels, kernel_size, padding=0):
-        super().__init__()
-        self.conv_rgba = nn.Conv2d(in_channels, out_channels, kernel_size, padding=padding, bias=False)
-        self.coord_weights = nn.Parameter(torch.randn(out_channels, 2) * 0.01)
-        self.bias = nn.Parameter(torch.zeros(out_channels))
-
-    def forward(self, x, coords):
-        # x: [B, C, H, W] image
-        # coords: [B, 2, H, W] coordinate grid
-        out = self.conv_rgba(x)
-        B, C, H, W = out.shape
-        coord_contrib = torch.einsum('bchw,oc->bohw', coords, self.coord_weights)
-        out = out + coord_contrib + self.bias.view(1, -1, 1, 1)
-        return out
-
-
 class SimpleCNN(nn.Module):
-    """Simple CNN for image-to-image transformation"""
+    """CNN for RGBD→grayscale with 7-channel input (RGBD + UV + gray)"""
 
     def __init__(self, num_layers=1, kernel_sizes=None):
         super(SimpleCNN, self).__init__()
@@ -107,26 +94,46 @@ class SimpleCNN(nn.Module):
 
         for i, kernel_size in enumerate(kernel_sizes):
             padding = kernel_size // 2
-            if i == 0:
-                self.layers.append(CoordConv2d(3, 3, kernel_size, padding=padding))
+            if i < num_layers - 1:
+                # Inner layers: 7→4 (RGBD output)
+                self.layers.append(nn.Conv2d(7, 4, kernel_size=kernel_size, padding=padding, bias=True))
             else:
-                self.layers.append(nn.Conv2d(3, 3, kernel_size=kernel_size, padding=padding, bias=True))
+                # Final layer: 7→1 (grayscale output)
+                self.layers.append(nn.Conv2d(7, 1, kernel_size=kernel_size, padding=padding, bias=True))
 
     def forward(self, x):
+        """Map RGBD x [B,4,H,W] in [0,1] (D = 1/z) to clamped grayscale expanded to [B,3,H,W]."""
         B, C, H, W = x.shape
+
+        # Normalize RGBD to [-1,1]
+        x_norm = (x - 0.5) * 2.0
+
+        # Compute coordinates [0,1] then normalize to [-1,1]
         y_coords = torch.linspace(0, 1, H, device=x.device).view(1,1,H,1).expand(B,1,H,W)
         x_coords = torch.linspace(0, 1, W, device=x.device).view(1,1,1,W).expand(B,1,H,W)
-        coords = torch.cat([x_coords, y_coords], dim=1)
+        y_coords = (y_coords - 0.5) * 2.0  # [-1,1]
+        x_coords = (x_coords - 0.5) * 2.0  # [-1,1]
 
-        out = self.layers[0](x, coords)
-        out = torch.tanh(out)
+        # Compute grayscale from original RGB (Rec.709) and normalize to [-1,1]
+        gray = 0.2126*x[:,0:1] + 0.7152*x[:,1:2] + 0.0722*x[:,2:3]  # [B,1,H,W] in [0,1]
+        gray = (gray - 0.5) * 2.0  # [-1,1]
 
-        for i in range(1, len(self.layers)):
-            out = self.layers[i](out)
-            if i < len(self.layers) - 1:
-                out = torch.tanh(out)
+        # Hidden layers (all but the last): 7→4 + tanh, activations stay in [-1,1].
+        # Seeding with x_norm lets a 1-layer net (the default) feed its single
+        # 7→1 layer the 7-channel input it expects instead of its own output.
+        out = x_norm
 
-        return out
+        # Channels appended to every layer's input: u, v, gray
+        aux = torch.cat([x_coords, y_coords, gray], dim=1)  # [B,3,H,W]
+        for layer in self.layers[:-1]:
+            layer_input = torch.cat([out, aux], dim=1)  # [B,7,H,W]
+            out = torch.tanh(layer(layer_input))  # [B,4,H,W] in [-1,1]
+
+        # Final layer (grayscale output)
+        final_input = torch.cat([out, aux], dim=1)  # [B,7,H,W]
+        out = self.layers[-1](final_input)  # [B,1,H,W]
+        out = torch.clamp(out, 0.0, 1.0)  # Clip to [0,1]
+        return out.expand(-1, 3, -1, -1)
 
 
 def generate_layer_shader(output_path, num_layers, kernel_sizes):
@@ -163,37 +170,49 @@ def generate_layer_shader(output_path, num_layers, kernel_sizes):
         f.write("}\n\n")
         f.write("@fragment fn fs_main(@builtin(position) p: vec4<f32>) -> @location(0) vec4<f32> {\n")
         f.write("    let uv = p.xy / uniforms.resolution;\n")
-        f.write("    let input = textureSample(txt, smplr, uv);\n")
-        f.write("    let original = textureSample(original_input, smplr, uv);\n")
+        f.write("    let original_raw = textureSample(original_input, smplr, uv);\n")
+        f.write("    let original = (original_raw - 0.5) * 2.0;  // Normalize to [-1,1]\n")
         f.write("    var result = vec4<f32>(0.0);\n\n")
 
         # Generate layer switches
         for layer_idx in range(num_layers):
+            is_final = layer_idx == num_layers - 1
             ks = kernel_sizes[layer_idx]
+            conv_fn = f"cnn_conv{ks}x{ks}_7to4" if not is_final else f"cnn_conv{ks}x{ks}_7to1"
+
             if layer_idx == 0:
-                f.write(f"    // Layer 0 uses coordinate-aware convolution\n")
+                conv_fn_src = f"cnn_conv{ks}x{ks}_7to4_src"
+                f.write(f"    // Layer 0: 7→4 (RGBD output, normalizes [0,1] input)\n")
                 f.write(f"    if (params.layer_index == {layer_idx}) {{\n")
-                f.write(f"        result = cnn_conv{ks}x{ks}_with_coord(txt, smplr, uv, uniforms.resolution,\n")
-                f.write(f"                                        rgba_weights_layer{layer_idx}, coord_weights_layer{layer_idx}, bias_layer{layer_idx});\n")
+                f.write(f"        result = {conv_fn_src}(txt, smplr, uv, uniforms.resolution,\n")
+                f.write(f"                                         weights_layer{layer_idx});\n")
                 f.write(f"        result = cnn_tanh(result);\n")
                 f.write(f"    }}\n")
+            elif not is_final:
+                f.write(f"    else if (params.layer_index == {layer_idx}) {{\n")
+                f.write(f"        result = {conv_fn}(txt, smplr, uv, uniforms.resolution,\n")
+                f.write(f"                                   original, weights_layer{layer_idx});\n")
+                f.write(f"        result = cnn_tanh(result);  // Keep in [-1,1]\n")
+                f.write(f"    }}\n")
             else:
-                is_last = layer_idx == num_layers - 1
-                f.write(f"    {'else ' if layer_idx > 0 else ''}if (params.layer_index == {layer_idx}) {{\n")
-                f.write(f"        result = cnn_conv{ks}x{ks}(txt, smplr, uv, uniforms.resolution,\n")
-                f.write(f"                                   weights_layer{layer_idx}, bias_layer{layer_idx});\n")
-                if not is_last:
-                    f.write(f"        result = cnn_tanh(result);\n")
+                f.write(f"    else if (params.layer_index == {layer_idx}) {{\n")
+                f.write(f"        let gray_out = {conv_fn}(txt, smplr, uv, uniforms.resolution,\n")
+                f.write(f"                                         original, weights_layer{layer_idx});\n")
+                f.write(f"        // gray_out already in [0,1] from clipped training\n")
+                f.write(f"        let original_denorm = (original + 1.0) * 0.5;\n")
+                f.write(f"        result = vec4<f32>(gray_out, gray_out, gray_out, 1.0);\n")
+                f.write(f"        let blended = mix(original_denorm, result, params.blend_amount);\n")
+                f.write(f"        return blended;  // [0,1]\n")
                 f.write(f"    }}\n")
 
         # Add else clause for invalid layer index
-        if num_layers > 1:
+        if num_layers > 0:
             f.write(f"    else {{\n")
-            f.write(f"        result = input;\n")
+            f.write(f"        return textureSample(txt, smplr, uv);\n")
             f.write(f"    }}\n")
 
-        f.write("\n    // Blend with ORIGINAL input from layer 0\n")
-        f.write("    return mix(original, result, params.blend_amount);\n")
+        f.write("\n    // Non-final layers: denormalize for display\n")
+        f.write("    return (result + 1.0) * 0.5;  // [-1,1] → [0,1]\n")
         f.write("}\n")
 
 
@@ -204,95 +223,95 @@ def export_weights_to_wgsl(model, output_path, kernel_sizes):
         f.write("// Auto-generated CNN weights\n")
         f.write("// DO NOT EDIT - Generated by train_cnn.py\n\n")
 
-        layer_idx = 0
         for i, layer in enumerate(model.layers):
-            if isinstance(layer, CoordConv2d):
-                # Export RGBA weights
-                weights = layer.conv_rgba.weight.data.cpu().numpy()
-                kernel_size = kernel_sizes[layer_idx]
-                out_ch, in_ch, kh, kw = weights.shape
-                num_positions = kh * kw
+            weights = layer.weight.data.cpu().numpy()
+            bias = layer.bias.data.cpu().numpy()
+            out_ch, in_ch, kh, kw = weights.shape
+            num_positions = kh * kw
 
-                f.write(f"const rgba_weights_layer{layer_idx}: array<mat4x4<f32>, {num_positions}> = array(\n")
+            is_final = (i == len(model.layers) - 1)
+
+            if is_final:
+                # Final layer: 7→1, structure: array<array<f32, 8>, 9>
+                # [w0, w1, w2, w3, w4, w5, w6, bias]
+                f.write(f"const weights_layer{i}: array<array<f32, 8>, {num_positions}> = array(\n")
                 for pos in range(num_positions):
-                    row = pos // kw
-                    col = pos % kw
-                    f.write("  mat4x4<f32>(\n")
-                    for out_c in range(4):
-                        vals = []
-                        for in_c in range(4):
-                            if out_c < out_ch and in_c < in_ch:
-                                vals.append(f"{weights[out_c, in_c, row, col]:.6f}")
-                            else:
-                                vals.append("0.0")
-                        f.write(f"    {', '.join(vals)},\n")
-                    f.write("  )")
-                    if pos < num_positions - 1:
-                        f.write(",\n")
-                    else:
-                        f.write("\n")
+                    row, col = pos // kw, pos % kw
+                    vals = [f"{weights[0, in_c, row, col]:.6f}" for in_c in range(7)]
+                    vals.append(f"{bias[0]:.6f}")  # Append bias as 8th element
+                    f.write(f"  array<f32, 8>({', '.join(vals)})")
+                    f.write(",\n" if pos < num_positions-1 else "\n")
                 f.write(");\n\n")
-
-                # Export coordinate weights
-                coord_w = layer.coord_weights.data.cpu().numpy()
-                f.write(f"const coord_weights_layer{layer_idx} = mat2x4<f32>(\n")
-                for c in range(2):
-                    vals = []
+            else:
+                # Inner layers: 7→4, structure: array<array<f32, 8>, 36>
+                # Flattened: [pos0_ch0[7w+bias], pos0_ch1[7w+bias], ..., pos8_ch3[7w+bias]]
+                num_entries = num_positions * 4
+                f.write(f"const weights_layer{i}: array<array<f32, 8>, {num_entries}> = array(\n")
+                for pos in range(num_positions):
+                    row, col = pos // kw, pos % kw
                     for out_c in range(4):
-                        if out_c < coord_w.shape[0]:
-                            vals.append(f"{coord_w[out_c, c]:.6f}")
-                        else:
-                            vals.append("0.0")
-                    f.write(f"  {', '.join(vals)}")
-                    if c < 1:
-                        f.write(",\n")
-                    else:
-                        f.write("\n")
+                        vals = [f"{weights[out_c, in_c, row, col]:.6f}" for in_c in range(7)]
+                        vals.append(f"{bias[out_c]:.6f}")  # Append bias
+                        idx = pos * 4 + out_c
+                        f.write(f"  array<f32, 8>({', '.join(vals)})")
+                        f.write(",\n" if idx < num_entries-1 else "\n")
                 f.write(");\n\n")
 
-                # Export bias
-                bias = layer.bias.data.cpu().numpy()
-                bias_vals = [f"{bias[i]:.6f}" if i < len(bias) else "0.0" for i in range(4)]
-                f.write(f"const bias_layer{layer_idx} = vec4<f32>(")
-                f.write(", ".join(bias_vals))
-                f.write(");\n\n")
 
-                layer_idx += 1
-            elif isinstance(layer, nn.Conv2d):
-                # Standard conv layer
-                weights = layer.weight.data.cpu().numpy()
-                kernel_size = kernel_sizes[layer_idx]
-                out_ch, in_ch, kh, kw = weights.shape
-                num_positions = kh * kw
+def generate_conv_src_function(kernel_size, output_path):
+    """Append a cnn_conv{K}x{K}_7to4_src() WGSL function (layer 0) to output_path."""
 
-                f.write(f"const weights_layer{layer_idx}: array<mat4x4<f32>, {num_positions}> = array(\n")
-                for pos in range(num_positions):
-                    row = pos // kw
-                    col = pos % kw
-                    f.write("  mat4x4<f32>(\n")
-                    for out_c in range(4):
-                        vals = []
-                        for in_c in range(4):
-                            if out_c < out_ch and in_c < in_ch:
-                                vals.append(f"{weights[out_c, in_c, row, col]:.6f}")
-                            else:
-                                vals.append("0.0")
-                        f.write(f"    {', '.join(vals)},\n")
-                    f.write("  )")
-                    if pos < num_positions - 1:
-                        f.write(",\n")
-                    else:
-                        f.write("\n")
-                f.write(");\n\n")
+    k = kernel_size
+    num_positions = k * k
+    radius = k // 2
 
-                # Export bias
-                bias = layer.bias.data.cpu().numpy()
-                bias_vals = [f"{bias[i]:.6f}" if i < len(bias) else "0.0" for i in range(4)]
-                f.write(f"const bias_layer{layer_idx} = vec4<f32>(")
-                f.write(", ".join(bias_vals))
-                f.write(");\n\n")
+    with open(output_path, 'a') as f:
+        f.write(f"\n// Source layer: 7→4 channels (RGBD output)\n")
+        f.write(f"// Normalizes [0,1] input to [-1,1] internally\n")
+        f.write(f"fn cnn_conv{k}x{k}_7to4_src(\n")
+        f.write(f"  tex: texture_2d<f32>,\n")
+        f.write(f"  samp: sampler,\n")
+        f.write(f"  uv: vec2<f32>,\n")
+        f.write(f"  resolution: vec2<f32>,\n")
+        f.write(f"  weights: array<array<f32, 8>, {num_positions * 4}>\n")
+        f.write(f") -> vec4<f32> {{\n")
+        f.write(f"  let step = 1.0 / resolution;\n\n")
+
+        # Normalize center pixel for gray channel
+        f.write(f"  let original = (textureSample(tex, samp, uv) - 0.5) * 2.0;\n")
+        f.write(f"  let gray = 0.2126*original.r + 0.7152*original.g + 0.0722*original.b;\n")
+        f.write(f"  let uv_norm = (uv - 0.5) * 2.0;\n\n")
+
+        f.write(f"  var sum = vec4<f32>(0.0);\n")
+        f.write(f"  var pos = 0;\n\n")
+
+        # Convolution loop
+        f.write(f"  for (var dy = -{radius}; dy <= {radius}; dy++) {{\n")
+        f.write(f"    for (var dx = -{radius}; dx <= {radius}; dx++) {{\n")
+        f.write(f"      let offset = vec2<f32>(f32(dx), f32(dy)) * step;\n")
+        f.write(f"      let rgbd = (textureSample(tex, samp, uv + offset) - 0.5) * 2.0;\n\n")
+
+        # 7-channel input. NOTE(review): uv/gray are the centre pixel's at every tap - confirm vs training
+        f.write(f"      let inputs = array<f32, 7>(\n")
+        f.write(f"        rgbd.r, rgbd.g, rgbd.b, rgbd.a,\n")
+        f.write(f"        uv_norm.x, uv_norm.y, gray\n")
+        f.write(f"      );\n\n")
+
+        # Accumulate; slot 7 (bias) is added at the first tap only, since nn.Conv2d adds bias once
+        f.write(f"      for (var out_c = 0; out_c < 4; out_c++) {{\n")
+        f.write(f"        let idx = pos * 4 + out_c;\n")
+        f.write(f"        var channel_sum = select(0.0, weights[idx][7], pos == 0);\n")
+        f.write(f"        for (var in_c = 0; in_c < 7; in_c++) {{\n")
+        f.write(f"          channel_sum += weights[idx][in_c] * inputs[in_c];\n")
+        f.write(f"        }}\n")
+        f.write(f"        sum[out_c] += channel_sum;\n")
+        f.write(f"      }}\n")
+        f.write(f"      pos++;\n")
+        f.write(f"    }}\n")
+        f.write(f"  }}\n\n")
 
-                layer_idx += 1
+        f.write(f"  return sum;\n")
+        f.write(f"}}\n")
 
 
 def train(args):
@@ -382,6 +401,24 @@ def train(args):
     print(f"Generating layer shader to {shader_path}...")
     generate_layer_shader(shader_path, args.layers, kernel_sizes)
 
+    # Generate _src variants for kernel sizes (skip 3x3, already exists)
+    for ks in set(kernel_sizes):
+        if ks == 3:
+            continue
+        conv_path = os.path.join(shader_dir, f'cnn_conv{ks}x{ks}.wgsl')
+        if not os.path.exists(conv_path):
+            print(f"Warning: {conv_path} not found, skipping _src generation")
+            continue
+
+        # Check if _src already exists
+        with open(conv_path, 'r') as f:
+            content = f.read()
+            if f"cnn_conv{ks}x{ks}_7to4_src" in content:
+                continue
+
+        generate_conv_src_function(ks, conv_path)
+        print(f"Added _src variant to {conv_path}")
+
     print("Training complete!")
 
 
@@ -414,26 +451,94 @@ def export_from_checkpoint(checkpoint_path, output_path=None):
     print(f"Generating layer shader to {shader_path}...")
     generate_layer_shader(shader_path, num_layers, kernel_sizes)
 
+    # Generate _src variants for kernel sizes (skip 3x3, already exists)
+    for ks in set(kernel_sizes):
+        if ks == 3:
+            continue
+        conv_path = os.path.join(shader_dir, f'cnn_conv{ks}x{ks}.wgsl')
+        if not os.path.exists(conv_path):
+            print(f"Warning: {conv_path} not found, skipping _src generation")
+            continue
+
+        # Check if _src already exists
+        with open(conv_path, 'r') as f:
+            content = f.read()
+            if f"cnn_conv{ks}x{ks}_7to4_src" in content:
+                continue
+
+        generate_conv_src_function(ks, conv_path)
+        print(f"Added _src variant to {conv_path}")
+
     print("Export complete!")
 
 
+def infer_from_checkpoint(checkpoint_path, input_path, output_path):
+    """Run the checkpointed model on a single image and save the output (exits 1 on missing files)."""
+
+    if not os.path.exists(checkpoint_path):
+        print(f"Error: Checkpoint '{checkpoint_path}' not found")
+        sys.exit(1)
+
+    if not os.path.exists(input_path):
+        print(f"Error: Input image '{input_path}' not found")
+        sys.exit(1)
+
+    print(f"Loading checkpoint from {checkpoint_path}...")
+    checkpoint = torch.load(checkpoint_path, map_location='cpu')
+
+    # Rebuild the network from the hyper-parameters stored in the checkpoint
+    model = SimpleCNN(
+        num_layers=checkpoint['num_layers'],
+        kernel_sizes=checkpoint['kernel_sizes']
+    )
+    model.load_state_dict(checkpoint['model_state'])
+    model.eval()
+
+    # Load as RGBA; ToTensor yields values in [0,1]
+    print(f"Loading input image: {input_path}")
+    img = Image.open(input_path).convert('RGBA')
+    img_tensor = transforms.ToTensor()(img).unsqueeze(0)  # [1,4,H,W]
+
+    # Forward pass without gradient tracking
+    print("Running inference...")
+    with torch.no_grad():
+        out = model(img_tensor)  # presumably [1,C,H,W] in [0,1] — TODO confirm against SimpleCNN
+
+    # Save. dirname() is '' for a bare filename (e.g. the CLI default) and makedirs('') raises.
+    print(f"Saving output to: {output_path}")
+    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
+    transforms.ToPILImage()(out.squeeze(0)).save(output_path)
+    print("Done!")
+
+
 def main():
     parser = argparse.ArgumentParser(description='Train CNN for image-to-image transformation')
-    parser.add_argument('--input', help='Input image directory')
+    parser.add_argument('--input', help='Input image directory (training) or single image (inference)')
     parser.add_argument('--target', help='Target image directory')
     parser.add_argument('--layers', type=int, default=1, help='Number of CNN layers (default: 1)')
     parser.add_argument('--kernel_sizes', default='3', help='Comma-separated kernel sizes (default: 3)')
     parser.add_argument('--epochs', type=int, default=100, help='Number of training epochs (default: 100)')
     parser.add_argument('--batch_size', type=int, default=4, help='Batch size (default: 4)')
     parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning rate (default: 0.001)')
-    parser.add_argument('--output', help='Output WGSL file path (default: workspaces/main/shaders/cnn/cnn_weights_generated.wgsl)')
+    parser.add_argument('--output', help='Output path (WGSL for training/export, PNG for inference)')
     parser.add_argument('--checkpoint-every', type=int, default=0, help='Save checkpoint every N epochs (default: 0 = disabled)')
     parser.add_argument('--checkpoint-dir', help='Checkpoint directory (default: training/checkpoints)')
     parser.add_argument('--resume', help='Resume from checkpoint file')
     parser.add_argument('--export-only', help='Export WGSL from checkpoint without training')
+    parser.add_argument('--infer', help='Run inference on single image (requires --export-only for checkpoint)')
 
     args = parser.parse_args()
 
+    # Inference mode
+    if args.infer:
+        checkpoint = args.export_only
+        if not checkpoint:
+            print("Error: --infer requires --export-only <checkpoint>")
+            sys.exit(1)
+        output_path = args.output or 'inference_output.png'
+        infer_from_checkpoint(checkpoint, args.infer, output_path)
+        return
+
     # Export-only mode
     if args.export_only:
         export_from_checkpoint(args.export_only, args.output)
diff --git a/workspaces/main/assets.txt b/workspaces/main/assets.txt
index 53c8b3e..af8b9e9 100644
--- a/workspaces/main/assets.txt
+++ b/workspaces/main/assets.txt
@@ -67,3 +67,4 @@ SHADER_COMPUTE_GEN_MASK, NONE, shaders/compute/gen_mask.wgsl, "GPU Mask Composit
 CIRCLE_MASK_COMPUTE_SHADER, NONE, shaders/circle_mask_compute.wgsl, "Circle mask compute shader"
 CIRCLE_MASK_RENDER_SHADER, NONE, shaders/circle_mask_render.wgsl, "Circle mask render shader"
 MASKED_CUBE_SHADER, NONE, shaders/masked_cube.wgsl, "Masked cube shader"
+SHADER_SCENE1, NONE, shaders/scene1.wgsl, "Scene1 effect shader"
diff --git a/workspaces/main/shaders/cnn/cnn_conv3x3.wgsl b/workspaces/main/shaders/cnn/cnn_conv3x3.wgsl
index 168c9e2..96ddf5b 100644
--- a/workspaces/main/shaders/cnn/cnn_conv3x3.wgsl
+++ b/workspaces/main/shaders/cnn/cnn_conv3x3.wgsl
@@ -1,53 +1,148 @@
 // 3x3 convolution with weight indexing
-// Samples 9 pixels, applies mat4 weights per sample
 
-fn cnn_conv3x3(
+// Source (first) layer: 3x3 convolution, 7→4 channels (RGBD output)
+// Assumes 'tex' (the input) is *not* normalized to [-1,1], but is [0,1]; samples are remapped here
+// 'uv' is expected in [0,1] and is remapped to [-1,1] internally
+// weights: array<array<f32, 8>, 36> — row (pos*4 + out_c) = 7 input weights + bias; the bias is added once per position
+fn cnn_conv3x3_7to4_src(
   tex: texture_2d<f32>,
   samp: sampler,
   uv: vec2<f32>,
   resolution: vec2<f32>,
-  weights: array<mat4x4<f32>, 9>,
-  bias: vec4<f32>
+  weights: array<array<f32, 8>, 36>
 ) -> vec4<f32> {
   let step = 1.0 / resolution;
-  var sum = bias;
-  var idx = 0;
 
+  // Grayscale (Rec. 709 luma weights) of the center texel, after remapping it to [-1,1]
+  let original = (textureSample(tex, samp, uv) - 0.5) * 2.0;
+  let gray = 0.2126*original.r + 0.7152*original.g + 0.0722*original.b;
+
+  // Remap UV from [0,1] to [-1,1]
+  let uv_norm = (uv - 0.5) * 2.0;
+
+  var sum = vec4<f32>(0.0);
+
+  var pos = 0;
   for (var dy = -1; dy <= 1; dy++) {
     for (var dx = -1; dx <= 1; dx++) {
       let offset = vec2<f32>(f32(dx), f32(dy)) * step;
-      let sample = textureSample(tex, samp, uv + offset);
-      sum += weights[idx] * sample;
-      idx++;
+      let rgbd = (textureSample(tex, samp, uv + offset) - .5) * 2.0;  // remap [0,1] -> [-1,1]
+
+      // 7-channel input: [R,G,B,D, uv.x, uv.y, gray]; uv and gray are the same for every tap
+      let inputs = array<f32, 7>(
+        rgbd.r, rgbd.g, rgbd.b, rgbd.a,
+        uv_norm.x, uv_norm.y, gray
+      );
+
+      // Accumulate this tap's contribution to each output channel (RGBD)
+      for (var out_c = 0; out_c < 4; out_c++) {
+        let idx = pos * 4 + out_c;
+        var channel_sum = weights[idx][7];  // Bias (8th element), added for every tap
+        for (var in_c = 0; in_c < 7; in_c++) {
+          channel_sum += weights[idx][in_c] * inputs[in_c];
+        }
+        sum[out_c] += channel_sum;
+      }
+
+      pos++;
     }
   }
 
-  return sum;
+  return sum;  // Raw weighted sum (not clamped); the caller applies the activation
 }
 
-fn cnn_conv3x3_with_coord(
+// Inner layers: 3x3 convolution, 7→4 channels (RGBD output)
+// Assumes 'tex' and 'original' are already normalized to [-1,1] (not checked here)
+// 'uv' is expected in [0,1] and is remapped to [-1,1] internally
+// weights: array<array<f32, 8>, 36> — row (pos*4 + out_c) = 7 input weights + bias; the bias is added once per position
+fn cnn_conv3x3_7to4(
   tex: texture_2d<f32>,
   samp: sampler,
   uv: vec2<f32>,
   resolution: vec2<f32>,
-  rgba_weights: array<mat4x4<f32>, 9>,
-  coord_weights: mat2x4<f32>,
-  bias: vec4<f32>
+  original: vec4<f32>,
+  weights: array<array<f32, 8>, 36>
 ) -> vec4<f32> {
   let step = 1.0 / resolution;
-  var sum = bias;
 
-  sum += coord_weights * uv;
+  // Grayscale (Rec. 709 luma weights) of 'original', which is assumed to be in [-1,1]
+  let gray = 0.2126*original.r + 0.7152*original.g + 0.0722*original.b;
+
+  // Remap UV from [0,1] to [-1,1]
+  let uv_norm = (uv - 0.5) * 2.0;
+
+  var sum = vec4<f32>(0.0);
+
+  var pos = 0;
+  for (var dy = -1; dy <= 1; dy++) {
+    for (var dx = -1; dx <= 1; dx++) {
+      let offset = vec2<f32>(f32(dx), f32(dy)) * step;
+      let rgbd = textureSample(tex, samp, uv + offset);  // Assumed already in [-1,1]
+
+      // 7-channel input: [R,G,B,D, uv.x, uv.y, gray]; uv and gray are the same for every tap
+      let inputs = array<f32, 7>(
+        rgbd.r, rgbd.g, rgbd.b, rgbd.a,
+        uv_norm.x, uv_norm.y, gray
+      );
+
+      // Accumulate this tap's contribution to each output channel (RGBD)
+      for (var out_c = 0; out_c < 4; out_c++) {
+        let idx = pos * 4 + out_c;
+        var channel_sum = weights[idx][7];  // Bias (8th element), added for every tap
+        for (var in_c = 0; in_c < 7; in_c++) {
+          channel_sum += weights[idx][in_c] * inputs[in_c];
+        }
+        sum[out_c] += channel_sum;
+      }
+
+      pos++;
+    }
+  }
+
+  return sum;  // Raw weighted sum; no activation or clamp is applied here
+}
+
+// Final layer: 3x3 convolution, 7→1 channel (scalar output)
+// Assumes 'tex' and 'original' are already normalized to [-1,1] (not checked here)
+// 'uv' is expected in [0,1] and is remapped to [-1,1] internally
+// weights: array<array<f32, 8>, 9> — one row per position: 7 input weights + bias; the bias is added once per position
+fn cnn_conv3x3_7to1(
+  tex: texture_2d<f32>,
+  samp: sampler,
+  uv: vec2<f32>,
+  resolution: vec2<f32>,
+  original: vec4<f32>,
+  weights: array<array<f32, 8>, 9>
+) -> f32 {
+  let step = 1.0 / resolution;
+
+  // Grayscale (Rec. 709 luma weights) of 'original', which is assumed to be in [-1,1]
+  let gray = 0.2126*original.r + 0.7152*original.g + 0.0722*original.b;
+
+  // Remap UV from [0,1] to [-1,1]
+  let uv_norm = (uv - 0.5) * 2.0;
 
-  var idx = 0;
+  var sum = 0.0;
+
+  var pos = 0;
   for (var dy = -1; dy <= 1; dy++) {
     for (var dx = -1; dx <= 1; dx++) {
       let offset = vec2<f32>(f32(dx), f32(dy)) * step;
-      let rgba = textureSample(tex, samp, uv + offset);
-      sum += rgba_weights[idx] * rgba;
-      idx++;
+      let rgbd = textureSample(tex, samp, uv + offset);  // Assumed already in [-1,1]
+
+      // Unrolled dot product over the 7 input channels [R,G,B,D, uv.x, uv.y, gray]
+      sum += weights[pos][0] * rgbd.r;
+      sum += weights[pos][1] * rgbd.g;
+      sum += weights[pos][2] * rgbd.b;
+      sum += weights[pos][3] * rgbd.a;
+      sum += weights[pos][4] * uv_norm.x;
+      sum += weights[pos][5] * uv_norm.y;
+      sum += weights[pos][6] * gray;
+      sum += weights[pos][7];  // Bias, added for every tap
+
+      pos++;
     }
   }
 
-  return sum;
+  return sum;  // Raw weighted sum; no activation or clamp is applied here
 }
diff --git a/workspaces/main/shaders/cnn/cnn_conv5x5.wgsl b/workspaces/main/shaders/cnn/cnn_conv5x5.wgsl
index bd9abfa..5136740 100644
--- a/workspaces/main/shaders/cnn/cnn_conv5x5.wgsl
+++ b/workspaces/main/shaders/cnn/cnn_conv5x5.wgsl
@@ -1,53 +1,85 @@
-// 5x5 convolution with 25 samples
-// Applies mat4 weights per sample
-
-fn cnn_conv5x5(
+// 5×5 convolution, 7→4 channels (RGBD output); returns the raw sum (no activation applied here)
+// Assumes 'tex' and 'original' are already normalized to [-1,1] (not checked here)
+// 'uv' is expected in [0,1] and is remapped to [-1,1] internally
+// weights: array<array<f32, 8>, 100> — row (pos*4 + out_c) = 7 input weights + bias; the bias is added once per position
+fn cnn_conv5x5_7to4(
   tex: texture_2d<f32>,
   samp: sampler,
   uv: vec2<f32>,
   resolution: vec2<f32>,
-  weights: array<mat4x4<f32>, 25>,
-  bias: vec4<f32>
+  original: vec4<f32>,
+  weights: array<array<f32, 8>, 100>
 ) -> vec4<f32> {
   let step = 1.0 / resolution;
-  var sum = bias;
-  var idx = 0;
+  // Per-fragment inputs shared by all taps: luma of 'original' (Rec. 709 weights) and uv remapped to [-1,1]
+  let gray = 0.2126*original.r + 0.7152*original.g + 0.0722*original.b;
+  let uv_norm = (uv - 0.5) * 2.0;
+
+  var sum = vec4<f32>(0.0);
+  var pos = 0;
 
   for (var dy = -2; dy <= 2; dy++) {
     for (var dx = -2; dx <= 2; dx++) {
       let offset = vec2<f32>(f32(dx), f32(dy)) * step;
-      let sample = textureSample(tex, samp, uv + offset);
-      sum += weights[idx] * sample;
-      idx++;
+      let rgbd = textureSample(tex, samp, uv + offset);  // Assumed already in [-1,1]
+
+      let inputs = array<f32, 7>(
+        rgbd.r, rgbd.g, rgbd.b, rgbd.a,
+        uv_norm.x, uv_norm.y, gray
+      );
+
+      for (var out_c = 0; out_c < 4; out_c++) {
+        let idx = pos * 4 + out_c;
+        var channel_sum = weights[idx][7];  // Bias (8th element), added for every tap
+        for (var in_c = 0; in_c < 7; in_c++) {
+          channel_sum += weights[idx][in_c] * inputs[in_c];
+        }
+        sum[out_c] += channel_sum;
+      }
+      pos++;
     }
   }
 
   return sum;
 }
 
-fn cnn_conv5x5_with_coord(
+// 5×5 convolution, 7→1 channel (scalar output)
+// Assumes 'tex' and 'original' are already normalized to [-1,1] (not checked here)
+// 'uv' is expected in [0,1] and is remapped to [-1,1] internally
+// weights: array<array<f32, 8>, 25> — one row per position: 7 input weights + bias; the bias is added once per position
+fn cnn_conv5x5_7to1(
   tex: texture_2d<f32>,
   samp: sampler,
   uv: vec2<f32>,
   resolution: vec2<f32>,
-  rgba_weights: array<mat4x4<f32>, 25>,
-  coord_weights: mat2x4<f32>,
-  bias: vec4<f32>
-) -> vec4<f32> {
+  original: vec4<f32>,
+  weights: array<array<f32, 8>, 25>
+) -> f32 {
   let step = 1.0 / resolution;
-  var sum = bias;
 
-  sum += coord_weights * uv;
+  let gray = 0.2126*original.r + 0.7152*original.g + 0.0722*original.b;  // Rec. 709 luma weights
+  let uv_norm = (uv - 0.5) * 2.0;  // remap [0,1] -> [-1,1]
+
+  var sum = 0.0;
+  var pos = 0;
 
-  var idx = 0;
   for (var dy = -2; dy <= 2; dy++) {
     for (var dx = -2; dx <= 2; dx++) {
       let offset = vec2<f32>(f32(dx), f32(dy)) * step;
-      let rgba = textureSample(tex, samp, uv + offset);
-      sum += rgba_weights[idx] * rgba;
-      idx++;
+      let rgbd = textureSample(tex, samp, uv + offset);  // Assumed already in [-1,1]
+
+      sum += weights[pos][0] * rgbd.r;
+      sum += weights[pos][1] * rgbd.g;
+      sum += weights[pos][2] * rgbd.b;
+      sum += weights[pos][3] * rgbd.a;
+      sum += weights[pos][4] * uv_norm.x;
+      sum += weights[pos][5] * uv_norm.y;
+      sum += weights[pos][6] * gray;
+      sum += weights[pos][7];  // Bias, added for every tap
+
+      pos++;
     }
   }
 
-  return sum;
+  return sum;  // Raw weighted sum; no activation or clamp is applied here
 }
diff --git a/workspaces/main/shaders/cnn/cnn_layer.wgsl b/workspaces/main/shaders/cnn/cnn_layer.wgsl
index 5834f78..1b1b539 100644
--- a/workspaces/main/shaders/cnn/cnn_layer.wgsl
+++ b/workspaces/main/shaders/cnn/cnn_layer.wgsl
@@ -8,6 +8,7 @@
 #include "common_uniforms"
 #include "cnn_activation"
 #include "cnn_conv3x3"
+#include "cnn_conv5x5"
 #include "cnn_weights_generated"
 
 struct CNNLayerParams {
@@ -29,28 +30,28 @@ struct CNNLayerParams {
 
 @fragment fn fs_main(@builtin(position) p: vec4<f32>) -> @location(0) vec4<f32> {
     let uv = p.xy / uniforms.resolution;
-    let input = textureSample(txt, smplr, uv);
-    let original = textureSample(original_input, smplr, uv);
+    let original = (textureSample(original_input, smplr, uv) - 0.5) * 2.0;  // Remap [0,1] -> [-1,1]
     var result = vec4<f32>(0.0);
 
-    // Layer 0 uses coordinate-aware convolution
+    // Layer 0: 7→4 (RGBD output); the _src variant remaps its [0,1] input texture itself
     if (params.layer_index == 0) {
-        result = cnn_conv3x3_with_coord(txt, smplr, uv, uniforms.resolution,
-                                        rgba_weights_layer0, coord_weights_layer0, bias_layer0);
-        result = cnn_tanh(result);
+        result = cnn_conv3x3_7to4_src(txt, smplr, uv, uniforms.resolution, weights_layer0);
+        result = cnn_tanh(result);  // Keep in [-1,1]
     }
     else if (params.layer_index == 1) {
-        result = cnn_conv3x3(txt, smplr, uv, uniforms.resolution,
-                                   weights_layer1, bias_layer1);
-        result = cnn_tanh(result);
-    }
-    else if (params.layer_index == 2) {
-        result = cnn_conv3x3(txt, smplr, uv, uniforms.resolution,
-                                   weights_layer2, bias_layer2);
-    }
-    else {
-        result = input;
+        result = cnn_conv5x5_7to4(txt, smplr, uv, uniforms.resolution,
+                                   original, weights_layer1);
+        result = cnn_tanh(result);  // Keep in [-1,1]
     }
+    else if (params.layer_index == 2) {  // last layer
+        let gray_out = cnn_conv3x3_7to1(txt, smplr, uv, uniforms.resolution,
+                                        original, weights_layer2);
 
-    return mix(original, result, params.blend_amount);
+        // At this point, 'gray_out' is what the training script should have learned.
+        // The code below only prepares the visual output and is excluded from training:
+        result = vec4<f32>(gray_out, gray_out, gray_out, 1.0);  // Gray replicated to RGB (normalized domain, not clamped)
+        let blended = mix(original, result, params.blend_amount);
+        return (blended + 1.0) * 0.5;  // Denormalize to [0,1] for display
+    }
+    return result;  // Layers 0/1: tanh output in [-1,1]; any other index (except 2) returns the initial vec4(0.0)
 }
diff --git a/workspaces/main/shaders/cnn/cnn_weights_generated.wgsl b/workspaces/main/shaders/cnn/cnn_weights_generated.wgsl
index 6052ac5..e38669f 100644
--- a/workspaces/main/shaders/cnn/cnn_weights_generated.wgsl
+++ b/workspaces/main/shaders/cnn/cnn_weights_generated.wgsl
@@ -1,185 +1,157 @@
 // Auto-generated CNN weights
 // DO NOT EDIT - Generated by train_cnn.py
 
-const rgba_weights_layer0: array<mat4x4<f32>, 9> = array(
-  mat4x4<f32>(
-    -0.181929, -0.244329, -0.354404, 0.0,
-    -0.291597, -0.195653, 0.081896, 0.0,
-    0.081595, 0.164081, -0.236318, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    0.731888, 0.717648, 0.524081, 0.0,
-    -0.029760, -0.208000, 0.008438, 0.0,
-    0.442082, 0.354681, 0.049288, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    -0.623141, -0.695759, -0.087885, 0.0,
-    0.043135, 0.071979, 0.213065, 0.0,
-    0.011581, 0.110995, 0.034100, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    0.170016, 0.188298, 0.134083, 0.0,
-    -0.222954, -0.088011, 0.015668, 0.0,
-    0.921836, 0.437158, 0.061577, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    1.431940, 1.148113, 1.238067, 0.0,
-    -0.212535, 0.366860, 0.320956, 0.0,
-    0.771192, 0.765570, 0.029189, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    0.171088, 0.000155, 0.212552, 0.0,
-    0.029536, 0.447892, 0.041381, 0.0,
-    0.011807, -0.167281, -0.200702, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    -0.668151, -0.813927, -0.132108, 0.0,
-    -0.156250, 0.179112, -0.069585, 0.0,
-    0.403347, 0.482877, 0.182611, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    -0.609871, -0.768480, -0.590538, 0.0,
-    -0.171854, 0.150167, 0.105694, 0.0,
-    -0.059052, 0.066999, -0.244222, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    -0.112983, -0.066299, 0.117696, 0.0,
-    -0.172541, 0.095008, -0.160754, 0.0,
-    -0.369667, -0.000628, 0.163602, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  )
+const weights_layer0: array<array<f32, 8>, 36> = array(
+  array<f32, 8>(0.074911, 0.143202, 0.086903, 0.070680, -0.031904, 0.122884, 0.191824, 0.071112),
+  array<f32, 8>(0.081964, 0.033505, 0.058371, -0.015971, -0.069732, -0.014956, 0.142861, 0.119666),
+  array<f32, 8>(0.231883, -0.160763, -0.147218, 0.161321, -0.031718, -0.065766, 0.093359, 0.171734),
+  array<f32, 8>(0.082047, 0.288492, 0.121087, 0.001740, -0.104745, -0.071150, 0.031105, 0.037989),
+  array<f32, 8>(0.139236, 0.160690, 0.022091, 0.070994, 0.008793, 0.059247, 0.215077, 0.071112),
+  array<f32, 8>(0.128842, 0.268017, -0.031546, 0.068152, -0.073793, 0.124100, 0.252295, 0.119666),
+  array<f32, 8>(0.077193, -0.080009, -0.160674, 0.101131, -0.152167, -0.035271, 0.067397, 0.171734),
+  array<f32, 8>(-0.073119, 0.204309, 0.005654, 0.101254, -0.063530, -0.040801, 0.213393, 0.037989),
+  array<f32, 8>(-0.024175, 0.018739, 0.095518, 0.096945, 0.088315, 0.079085, -0.069127, 0.071112),
+  array<f32, 8>(0.219014, 0.218505, 0.014228, 0.014379, 0.075954, -0.001065, 0.201142, 0.119666),
+  array<f32, 8>(0.182743, -0.041270, -0.085458, 0.092904, 0.020316, 0.036077, 0.020220, 0.171734),
+  array<f32, 8>(-0.210247, -0.072180, 0.017628, 0.084834, 0.050409, -0.067274, -0.130565, 0.037989),
+  array<f32, 8>(0.071649, -0.072076, -0.109385, -0.012436, 0.041505, -0.013451, -0.068780, 0.071112),
+  array<f32, 8>(0.083389, 0.133852, -0.018137, 0.086250, -0.006205, 0.052853, 0.137369, 0.119666),
+  array<f32, 8>(0.023275, 0.036871, -0.092898, -0.059569, -0.029758, -0.089218, -0.031705, 0.171734),
+  array<f32, 8>(0.054874, 0.290596, 0.157026, -0.127200, 0.054010, -0.163627, 0.185273, 0.037989),
+  array<f32, 8>(0.069455, -0.122527, 0.010922, -0.051404, -0.067941, 0.122001, 0.034784, 0.071112),
+  array<f32, 8>(0.263187, 0.346644, 0.094376, 0.080049, -0.013980, -0.020629, 0.287019, 0.119666),
+  array<f32, 8>(0.078601, -0.045813, 0.048391, 0.107248, -0.001537, 0.003619, 0.040853, 0.171734),
+  array<f32, 8>(-0.052910, 0.333324, -0.028273, 0.111413, 0.059925, 0.054957, 0.257592, 0.037989),
+  array<f32, 8>(0.037894, 0.001266, 0.039858, 0.027731, 0.156182, 0.094188, 0.021791, 0.071112),
+  array<f32, 8>(0.220401, 0.241493, 0.138405, 0.082160, 0.144517, -0.050410, 0.257101, 0.119666),
+  array<f32, 8>(0.055409, -0.103410, 0.049778, -0.023193, -0.116368, -0.085046, 0.047003, 0.171734),
+  array<f32, 8>(0.019721, 0.099621, 0.005697, -0.069641, -0.100712, 0.044279, -0.104894, 0.037989),
+  array<f32, 8>(0.132833, 0.144224, 0.075612, -0.052095, -0.027924, 0.029124, -0.012077, 0.071112),
+  array<f32, 8>(0.146387, 0.098381, 0.131536, 0.034274, -0.073611, 0.080596, 0.124333, 0.119666),
+  array<f32, 8>(0.118243, -0.165692, -0.091107, 0.001822, 0.003771, -0.053877, -0.045592, 0.171734),
+  array<f32, 8>(-0.146034, 0.167379, 0.036433, -0.074485, 0.047772, 0.007719, -0.057026, 0.037989),
+  array<f32, 8>(-0.105517, -0.143677, 0.006013, 0.038752, 0.082525, -0.070290, -0.082964, 0.071112),
+  array<f32, 8>(0.084325, 0.192342, 0.005734, 0.083787, 0.010618, 0.076732, 0.206159, 0.119666),
+  array<f32, 8>(0.025873, -0.002030, -0.008453, 0.189578, 0.077363, 0.014099, 0.086760, 0.171734),
+  array<f32, 8>(-0.040145, 0.209639, 0.131112, 0.021154, -0.046391, -0.055185, 0.110424, 0.037989),
+  array<f32, 8>(-0.091272, -0.149872, -0.018825, 0.109157, 0.037674, -0.067088, -0.199940, 0.071112),
+  array<f32, 8>(0.170814, 0.171591, -0.039657, 0.146638, -0.054918, -0.043451, 0.262821, 0.119666),
+  array<f32, 8>(0.183810, -0.147660, -0.144689, 0.045301, 0.055273, 0.017425, 0.136362, 0.171734),
+  array<f32, 8>(-0.078196, 0.116630, -0.138657, -0.140199, -0.052198, -0.040295, -0.093252, 0.037989)
 );
 
-const coord_weights_layer0 = mat2x4<f32>(
-  0.059076, -0.026617, -0.005155, 0.0,
-  0.135407, -0.090329, 0.058216, 0.0
+const weights_layer1: array<array<f32, 8>, 100> = array(
+  array<f32, 8>(0.016434, 0.032528, 0.014184, -0.048073, 0.017625, 0.025898, 0.035017, -0.024743),
+  array<f32, 8>(-0.086195, 0.041682, 0.071182, -0.062423, -0.016809, -0.004450, -0.035094, 0.087283),
+  array<f32, 8>(-0.070627, 0.033625, 0.025104, -0.086014, -0.037153, -0.019897, 0.046995, -0.025117),
+  array<f32, 8>(-0.042999, 0.043590, -0.107547, 0.114733, -0.006566, 0.067189, 0.042215, -0.019137),
+  array<f32, 8>(-0.105321, 0.188555, -0.033070, 0.005478, -0.019701, -0.006125, -0.006207, -0.024743),
+  array<f32, 8>(-0.018644, 0.021491, 0.042713, 0.047051, 0.009930, -0.074932, 0.016032, 0.087283),
+  array<f32, 8>(-0.036977, 0.022728, -0.031943, -0.134525, -0.024105, 0.022550, 0.038872, -0.025117),
+  array<f32, 8>(-0.017196, 0.102869, -0.028182, 0.153466, -0.024727, 0.008610, -0.029993, -0.019137),
+  array<f32, 8>(-0.135262, 0.264086, 0.052894, 0.104268, -0.044918, 0.085902, 0.119113, -0.024743),
+  array<f32, 8>(0.052648, 0.081481, 0.063582, 0.016832, 0.100333, -0.095727, 0.022089, 0.087283),
+  array<f32, 8>(0.028176, 0.006417, -0.010806, -0.049843, 0.010670, 0.058400, 0.051595, -0.025117),
+  array<f32, 8>(-0.078976, 0.040644, -0.116569, 0.145770, 0.019023, 0.071229, 0.056151, -0.019137),
+  array<f32, 8>(-0.028693, 0.154285, -0.019369, 0.111634, 0.022241, -0.015484, 0.039056, -0.024743),
+  array<f32, 8>(-0.052688, -0.046999, -0.000280, -0.024856, 0.012262, 0.028524, -0.028633, 0.087283),
+  array<f32, 8>(-0.004525, 0.052883, 0.002108, -0.096774, 0.052697, -0.055029, -0.022623, -0.025117),
+  array<f32, 8>(-0.076488, 0.013246, -0.097773, 0.023400, 0.027572, 0.041318, 0.012556, -0.019137),
+  array<f32, 8>(0.028093, 0.007624, 0.021861, -0.079392, 0.053487, 0.065200, -0.084020, -0.024743),
+  array<f32, 8>(-0.027503, 0.010973, 0.077242, 0.105956, 0.003837, -0.032827, 0.062214, 0.087283),
+  array<f32, 8>(0.028159, 0.036260, 0.051032, -0.057339, -0.032511, -0.019800, -0.113611, -0.025117),
+  array<f32, 8>(-0.004438, 0.024692, -0.151404, 0.097579, -0.031042, 0.067771, -0.062624, -0.019137),
+  array<f32, 8>(-0.053284, 0.062195, 0.018403, -0.145339, 0.008091, -0.048359, 0.060338, -0.024743),
+  array<f32, 8>(0.035264, 0.022147, 0.014877, -0.010450, 0.048411, -0.011475, -0.025409, 0.087283),
+  array<f32, 8>(-0.095181, 0.095906, 0.022414, -0.068326, -0.035929, 0.041247, -0.066456, -0.025117),
+  array<f32, 8>(0.011500, 0.097427, -0.072423, 0.068691, 0.006129, 0.025585, -0.066149, -0.019137),
+  array<f32, 8>(0.000253, 0.207033, 0.041903, -0.018208, 0.080300, 0.029738, 0.170740, -0.024743),
+  array<f32, 8>(0.118473, -0.002532, 0.082055, 0.029355, -0.017353, -0.094582, -0.028445, 0.087283),
+  array<f32, 8>(-0.167765, 0.166992, -0.051393, 0.018985, 0.000246, -0.060339, -0.036368, -0.025117),
+  array<f32, 8>(-0.037902, 0.123576, -0.135429, 0.018780, 0.069222, -0.048750, 0.010303, -0.019137),
+  array<f32, 8>(0.092400, 0.317862, 0.056507, 0.269526, 0.015330, -0.078774, 0.213070, -0.024743),
+  array<f32, 8>(0.147994, -0.056838, -0.046159, 0.069406, -0.025076, -0.018648, 0.019698, 0.087283),
+  array<f32, 8>(-0.063516, 0.051390, -0.043280, 0.053602, 0.046148, 0.032013, -0.012079, -0.025117),
+  array<f32, 8>(-0.069387, 0.008554, -0.016392, 0.041428, 0.069626, -0.028865, 0.031068, -0.019137),
+  array<f32, 8>(0.001597, 0.092924, 0.064679, 0.242996, 0.070280, -0.047444, 0.155082, -0.024743),
+  array<f32, 8>(0.003761, -0.067148, 0.020808, -0.009994, 0.064026, -0.023521, -0.061335, 0.087283),
+  array<f32, 8>(0.013300, 0.048670, -0.058611, -0.104133, 0.060389, 0.022588, -0.085768, -0.025117),
+  array<f32, 8>(0.001996, 0.035599, -0.067395, 0.113355, -0.054467, 0.021354, -0.020545, -0.019137),
+  array<f32, 8>(0.024443, 0.016439, 0.095606, -0.006610, 0.056457, 0.009034, 0.048181, -0.024743),
+  array<f32, 8>(-0.081707, 0.089380, 0.012570, 0.040154, 0.006970, -0.097259, -0.003088, 0.087283),
+  array<f32, 8>(0.037347, -0.012520, -0.009110, -0.164514, -0.052337, 0.031441, -0.117828, -0.025117),
+  array<f32, 8>(-0.050695, 0.023007, -0.086370, 0.106721, -0.022698, -0.063039, 0.007639, -0.019137),
+  array<f32, 8>(-0.032690, 0.100637, 0.090612, -0.170336, -0.013709, 0.096891, -0.064632, -0.024743),
+  array<f32, 8>(0.005479, 0.068678, -0.014147, -0.117601, 0.033542, -0.026603, -0.034334, 0.087283),
+  array<f32, 8>(-0.049645, 0.161140, 0.019592, -0.020424, 0.021700, 0.046387, 0.070111, -0.025117),
+  array<f32, 8>(-0.075219, -0.030338, -0.042611, 0.045346, -0.012298, -0.029272, -0.048395, -0.019137),
+  array<f32, 8>(0.110303, 0.091954, 0.026566, -0.013034, -0.001918, 0.025677, -0.003027, -0.024743),
+  array<f32, 8>(0.084352, 0.004527, 0.042981, 0.040333, 0.011019, 0.011699, 0.053396, 0.087283),
+  array<f32, 8>(-0.151306, 0.282692, 0.038388, 0.199704, -0.024410, -0.021070, 0.135509, -0.025117),
+  array<f32, 8>(0.008868, 0.058833, -0.035204, 0.017617, 0.036727, -0.084137, 0.008426, -0.019137),
+  array<f32, 8>(0.111690, 0.202555, 0.002230, 0.104773, 0.043414, 0.094714, 0.024386, -0.024743),
+  array<f32, 8>(0.109470, -0.130369, -0.049615, 0.027567, 0.015618, 0.010219, -0.035927, 0.087283),
+  array<f32, 8>(0.013092, 0.191465, -0.022463, 0.306655, 0.046994, 0.023051, 0.114596, -0.025117),
+  array<f32, 8>(-0.095580, 0.067644, -0.069810, 0.058185, 0.079298, 0.042359, 0.102818, -0.019137),
+  array<f32, 8>(0.163902, 0.060505, 0.020250, 0.151637, -0.041346, 0.079968, -0.066609, -0.024743),
+  array<f32, 8>(0.007401, -0.119463, 0.029195, -0.118251, -0.057537, 0.057136, -0.162722, 0.087283),
+  array<f32, 8>(-0.036401, 0.152383, -0.049404, 0.188484, 0.069434, -0.056077, -0.041920, -0.025117),
+  array<f32, 8>(-0.070811, 0.042628, -0.080224, 0.133910, 0.054912, -0.086587, 0.104432, -0.019137),
+  array<f32, 8>(0.045319, 0.031249, -0.007304, -0.008136, 0.001678, 0.019408, -0.016683, -0.024743),
+  array<f32, 8>(-0.054316, -0.005207, -0.003794, -0.009173, -0.015797, 0.088869, -0.054766, 0.087283),
+  array<f32, 8>(0.036646, 0.049626, -0.038869, -0.049720, 0.012847, -0.054911, -0.012426, -0.025117),
+  array<f32, 8>(-0.002965, 0.087409, -0.027885, 0.089920, 0.013074, -0.106163, 0.065504, -0.019137),
+  array<f32, 8>(-0.004488, 0.102517, 0.092916, -0.079512, 0.001532, -0.048995, -0.041429, -0.024743),
+  array<f32, 8>(-0.062161, -0.027813, 0.037159, -0.030745, -0.017068, 0.084630, -0.046134, 0.087283),
+  array<f32, 8>(-0.017315, 0.191771, -0.050660, -0.140278, 0.038320, 0.037753, -0.043447, -0.025117),
+  array<f32, 8>(-0.079621, 0.091290, -0.098575, 0.055638, 0.007634, -0.051456, -0.011530, -0.019137),
+  array<f32, 8>(-0.044260, 0.010435, 0.104869, -0.029082, 0.038487, 0.004167, 0.020321, -0.024743),
+  array<f32, 8>(0.004107, -0.049898, -0.011912, 0.126974, 0.074958, 0.038876, 0.027066, 0.087283),
+  array<f32, 8>(0.022312, 0.332216, -0.028889, 0.171475, 0.052267, -0.023821, 0.193472, -0.025117),
+  array<f32, 8>(0.009104, -0.027289, -0.016718, 0.092231, 0.023904, -0.034162, 0.004693, -0.019137),
+  array<f32, 8>(0.022922, -0.036846, 0.071670, -0.118853, -0.046374, 0.005972, -0.079006, -0.024743),
+  array<f32, 8>(-0.086613, -0.033065, 0.032719, 0.081925, -0.025818, -0.065103, 0.010425, 0.087283),
+  array<f32, 8>(0.014945, 0.330249, -0.062079, 0.408858, 0.044895, -0.036703, 0.195226, -0.025117),
+  array<f32, 8>(0.021647, 0.086135, -0.013491, 0.027627, -0.033652, -0.016643, -0.037425, -0.019137),
+  array<f32, 8>(-0.028124, 0.039691, 0.108537, -0.123861, -0.071841, -0.034232, 0.009737, -0.024743),
+  array<f32, 8>(-0.095938, -0.080740, 0.047554, -0.145590, -0.041365, 0.031658, -0.027601, 0.087283),
+  array<f32, 8>(-0.050837, 0.179578, 0.020990, 0.240896, -0.038067, 0.007052, 0.036244, -0.025117),
+  array<f32, 8>(-0.100474, 0.012669, -0.123589, 0.147449, -0.056871, 0.029335, -0.041989, -0.019137),
+  array<f32, 8>(0.000809, 0.020182, 0.123381, 0.009990, 0.061892, -0.056804, 0.049866, -0.024743),
+  array<f32, 8>(-0.006123, 0.085572, -0.065080, -0.003607, -0.100605, -0.015746, 0.045932, 0.087283),
+  array<f32, 8>(-0.068945, 0.037700, -0.068738, 0.088604, 0.034364, -0.027429, -0.023157, -0.025117),
+  array<f32, 8>(-0.028689, 0.018089, -0.144344, 0.097751, -0.022261, 0.004934, 0.044538, -0.019137),
+  array<f32, 8>(-0.072695, 0.099329, 0.037965, -0.007148, -0.061809, -0.014461, -0.050644, -0.024743),
+  array<f32, 8>(-0.043364, -0.019908, 0.033602, -0.011686, -0.046646, -0.005387, 0.057703, 0.087283),
+  array<f32, 8>(0.020640, 0.058992, 0.042389, -0.111803, -0.000105, -0.069637, -0.058816, -0.025117),
+  array<f32, 8>(-0.090411, -0.034394, -0.135574, 0.085031, -0.020320, -0.002235, 0.079036, -0.019137),
+  array<f32, 8>(-0.035238, 0.052656, 0.011918, -0.032684, 0.067555, -0.047663, -0.013151, -0.024743),
+  array<f32, 8>(0.077223, 0.067583, -0.053024, 0.063017, -0.023909, -0.041936, 0.039041, 0.087283),
+  array<f32, 8>(-0.011154, 0.253355, 0.006886, 0.066990, -0.018613, -0.033851, 0.022408, -0.025117),
+  array<f32, 8>(-0.042376, 0.097067, -0.107170, 0.053378, 0.081423, -0.059980, -0.019982, -0.019137),
+  array<f32, 8>(-0.086462, 0.042703, 0.052655, -0.129460, -0.073930, -0.004732, -0.089001, -0.024743),
+  array<f32, 8>(0.019294, 0.036932, -0.046783, 0.172396, -0.003345, 0.029704, -0.013067, 0.087283),
+  array<f32, 8>(0.142370, 0.248269, -0.072705, 0.188676, 0.028917, -0.058974, -0.007950, -0.025117),
+  array<f32, 8>(-0.021378, 0.064055, -0.103605, -0.015491, -0.002155, -0.048161, -0.045529, -0.019137),
+  array<f32, 8>(0.006191, 0.063159, 0.005143, -0.101334, -0.020484, 0.038330, 0.010742, -0.024743),
+  array<f32, 8>(-0.123413, 0.027806, -0.063111, 0.060050, -0.087346, 0.080827, 0.016499, 0.087283),
+  array<f32, 8>(0.054552, 0.047349, 0.029259, 0.152502, -0.013689, -0.035447, -0.006584, -0.025117),
+  array<f32, 8>(-0.034984, 0.059972, -0.147872, 0.096835, 0.055766, -0.001973, -0.033631, -0.019137),
+  array<f32, 8>(0.004488, -0.060204, 0.120817, -0.095007, 0.040546, 0.026207, -0.011824, -0.024743),
+  array<f32, 8>(0.000380, 0.102988, 0.010112, -0.011668, 0.004855, -0.019988, -0.035633, 0.087283),
+  array<f32, 8>(0.003894, -0.083172, -0.046051, -0.005485, 0.017347, -0.057191, -0.085077, -0.025117),
+  array<f32, 8>(-0.066185, 0.092341, -0.135679, 0.009092, -0.015954, 0.003226, -0.010182, -0.019137)
 );
 
-const bias_layer0 = vec4<f32>(-0.526177, -0.569862, -1.370040, 0.0);
-
-const weights_layer1: array<mat4x4<f32>, 9> = array(
-  mat4x4<f32>(
-    0.180029, -1.107249, 0.570741, 0.0,
-    -0.098536, 0.079545, -0.083257, 0.0,
-    -0.020066, 0.333084, 0.039506, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    3.068946, -1.783570, -0.550517, 0.0,
-    -0.296369, -0.080958, 0.040260, 0.0,
-    -0.093713, -0.212577, -0.110011, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    2.282564, -0.538192, -0.793214, 0.0,
-    -0.395788, 0.130881, 0.078571, 0.0,
-    -0.041375, 0.061666, 0.045651, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    -0.267284, -1.971639, -0.099616, 0.0,
-    -0.084432, 0.139794, 0.007091, 0.0,
-    -0.103042, -0.104340, 0.067299, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    -5.233469, -2.252747, -3.555217, 0.0,
-    0.647940, -0.178858, 0.351633, 0.0,
-    -0.014237, -0.505881, 0.165940, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    -0.121700, -0.677386, -2.435040, 0.0,
-    0.084806, -0.028000, 0.380387, 0.0,
-    -0.020906, -0.279161, 0.041915, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    2.982562, -0.298441, -0.147775, 0.0,
-    -0.291832, 0.102875, -0.128590, 0.0,
-    -0.091786, 0.104389, -0.188678, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    -4.434978, -0.261830, -2.436411, 0.0,
-    0.349188, -0.245908, 0.272592, 0.0,
-    0.010322, -0.148525, -0.031531, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    0.129886, 1.516168, -0.755576, 0.0,
-    0.133138, -0.260276, 0.028059, 0.0,
-    0.001185, 0.141547, -0.003606, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  )
+const weights_layer2: array<array<f32, 8>, 9> = array(  // 9 rows x 8 floats; the 8th value is 0.018248 in every row - NOTE(review): looks like a packed bias, confirm with the weight exporter
+  array<f32, 8>(0.071600, -0.118269, 0.093769, 0.096974, -0.002193, -0.065924, -0.125094, 0.018248),
+  array<f32, 8>(-0.089131, -0.053007, 0.150626, -0.051485, 0.087371, -0.078030, -0.045468, 0.018248),
+  array<f32, 8>(0.042144, 0.146191, 0.152445, 0.028572, 0.064491, -0.061860, 0.037828, 0.018248),
+  array<f32, 8>(-0.084747, -0.133062, -0.030736, 0.061174, -0.055809, -0.012031, 0.126923, 0.018248),
+  array<f32, 8>(-0.017155, -0.105189, 0.003457, 0.105491, 0.003587, 0.089110, -0.001623, 0.018248),
+  array<f32, 8>(-0.028012, -0.066691, 0.125358, -0.027705, 0.032134, 0.044475, -0.036991, 0.018248),
+  array<f32, 8>(0.094536, -0.038367, -0.009421, 0.027049, -0.103427, -0.065209, -0.110071, 0.018248),
+  array<f32, 8>(0.147956, 0.028446, 0.031066, 0.055667, -0.039952, 0.069251, 0.020060, 0.018248),
+  array<f32, 8>(0.067507, 0.154407, -0.017526, 0.064009, -0.014328, 0.022175, 0.015376, 0.018248)
 );
 
-const bias_layer1 = vec4<f32>(1.367986, -1.148709, -0.650040, 0.0);
-
-const weights_layer2: array<mat4x4<f32>, 9> = array(
-  mat4x4<f32>(
-    -0.137003, -0.289376, 0.625000, 0.0,
-    -0.120120, -0.238968, 0.448432, 0.0,
-    -0.142094, -0.253706, 0.458181, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    -0.337017, -0.757585, 0.135953, 0.0,
-    -0.304432, -0.553491, 0.419907, 0.0,
-    -0.313585, -0.467667, 0.615326, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    -0.161089, -0.328735, 0.612679, 0.0,
-    -0.137144, -0.172882, 0.176362, 0.0,
-    -0.153195, -0.061571, 0.173977, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    -0.227814, -0.544193, -0.564658, 0.0,
-    -0.211743, -0.430586, 0.080349, 0.0,
-    -0.214442, -0.417501, 0.880266, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    -0.435370, -0.295169, -0.865976, 0.0,
-    -0.423147, -0.274780, 0.323049, 0.0,
-    -0.411180, -0.062517, 1.099769, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    -0.199573, -0.488030, -0.396440, 0.0,
-    -0.187844, -0.360516, -0.156646, 0.0,
-    -0.188681, -0.292304, -0.134645, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    -0.123218, -0.287990, 0.154656, 0.0,
-    -0.112954, -0.282778, 0.498742, 0.0,
-    -0.139083, -0.319337, 1.112621, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    -0.267477, -0.691374, -0.028960, 0.0,
-    -0.246348, -0.585583, 0.401194, 0.0,
-    -0.253279, -0.562875, 1.105818, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  ),
-  mat4x4<f32>(
-    -0.083133, -0.131627, 0.460039, 0.0,
-    -0.071126, -0.108601, 0.163545, 0.0,
-    -0.092579, -0.110020, 0.131282, 0.0,
-    0.0, 0.0, 0.0, 0.0,
-  )
-);
-
-const bias_layer2 = vec4<f32>(-1.805686, -0.798340, 0.462318, 0.0);
-
diff --git a/workspaces/main/shaders/scene1.wgsl b/workspaces/main/shaders/scene1.wgsl
new file mode 100644
index 0000000..7af3811
--- /dev/null
+++ b/workspaces/main/shaders/scene1.wgsl
@@ -0,0 +1,258 @@
+// Scene1 effect shader - ShaderToy conversion (raymarching cube & sphere)
+// Source: Saturday cubism experiment by skal
+
+#include "common_uniforms"
+
+@group(0) @binding(0) var<uniform> uniforms: CommonUniforms;
+
+const PI: f32 = 3.141592654;  // pi; not referenced elsewhere in this shader file
+const TAU: f32 = 6.283185307;  // 2 * pi; effect() uses it to derive the camera's forward weight
+const TOLERANCE: f32 = 0.0005;  // df() value below which a march counts as a surface hit
+const MAX_RAY_LENGTH: f32 = 20.0;  // rayMarch() clamps to this distance; render1() treats it as a miss
+const MAX_RAY_MARCHES: i32 = 80;  // iteration cap for rayMarch()
+const MAX_SHD_MARCHES: i32 = 20;  // iteration cap for shadow()
+const NORM_OFF: f32 = 0.005;  // finite-difference offset used by normal()
+
+// 2x2 rotation matrix for angle a; its columns are (cos a, sin a) and (-sin a, cos a).
+fn rot(a: f32) -> mat2x2<f32> {
+  let cs = vec2<f32>(cos(a), sin(a));
+  return mat2x2<f32>(cs.x, cs.y, -cs.y, cs.x);
+}
+
+// HSV -> RGB. c = (hue, saturation, value); the hue wraps because only its fract() is used.
+const hsv2rgb_K = vec4<f32>(1.0, 2.0 / 3.0, 1.0 / 3.0, 3.0);
+fn hsv2rgb(c: vec3<f32>) -> vec3<f32> {
+  let ramp = abs(fract(vec3<f32>(c.x) + hsv2rgb_K.xyz) * 6.0 - vec3<f32>(hsv2rgb_K.w));
+  return c.z * mix(vec3<f32>(hsv2rgb_K.x), clamp(ramp - vec3<f32>(hsv2rgb_K.x), vec3<f32>(0.0), vec3<f32>(1.0)), c.y);
+}
+
+// Colors: hsv2rgb() of the HSV triple noted on each line, folded to literals because a WGSL const cannot call a user function.
+const skyCol = vec3<f32>(0.025, 0.1555, 0.25); // HSV(0.57, 0.90, 0.25)
+const skylineCol = vec3<f32>(0.5, 0.082, 0.025); // HSV(0.02, 0.95, 0.5)
+const sunCol = vec3<f32>(0.5, 0.2245, 0.025); // HSV(0.07, 0.95, 0.5)
+const diffCol1 = vec3<f32>(0.1, 0.46, 1.0); // HSV(0.60, 0.90, 1.0)
+const diffCol2 = vec3<f32>(0.1, 0.73, 1.0); // HSV(0.55, 0.90, 1.0)
+
+// Lighting: sunDir1 is normalize(0, 0.05, -1) written out as literals; the two point lights sit on opposite sides.
+const sunDir1 = vec3<f32>(0.0, 0.049938, -0.998752); // (0.05, -1) / sqrt(1.0025)
+const lightPos1 = vec3<f32>(10.0, 10.0, 10.0);
+const lightPos2 = vec3<f32>(-10.0, 10.0, -10.0);
+
+fn sRGB(t: vec3<f32>) -> vec3<f32> {  // per-channel transfer curve: 12.92 * t where t <= 0.0031308, power segment above
+  return mix(pow(t, vec3<f32>(1.0 / 2.4)) * 1.055 - vec3<f32>(0.055), t * 12.92, step(t, vec3<f32>(0.0031308)));
+}
+
+// Rational tone-mapping curve applied per channel.
+// Negative inputs are clamped to zero and the colour is scaled by 0.6 before the curve;
+// the result is clamped to [0, 1].
+fn aces_approx(v_in: vec3<f32>) -> vec3<f32> {
+  let x = 0.6 * max(v_in, vec3<f32>(0.0));
+  let numer = x * (2.51 * x + 0.03);
+  let denom = x * (2.43 * x + 0.59) + 0.14;
+  // x >= 0 keeps denom at or above 0.14, so the division is always defined.
+  return clamp(numer / denom, vec3<f32>(0.0), vec3<f32>(1.0));
+}
+
+fn tanh_approx(x: f32) -> f32 {  // cheap rational stand-in for tanh(x); the result is clamped to [-1, 1]
+  let sq = x * x;
+  return clamp((x * (27.0 + sq)) / (27.0 + 9.0 * sq), -1.0, 1.0);
+}
+
+fn rayPlane(ro: vec3<f32>, rd: vec3<f32>, plane: vec4<f32>) -> f32 {  // ray parameter t solving dot(ro + t * rd, plane.xyz) + plane.w = 0
+  return (dot(ro, plane.xyz) + plane.w) / -dot(rd, plane.xyz);
+}
+
+fn box2d(p: vec2<f32>, b: vec2<f32>) -> f32 {  // signed distance from p to an origin-centred rectangle with half-extents b
+  let ext = abs(p) - b;
+  return min(max(ext.x, ext.y), 0.0) + length(max(ext, vec2<f32>(0.0)));
+}
+
+fn box3d(p: vec3<f32>, b: vec3<f32>) -> f32 {  // signed distance from p to an origin-centred box with half-extents b
+  let ext = abs(p) - b;
+  return min(max(ext.x, max(ext.y, ext.z)), 0.0) + length(max(ext, vec3<f32>(0.0)));
+}
+
+fn sphere(p: vec3<f32>, r: f32) -> f32 {  // distance from p to the surface of an origin-centred sphere of radius r (negative inside)
+  return length(p) - r;
+}
+
+var<private> g_rot0: mat2x2<f32>;  // xz-plane scene rotation: written by effect(), read by df()
+
+fn render0(ro: vec3<f32>, rd: vec3<f32>) -> vec3<f32> {  // background colour seen from ro along direction rd
+  var col = vec3<f32>(0.0);
+  var sf = 1.0001 - max(dot(sunDir1, rd), 0.0);  // smallest toward the sun; the 0.0001 bias keeps 1/sf finite
+  col += skyCol * pow((1.0 - abs(rd.y)), 8.0);
+  col += clamp(vec3<f32>(mix(0.0025, 0.125, tanh_approx(0.005 / sf)) / abs(rd.y)) * skylineCol, vec3<f32>(0.0), vec3<f32>(10.0));
+  sf *= sf;
+  col += sunCol * 0.00005 / sf;
+  // Glowing rectangle on the plane y = 6: box2d half-extents (5, 9), grown by 3.
+  let tp1 = rayPlane(ro, rd, vec4<f32>(0.0, -1.0, 0.0, 6.0));
+  if (tp1 > 0.0) {
+    let pos = ro + tp1 * rd;
+    let pp = pos.xz;
+    let db = box2d(pp, vec2<f32>(5.0, 9.0)) - 3.0;
+    col += vec3<f32>(4.0) * skyCol * rd.y * rd.y * (1.0 - smoothstep(0.0, 0.25, db));  // falling ramp as 1 - smoothstep: keeps low < high
+    col += vec3<f32>(0.8) * skyCol * exp(-0.5 * max(db, 0.0));
+  }
+  // Cap the sum so the 1/abs(rd.y) and 1/sf terms cannot exceed 10 per channel.
+  return clamp(col, vec3<f32>(0.0), vec3<f32>(10.0));
+}
+
+// Scene distance field: the union of a cube, a sphere and a ground plane.
+// The query point is first rotated in the xz plane by the private global g_rot0,
+// so g_rot0 has to be assigned (effect() does this) before df() is evaluated.
+// Returns the distance from p_in to the nearest of the three primitives.
+fn df(p_in: vec3<f32>) -> f32 {
+  let rx = p_in.x * g_rot0[0][0] + p_in.z * g_rot0[0][1];
+  let rz = p_in.x * g_rot0[1][0] + p_in.z * g_rot0[1][1];
+  let q = vec3<f32>(rx, p_in.y, rz);
+
+  // Cube: half-extent 1.6 on every axis, centred at (-1.9, 0, 0).
+  let cubeOrigin = vec3<f32>(-1.9, 0.0, 0.0);
+  let cubeDist = box3d(q - cubeOrigin, vec3<f32>(1.6));
+
+  // Sphere: radius 1.2, centred at (1.3, 0, 0).
+  let ballOrigin = vec3<f32>(1.3, 0.0, 0.0);
+  let ballDist = sphere(q - ballOrigin, 1.2);
+
+  // Ground: horizontal plane at y = -1.
+  let groundDist = q.y + 1.0;
+
+  // Union: keep whichever surface is closest.
+  let solids = min(cubeDist, ballDist);
+  return min(solids, groundDist);
+}
+
+// Surface normal at pos: normalised central differences of df(), NORM_OFF apart on each axis.
+fn normal(pos: vec3<f32>) -> vec3<f32> {
+  let dx = vec3<f32>(NORM_OFF, 0.0, 0.0);
+  let dy = vec3<f32>(0.0, NORM_OFF, 0.0);
+  let dz = vec3<f32>(0.0, 0.0, NORM_OFF);
+  let grad = vec3<f32>(df(pos + dx) - df(pos - dx), df(pos + dy) - df(pos - dy), df(pos + dz) - df(pos - dz));
+  return normalize(grad);
+}
+
+// Sphere-traces df() from ro along rd, starting at distance initt.
+// Returns the distance at which df() fell below TOLERANCE, or MAX_RAY_LENGTH once the
+// ray has gone past it. If the iteration budget runs out first, the last distance is
+// returned as-is (it is not clamped in that case).
+fn rayMarch(ro: vec3<f32>, rd: vec3<f32>, initt: f32) -> f32 {
+  var travelled = initt;
+  for (var n = 0; n < MAX_RAY_MARCHES; n++) {
+    // Range check comes before sampling, matching the order of the distance test below.
+    if (travelled > MAX_RAY_LENGTH) { return MAX_RAY_LENGTH; }
+    let gap = df(ro + rd * travelled);
+    if (gap < TOLERANCE) { return travelled; }
+    travelled += gap;
+  }
+  return travelled;
+}
+
+// Shadow factor for a ray from lp along ld toward a light maxt away (larger = more light).
+// Fix: select(f, t, cond) yields t when cond is true, so reaching the light picks the penumbra mix() and a hit picks sd.
+fn shadow(lp: vec3<f32>, ld: vec3<f32>, mint: f32, maxt: f32) -> f32 {
+  let ds = 1.0 - 0.4;  // step scale below 1: advance by less than the sampled distance
+  let soff = 0.05;
+  let smul = 1.5;
+  var t = mint;
+  var nd = 1e6;  // smallest df() value seen on earlier steps (how close the ray came to geometry)
+  for (var i = 0; i < MAX_SHD_MARCHES; i++) {
+    let d = df(lp + ld * t);
+    if (d < TOLERANCE || t >= maxt) {
+      let sd = 1.0 - exp(-smul * max(t / maxt - soff, 0.0));
+      return select(sd, mix(sd, 1.0, smoothstep(0.0, 0.025, nd)), t >= maxt);
+    }
+    nd = min(nd, d);
+    t += ds * d;
+  }
+  return 1.0 - exp(-smul * max(t / maxt - soff, 0.0));  // step budget exhausted: distance-based term only
+}
+
+// Shades a surface point nsp (incoming ray rd, normal nnor) and blends the result over col.
+// nrcol is the colour seen along the reflection; nshd1/nshd2 scale the light from lightPos1/lightPos2.
+fn boxCol(col: vec3<f32>, nsp: vec3<f32>, rd: vec3<f32>, nnor: vec3<f32>, nrcol: vec3<f32>, nshd1: f32, nshd2: f32) -> vec3<f32> {
+  var nfre = 1.0 + dot(rd, nnor);  // for unit vectors: 0 when the normal faces the ray, 1 at grazing angles
+  nfre *= nfre;
+  // Squared diffuse term for each point light.
+  let nld1 = normalize(lightPos1 - nsp);
+  let nld2 = normalize(lightPos2 - nsp);
+  var ndif1 = max(dot(nld1, nnor), 0.0);
+  ndif1 *= ndif1;
+  var ndif2 = max(dot(nld2, nnor), 0.0);
+  ndif2 *= ndif2;
+  // Falling ramps are written as 1 - smoothstep(low, high, x) so that low < high always holds.
+  let rf = 1.0 - smoothstep(0.9, 1.0, nfre);
+  var scol = vec3<f32>(0.0);
+  scol += diffCol1 * ndif1 * nshd1;
+  scol += diffCol2 * ndif2 * nshd2;
+  scol += 0.1 * (skyCol + skylineCol);
+  scol += nrcol * 0.75 * mix(vec3<f32>(0.25), vec3<f32>(0.5, 0.5, 1.0), nfre);
+  // Fade the shaded colour back to col as dot(nsp, nsp) goes from 20 to 90.
+  return mix(col, scol, rf * (1.0 - smoothstep(20.0, 90.0, dot(nsp, nsp))));
+}
+
+// Scene colour for one camera ray: background from render0(), then the marched surface with
+// a single reflection bounce (shaded when the bounce hits geometry) and two shadowed lights.
+fn render1(ro: vec3<f32>, rd: vec3<f32>) -> vec3<f32> {
+  let background = render0(ro, rd);
+  let hitT = rayMarch(ro, rd, 0.0);
+  if (!(hitT < MAX_RAY_LENGTH)) {
+    return background;  // the march ended at or beyond MAX_RAY_LENGTH: background only
+  }
+
+  let hitPos = ro + rd * hitT;
+  let hitNor = normal(hitPos);
+
+  // Reflection bounce; its march starts 0.2 along the reflected ray.
+  let bounceDir = reflect(rd, hitNor);
+  let bounceT = rayMarch(hitPos, bounceDir, 0.2);
+  var bounceCol = render0(hitPos, bounceDir);
+  if (bounceT < MAX_RAY_LENGTH) {
+    let bouncePos = hitPos + bounceDir * bounceT;
+    let bounceNor = normal(bouncePos);
+    let secondDir = reflect(bounceDir, bounceNor);
+    bounceCol = boxCol(bounceCol, bouncePos, bounceDir, bounceNor, render0(bouncePos, secondDir), 1.0, 1.0);
+  }
+
+  // Shadow factor toward each light, marched from 0.1 up to the light's distance.
+  let lit1 = mix(0.0, 1.0, shadow(hitPos, normalize(lightPos1 - hitPos), 0.1, distance(lightPos1, hitPos)));
+  let lit2 = mix(0.0, 1.0, shadow(hitPos, normalize(lightPos2 - hitPos), 0.1, distance(lightPos2, hitPos)));
+  return boxCol(background, hitPos, rd, hitNor, bounceCol, lit1, lit2);
+}
+
+// Builds the camera ray for screen coordinate p and returns the scene colour along it.
+// Side effect: refreshes the private global g_rot0 from uniforms.time before rendering.
+fn effect(p: vec2<f32>) -> vec3<f32> {
+  g_rot0 = rot(-0.2 * uniforms.time);
+  // Look-at camera: eye at (0, 2.5, 5) aimed at the origin, with a slightly tilted up hint.
+  let eye = vec3<f32>(0.0, 2.5, 5.0);
+  let lookAt = vec3<f32>(0.0, 0.0, 0.0);
+  let tiltedUp = vec3<f32>(0.1, 1.0, 0.0);
+  let forward = normalize(lookAt - eye);
+  let right = normalize(cross(tiltedUp, forward));
+  let upward = cross(forward, right);
+  let focal = tan(TAU / 6.0);  // weight of the forward axis before the ray is normalised
+  let dir = normalize(-p.x * right + p.y * upward + focal * forward);
+  return render1(eye, dir);
+}
+
+// Vertex entry point: emits one oversized triangle whose corners are picked by vertex index.
+// The corners (-1,-1), (3,-1), (-1,3) extend past the [-1, 1] clip square on two sides,
+// so the single triangle covers it completely. z is 0 and w is 1.
+@vertex fn vs_main(@builtin(vertex_index) i: u32) -> @builtin(position) vec4<f32> {
+  var corners = array<vec2<f32>, 3>(vec2<f32>(-1.0, -1.0), vec2<f32>(3.0, -1.0), vec2<f32>(-1.0, 3.0));
+  let xy = corners[i];
+  return vec4<f32>(xy, 0.0, 1.0);
+}
+
+// Fragment entry point: maps the pixel position to aspect-corrected coordinates, renders the
+// scene, then applies aces_approx() and sRGB() to the colour. Alpha is always 1.
+@fragment fn fs_main(@builtin(position) p: vec4<f32>) -> @location(0) vec4<f32> {
+  let res = uniforms.resolution;
+  // Flip Y to match ShaderToy convention (origin at bottom-left)
+  let uv = vec2<f32>(p.x, res.y - p.y) / res;
+  let aspect = res.x / res.y;
+  let ndc = (2.0 * uv - 1.0) * vec2<f32>(aspect, 1.0);
+  let display = sRGB(aces_approx(effect(ndc)));
+  return vec4<f32>(display, 1.0);
+}
diff --git a/workspaces/main/timeline.seq b/workspaces/main/timeline.seq
index 8f7eea6..42d81a0 100644
--- a/workspaces/main/timeline.seq
+++ b/workspaces/main/timeline.seq
@@ -36,8 +36,9 @@ SEQUENCE 8.50 2 "Hybrid3D"
 SEQUENCE 10.50 0 "CNN effect"
   EFFECT + HeptagonEffect 0.0 12.00
 #  EFFECT + RotatingCubeEffect 0.00 12.0
-  EFFECT + Hybrid3DEffect 0.00 12.00
-  EFFECT + CNNEffect 1.0 12.0 layers=3 blend=1.5
+#  EFFECT + Hybrid3DEffect 0.00 12.00
+  EFFECT + Scene1Effect 0.0 12.0
+  EFFECT + CNNEffect 1.0 12.0 layers=3 blend=.5
 
 SEQUENCE 22.0 0 "buggy"
   EFFECT + HeptagonEffect 0.00 0.20