From 2f8810f303d06fe78dbec343553c3c97f93f9323 Mon Sep 17 00:00:00 2001
From: skal <pascal.massimino@gmail.com>
Date: Sun, 15 Feb 2026 18:55:42 +0100
Subject: refactor(cnn): rename cnn_effect to cnn_v1_effect for clarity
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Renamed files and classes:
- cnn_effect.{h,cc} → cnn_v1_effect.{h,cc}
- CNNEffect → CNNv1Effect
- CNNEffectParams → CNNv1EffectParams
- CNNLayerParams → CNNv1LayerParams
- CNN_EFFECT.md → CNN_V1_EFFECT.md

Updated all references:
- C++ includes and class usage
- CMake source list
- Timeline (workspaces/main/timeline.seq)
- Test file (test_demo_effects.cc)
- Documentation (CLAUDE.md, PROJECT_CONTEXT.md, READMEs)

Tests: 34/34 passing (100%)
---
 cnn_v1/README.md                    |   2 +-
 cnn_v1/docs/CNN.md                  |   2 +-
 cnn_v1/docs/CNN_EFFECT.md           | 400 ------------------------------------
 cnn_v1/docs/CNN_FLATTEN_ANALYSIS.md |   6 +-
 cnn_v1/docs/CNN_V1_EFFECT.md        | 400 ++++++++++++++++++++++++++++++++++++
 cnn_v1/src/cnn_effect.cc            | 129 ------------
 cnn_v1/src/cnn_effect.h             |  53 -----
 cnn_v1/src/cnn_v1_effect.cc         | 129 ++++++++++++
 cnn_v1/src/cnn_v1_effect.h          |  53 +++++
 9 files changed, 587 insertions(+), 587 deletions(-)
 delete mode 100644 cnn_v1/docs/CNN_EFFECT.md
 create mode 100644 cnn_v1/docs/CNN_V1_EFFECT.md
 delete mode 100644 cnn_v1/src/cnn_effect.cc
 delete mode 100644 cnn_v1/src/cnn_effect.h
 create mode 100644 cnn_v1/src/cnn_v1_effect.cc
 create mode 100644 cnn_v1/src/cnn_v1_effect.h

(limited to 'cnn_v1')
diff --git a/cnn_v1/README.md b/cnn_v1/README.md
index ad9b8f3..052f22a 100644
--- a/cnn_v1/README.md
+++ b/cnn_v1/README.md
@@ -29,7 +29,7 @@ Original CNN implementation with per-layer WGSL shaders. Supports multiple kerne
 ## Documentation
 
 - [CNN.md](docs/CNN.md) - Architecture overview
-- [CNN_EFFECT.md](docs/CNN_EFFECT.md) - Implementation details
+- [CNN_V1_EFFECT.md](docs/CNN_V1_EFFECT.md) - Implementation details
 - [CNN_TEST_TOOL.md](docs/CNN_TEST_TOOL.md) - Testing guide
 - [CNN_DEBUG.md](docs/CNN_DEBUG.md) - Debugging notes
 
diff --git a/cnn_v1/docs/CNN.md b/cnn_v1/docs/CNN.md
index 2dc3362..5d9a667 100644
--- a/cnn_v1/docs/CNN.md
+++ b/cnn_v1/docs/CNN.md
@@ -8,7 +8,7 @@ Have the input 3d scene be processed by a multi-layer CNN trained on the side.
 Input: some rendered scene.
 Output: 'stylized' scene with CNN post-processing.
 
-**See `doc/CNN_EFFECT.md` for implementation details, usage, and API reference.**
+**See `CNN_V1_EFFECT.md` for implementation details, usage, and API reference.**
 
 ## Shader implementation
 
diff --git a/cnn_v1/docs/CNN_EFFECT.md b/cnn_v1/docs/CNN_EFFECT.md
deleted file mode 100644
index 40f095e..0000000
--- a/cnn_v1/docs/CNN_EFFECT.md
+++ /dev/null
@@ -1,400 +0,0 @@
-# CNN Post-Processing Effect
-
-Neural network-based stylization for rendered scenes.
-
----
-
-## Overview
-
-Trainable convolutional neural network layers for artistic stylization (painterly, sketch, cel-shaded effects) with minimal runtime overhead.
-
-**Key Features:**
-- Position-aware layer 0 (coordinate input for vignetting, edge effects)
-- Multi-layer convolutions (3×3, 5×5, 7×7 kernels) with automatic chaining
-- Original input available to all layers via framebuffer capture
-- Configurable final blend with original scene
-- Modular WGSL shader architecture
-- Hardcoded weights (trained offline via PyTorch)
-- ~5-8 KB binary footprint
-
----
-
-## Architecture
-
-### RGBD → Grayscale Pipeline
-
-**Input:** RGBD (RGB + inverse depth D=1/z)
-**Output:** Grayscale (1 channel)
-**Layer Input:** 7 channels = [RGBD, UV coords, grayscale] all normalized to [-1,1]
-
-**Architecture:**
-- **Inner layers (0..N-2):** Conv2d(7→4) - output RGBD
-- **Final layer (N-1):** Conv2d(7→1) - output grayscale
-
-```wgsl
-// Inner layers: 7→4 (RGBD output, vec4-optimized)
-fn cnn_conv3x3_7to4(
-  tex: texture_2d<f32>,
-  samp: sampler,
-  uv: vec2<f32>,
-  resolution: vec2<f32>,
-  gray: f32,                               # Grayscale [-1,1]
-  weights: array<vec4<f32>, 72>           # 9 pos × 4 ch × 2 vec4 (8 floats per filter)
-) -> vec4<f32>
-
-// Final layer: 7→1 (grayscale output, vec4-optimized)
-fn cnn_conv3x3_7to1(
-  tex: texture_2d<f32>,
-  samp: sampler,
-  uv: vec2<f32>,
-  resolution: vec2<f32>,
-  gray: f32,
-  weights: array<vec4<f32>, 18>           # 9 pos × 2 vec4 (8 floats per filter)
-) -> f32
-```
-
-**Input normalization:**
-- **fs_main** normalizes textures once: `(tex - 0.5) * 2` → [-1,1]
-- **Conv functions** normalize UV coords: `(uv - 0.5) * 2` → [-1,1]
-- **Grayscale** computed once in fs_main using dot product: `dot(original.rgb, vec3(0.2126, 0.7152, 0.0722))`
-- **Inter-layer data** stays in [-1,1] (no denormalization)
-- **Final output** denormalized for display: `(result + 1.0) * 0.5` → [0,1]
-
-**Activation:** tanh for inner layers (output stays [-1,1]), none for final layer
-
-### Multi-Layer Architecture
-
-CNNEffect supports multi-layer networks via automatic effect chaining:
-
-1. **Timeline specifies total layers**: `CNNEffect layers=3 blend=0.7`
-2. **Compiler expands to chain**: 3 separate CNNEffect instances (layer 0→1→2)
-3. **Framebuffer capture**: Layer 0 captures original input to `"captured_frame"`
-4. **Original input binding**: All layers access original via `@binding(4)`
-5. **Final blend**: Last layer blends result with original: `mix(original, result, 0.7)`
-
-**Framebuffer Capture API:**
-- `Effect::needs_framebuffer_capture()` - effect requests pre-capture
-- MainSequence automatically blits input → `"captured_frame"` auxiliary texture
-- Generic mechanism usable by any effect
-
-### File Structure
-
-```
-src/effects/
-  cnn_effect.h/cc         # CNNEffect class + framebuffer capture
-
-workspaces/main/shaders/cnn/
-  cnn_activation.wgsl     # tanh, ReLU, sigmoid, leaky_relu
-  cnn_conv3x3.wgsl        # 3×3 convolution (standard + coord-aware)
-  cnn_conv5x5.wgsl        # 5×5 convolution (standard + coord-aware)
-  cnn_conv7x7.wgsl        # 7×7 convolution (standard + coord-aware)
-  cnn_weights_generated.wgsl  # Weight arrays (auto-generated by train_cnn.py)
-  cnn_layer.wgsl          # Main shader with layer switches (auto-generated by train_cnn.py)
-```
-
----
-
-## Training Workflow
-
-### 1. Prepare Training Data
-
-Input/target image pairs:
-```
-training/input/img_000.png   # RGBA (RGB + alpha)
-training/output/img_000.png  # Grayscale target
-```
-
-**Note:** Alpha channel can be depth (1/z) or constant (255). Network learns from RGB primarily.
-
-### 2. Train Network
-
-**Patch-based (Recommended)** - Preserves natural pixel scale:
-```bash
-python3 training/train_cnn.py \
-  --input training/input --target training/output \
-  --patch-size 32 --patches-per-image 64 --detector harris \
-  --layers 3 --kernel-sizes 3,5,3 \
-  --epochs 5000 --batch-size 16 --checkpoint-every 1000
-```
-
-**Detectors:** `harris` (corners), `fast` (features), `shi-tomasi` (corners), `gradient` (edges)
-
-**Full-image (Legacy)** - Resizes to 256×256:
-```bash
-python3 training/train_cnn.py \
-  --input training/input --target training/output \
-  --layers 3 --kernel-sizes 3,5,3 \
-  --epochs 10000 --batch-size 8 --checkpoint-every 1000
-```
-
-**Auto-generates:**
-- `cnn_weights_generated.wgsl` - Weight arrays
-- `cnn_layer.wgsl` - Layer shader
-
-### 3. Export & Validate
-
-```bash
-# Export shaders
-./training/train_cnn.py --export-only checkpoints/checkpoint_epoch_5000.pth
-
-# Generate ground truth
-./training/train_cnn.py --infer input.png \
-  --export-only checkpoints/checkpoint_epoch_5000.pth --output ground_truth.png
-```
-
-### 4. Rebuild Demo
-
-```bash
-cmake --build build -j4 && ./build/demo64k
-```
-
----
-
-## Usage
-
-### C++ Integration
-
-**Single layer (manual):**
-```cpp
-#include "effects/cnn_effect.h"
-
-CNNEffectParams p;
-p.layer_index = 0;
-p.total_layers = 1;
-p.blend_amount = 1.0f;
-auto cnn = std::make_shared<CNNEffect>(ctx, p);
-timeline.add_effect(cnn, start_time, end_time);
-```
-
-**Multi-layer (automatic via timeline compiler):**
-
-Use timeline syntax - `seq_compiler` expands to multiple instances.
-
-### Timeline Examples
-
-**Single-layer CNN (full stylization):**
-```
-SEQUENCE 10.0 0
-  EFFECT + Hybrid3DEffect 0.00 5.00
-  EFFECT + CNNEffect 0.50 5.00 layers=1
-```
-
-**Multi-layer CNN with blend:**
-```
-SEQUENCE 10.0 0
-  EFFECT + Hybrid3DEffect 0.00 5.00
-  EFFECT + CNNEffect 0.50 5.00 layers=3 blend=0.7
-```
-
-Expands to:
-```cpp
-// Layer 0 (captures original, blend=1.0)
-{
-  CNNEffectParams p;
-  p.layer_index = 0;
-  p.total_layers = 3;
-  p.blend_amount = 1.0f;
-  seq->add_effect(std::make_shared<CNNEffect>(ctx, p), 0.50f, 5.00f, 1);
-}
-// Layer 1 (blend=1.0)
-{
-  CNNEffectParams p;
-  p.layer_index = 1;
-  p.total_layers = 3;
-  p.blend_amount = 1.0f;
-  seq->add_effect(std::make_shared<CNNEffect>(ctx, p), 0.50f, 5.00f, 2);
-}
-// Layer 2 (final blend=0.7)
-{
-  CNNEffectParams p;
-  p.layer_index = 2;
-  p.total_layers = 3;
-  p.blend_amount = 0.7f;
-  seq->add_effect(std::make_shared<CNNEffect>(ctx, p), 0.50f, 5.00f, 3);
-}
-```
-
----
-
-## Shader Structure
-
-**Bindings:**
-```wgsl
-@group(0) @binding(0) var smplr: sampler;
-@group(0) @binding(1) var txt: texture_2d<f32>;              // Current layer input
-@group(0) @binding(2) var<uniform> uniforms: CommonUniforms;
-@group(0) @binding(3) var<uniform> params: CNNLayerParams;
-@group(0) @binding(4) var original_input: texture_2d<f32>;   // Layer 0 input (captured)
-```
-
-**Fragment shader logic:**
-```wgsl
-@fragment fn fs_main(@builtin(position) p: vec4<f32>) -> @location(0) vec4<f32> {
-    let uv = p.xy / uniforms.resolution;
-    let original_raw = textureSample(original_input, smplr, uv);
-    let original = (original_raw - 0.5) * 2.0;  // Normalize to [-1,1]
-    let gray = dot(original.rgb, vec3<f32>(0.2126, 0.7152, 0.0722));
-    var result = vec4<f32>(0.0);
-
-    if (params.layer_index == 0) {
-        result = cnn_conv3x3_7to4_src(txt, smplr, uv, uniforms.resolution,
-                                      weights_layer0);
-        result = cnn_tanh(result);
-    }
-    else if (params.layer_index == 1) {
-        result = cnn_conv5x5_7to4(txt, smplr, uv, uniforms.resolution,
-                                   gray, weights_layer1);
-        result = cnn_tanh(result);
-    }
-    // ... other layers
-
-    // Blend with ORIGINAL input (not previous layer)
-    return mix(original_raw, result, params.blend_amount);
-}
-```
-
-**Weight Storage (vec4-optimized):**
-
-**Inner layers (7→4 RGBD output):**
-```wgsl
-// Structure: array<vec4<f32>, 72>
-// 9 pos × 4 ch × 2 vec4 (8 floats per filter: [rgba][uv,gray,1])
-const weights_layer0: array<vec4<f32>, 72> = array(
-  vec4<f32>(w0_r, w0_g, w0_b, w0_d),        // pos0_ch0 (rgba weights)
-  vec4<f32>(w0_u, w0_v, w0_gray, bias0),    // pos0_ch0 (uv, gray, bias)
-  vec4<f32>(w1_r, w1_g, w1_b, w1_d),        // pos0_ch1 (rgba weights)
-  vec4<f32>(w1_u, w1_v, w1_gray, bias1),    // pos0_ch1 (uv, gray, bias)
-  // ... 68 more vec4s
-);
-```
-
-**Final layer (7→1 grayscale output):**
-```wgsl
-// Structure: array<vec4<f32>, 18>
-// 9 pos × 2 vec4 (8 floats per filter: [rgba][uv,gray,1])
-const weights_layerN: array<vec4<f32>, 18> = array(
-  vec4<f32>(w0_r, w0_g, w0_b, w0_d),        // pos0 (rgba weights)
-  vec4<f32>(w0_u, w0_v, w0_gray, bias0),    // pos0 (uv, gray, bias)
-  // ... 16 more vec4s
-);
-```
-
-**Optimization:** Bias integrated as 4th component via `vec4(uv, gray, 1.0)` input. Two dot4 operations replace 8 scalar MADs.
-
----
-
-## Size Budget
-
-| Component | Size | Notes |
-|-----------|------|-------|
-| Activation functions | ~200 B | 4 functions |
-| Conv3x3 (standard + coord) | ~500 B | Both variants |
-| Conv5x5 (standard + coord) | ~700 B | Both variants |
-| Conv7x7 (standard + coord) | ~900 B | Both variants |
-| Main shader | ~800 B | Layer composition |
-| C++ implementation | ~300 B | Effect class |
-| **Coord weights** | **+32 B** | Per-layer overhead (layer 0 only) |
-| **RGBA weights** | **2-6 KB** | Depends on depth/kernel sizes |
-| **Total** | **5-9 KB** | Acceptable for 64k |
-
-**Optimization strategies:**
-- Quantize weights (float32 → int8)
-- Prune near-zero weights
-- Use separable convolutions
-
----
-
-## Testing
-
-```bash
-./build/test_demo_effects  # CNN construction/shader tests
-./build/demo64k            # Visual test
-```
-
----
-
-## Blend Parameter Behavior
-
-**blend_amount** controls final compositing with original:
-- `blend=0.0`: Pure original (no CNN effect)
-- `blend=0.5`: 50% original + 50% CNN
-- `blend=1.0`: Pure CNN output (full stylization)
-
-**Important:** Blend uses captured layer 0 input, not previous layer output.
-
-**Example use cases:**
-- `blend=1.0`: Full stylization (default)
-- `blend=0.7`: Subtle effect preserving original details
-- `blend=0.3`: Light artistic touch
-
-## Troubleshooting
-
-**Shader compilation fails:**
-- Check `cnn_weights_generated.wgsl` syntax
-- Verify snippets registered in `shaders.cc::InitShaderComposer()`
-- Ensure `cnn_layer.wgsl` has 5 bindings (including `original_input`)
-
-**Black/corrupted output:**
-- Weights untrained (identity placeholder)
-- Check `captured_frame` auxiliary texture is registered
-- Verify layer priorities in timeline are sequential
-
-**Wrong blend result:**
-- Ensure layer 0 has `needs_framebuffer_capture() == true`
-- Check MainSequence framebuffer capture logic
-- Verify `original_input` binding is populated
-
-**Training loss not decreasing:**
-- Lower learning rate (`--learning-rate 0.0001`)
-- More epochs (`--epochs 1000`)
-- Check input/target image alignment
-
----
-
-## Vec4 Optimization
-
-**Architecture:** Weights stored as vec4 pairs for SIMD efficiency.
-
-**Input representation:**
-```wgsl
-let rgbd = textureSample(...);              // vec4: [r, g, b, d]
-let in1 = vec4<f32>(uv_norm, gray, 1.0);   // vec4: [u, v, gray, 1.0]
-```
-
-**Weight indexing:**
-```wgsl
-var pos = 0;  // Direct weight array index
-for (var dy = -1; dy <= 1; dy++) {
-  for (var dx = -1; dx <= 1; dx++) {
-    // Unrolled channel loop (4 output channels)
-    sum.r += dot(weights[pos+0], rgbd) + dot(weights[pos+1], in1);
-    sum.g += dot(weights[pos+2], rgbd) + dot(weights[pos+3], in1);
-    sum.b += dot(weights[pos+4], rgbd) + dot(weights[pos+5], in1);
-    sum.a += dot(weights[pos+6], rgbd) + dot(weights[pos+7], in1);
-    pos += 8;  // 4 channels × 2 vec4s per channel
-  }
-}
-```
-
-**Benefits:**
-- **SIMD-native:** GPU executes `dot(vec4, vec4)` as single instruction (4 parallel MADs)
-- **Memory bandwidth:** 2 vec4 loads vs 8 scalar loads (better cache alignment)
-- **Bias integration:** Free via `[..., 1.0]` component (no separate add)
-- **Code simplicity:** Eliminates inner loop, direct indexing with `pos`
-- **Performance:** 2-3× GPU throughput improvement over scalar version
-
-**Weight layout per filter (8 floats):**
-- vec4[0]: [w_r, w_g, w_b, w_d]     (rgba input weights)
-- vec4[1]: [w_u, w_v, w_gray, bias] (uv, grayscale, bias)
-
-**3×3 kernel sizes:**
-- Inner layer (7→4): 72 vec4s (9 pos × 4 ch × 2 vec4 = 2304 bytes)
-- Final layer (7→1): 18 vec4s (9 pos × 1 ch × 2 vec4 = 288 bytes)
-
----
-
-## References
-
-- **Training Script:** `training/train_cnn.py`
-- **Shader Composition:** `doc/SEQUENCE.md`
-- **Effect System:** `src/gpu/effect.h`
diff --git a/cnn_v1/docs/CNN_FLATTEN_ANALYSIS.md b/cnn_v1/docs/CNN_FLATTEN_ANALYSIS.md
index bf63c5d..8664157 100644
--- a/cnn_v1/docs/CNN_FLATTEN_ANALYSIS.md
+++ b/cnn_v1/docs/CNN_FLATTEN_ANALYSIS.md
@@ -183,7 +183,7 @@ These yield better size/performance than shader architecture changes.
 
 ## References
 
-- `doc/CNN_EFFECT.md` - CNN implementation details
-- `doc/CNN.md` - High-level CNN design
-- `src/effects/cnn_effect.cc` - Current implementation
+- `CNN_V1_EFFECT.md` - CNN implementation details
+- `CNN.md` - High-level CNN design
+- `../src/cnn_effect.cc` - Current implementation
 - `workspaces/main/shaders/cnn_*.wgsl` - Shader snippets
diff --git a/cnn_v1/docs/CNN_V1_EFFECT.md b/cnn_v1/docs/CNN_V1_EFFECT.md
new file mode 100644
index 0000000..40f095e
--- /dev/null
+++ b/cnn_v1/docs/CNN_V1_EFFECT.md
@@ -0,0 +1,400 @@
+# CNN Post-Processing Effect
+
+Neural network-based stylization for rendered scenes.
+
+---
+
+## Overview
+
+Trainable convolutional neural network layers for artistic stylization (painterly, sketch, cel-shaded effects) with minimal runtime overhead.
+
+**Key Features:**
+- Position-aware layer 0 (coordinate input for vignetting, edge effects)
+- Multi-layer convolutions (3×3, 5×5, 7×7 kernels) with automatic chaining
+- Original input available to all layers via framebuffer capture
+- Configurable final blend with original scene
+- Modular WGSL shader architecture
+- Hardcoded weights (trained offline via PyTorch)
+- ~5-8 KB binary footprint
+
+---
+
+## Architecture
+
+### RGBD → Grayscale Pipeline
+
+**Input:** RGBD (RGB + inverse depth D=1/z)
+**Output:** Grayscale (1 channel)
+**Layer Input:** 7 channels = [RGBD, UV coords, grayscale] all normalized to [-1,1]
+
+**Architecture:**
+- **Inner layers (0..N-2):** Conv2d(7→4) - output RGBD
+- **Final layer (N-1):** Conv2d(7→1) - output grayscale
+
+```wgsl
+// Inner layers: 7→4 (RGBD output, vec4-optimized)
+fn cnn_conv3x3_7to4(
+  tex: texture_2d<f32>,
+  samp: sampler,
+  uv: vec2<f32>,
+  resolution: vec2<f32>,
+  gray: f32,                               # Grayscale [-1,1]
+  weights: array<vec4<f32>, 72>           # 9 pos × 4 ch × 2 vec4 (8 floats per filter)
+) -> vec4<f32>
+
+// Final layer: 7→1 (grayscale output, vec4-optimized)
+fn cnn_conv3x3_7to1(
+  tex: texture_2d<f32>,
+  samp: sampler,
+  uv: vec2<f32>,
+  resolution: vec2<f32>,
+  gray: f32,
+  weights: array<vec4<f32>, 18>           # 9 pos × 2 vec4 (8 floats per filter)
+) -> f32
+```
+
+**Input normalization:**
+- **fs_main** normalizes textures once: `(tex - 0.5) * 2` → [-1,1]
+- **Conv functions** normalize UV coords: `(uv - 0.5) * 2` → [-1,1]
+- **Grayscale** computed once in fs_main using dot product: `dot(original.rgb, vec3(0.2126, 0.7152, 0.0722))`
+- **Inter-layer data** stays in [-1,1] (no denormalization)
+- **Final output** denormalized for display: `(result + 1.0) * 0.5` → [0,1]
+
+**Activation:** tanh for inner layers (output stays [-1,1]), none for final layer
+
+### Multi-Layer Architecture
+
+CNNEffect supports multi-layer networks via automatic effect chaining:
+
+1. **Timeline specifies total layers**: `CNNEffect layers=3 blend=0.7`
+2. **Compiler expands to chain**: 3 separate CNNEffect instances (layer 0→1→2)
+3. **Framebuffer capture**: Layer 0 captures original input to `"captured_frame"`
+4. **Original input binding**: All layers access original via `@binding(4)`
+5. **Final blend**: Last layer blends result with original: `mix(original, result, 0.7)`
+
+**Framebuffer Capture API:**
+- `Effect::needs_framebuffer_capture()` - effect requests pre-capture
+- MainSequence automatically blits input → `"captured_frame"` auxiliary texture
+- Generic mechanism usable by any effect
+
+### File Structure
+
+```
+src/effects/
+  cnn_effect.h/cc         # CNNEffect class + framebuffer capture
+
+workspaces/main/shaders/cnn/
+  cnn_activation.wgsl     # tanh, ReLU, sigmoid, leaky_relu
+  cnn_conv3x3.wgsl        # 3×3 convolution (standard + coord-aware)
+  cnn_conv5x5.wgsl        # 5×5 convolution (standard + coord-aware)
+  cnn_conv7x7.wgsl        # 7×7 convolution (standard + coord-aware)
+  cnn_weights_generated.wgsl  # Weight arrays (auto-generated by train_cnn.py)
+  cnn_layer.wgsl          # Main shader with layer switches (auto-generated by train_cnn.py)
+```
+
+---
+
+## Training Workflow
+
+### 1. Prepare Training Data
+
+Input/target image pairs:
+```
+training/input/img_000.png   # RGBA (RGB + alpha)
+training/output/img_000.png  # Grayscale target
+```
+
+**Note:** Alpha channel can be depth (1/z) or constant (255). Network learns from RGB primarily.
+
+### 2. Train Network
+
+**Patch-based (Recommended)** - Preserves natural pixel scale:
+```bash
+python3 training/train_cnn.py \
+  --input training/input --target training/output \
+  --patch-size 32 --patches-per-image 64 --detector harris \
+  --layers 3 --kernel-sizes 3,5,3 \
+  --epochs 5000 --batch-size 16 --checkpoint-every 1000
+```
+
+**Detectors:** `harris` (corners), `fast` (features), `shi-tomasi` (corners), `gradient` (edges)
+
+**Full-image (Legacy)** - Resizes to 256×256:
+```bash
+python3 training/train_cnn.py \
+  --input training/input --target training/output \
+  --layers 3 --kernel-sizes 3,5,3 \
+  --epochs 10000 --batch-size 8 --checkpoint-every 1000
+```
+
+**Auto-generates:**
+- `cnn_weights_generated.wgsl` - Weight arrays
+- `cnn_layer.wgsl` - Layer shader
+
+### 3. Export & Validate
+
+```bash
+# Export shaders
+./training/train_cnn.py --export-only checkpoints/checkpoint_epoch_5000.pth
+
+# Generate ground truth
+./training/train_cnn.py --infer input.png \
+  --export-only checkpoints/checkpoint_epoch_5000.pth --output ground_truth.png
+```
+
+### 4. Rebuild Demo
+
+```bash
+cmake --build build -j4 && ./build/demo64k
+```
+
+---
+
+## Usage
+
+### C++ Integration
+
+**Single layer (manual):**
+```cpp
+#include "effects/cnn_effect.h"
+
+CNNEffectParams p;
+p.layer_index = 0;
+p.total_layers = 1;
+p.blend_amount = 1.0f;
+auto cnn = std::make_shared<CNNEffect>(ctx, p);
+timeline.add_effect(cnn, start_time, end_time);
+```
+
+**Multi-layer (automatic via timeline compiler):**
+
+Use timeline syntax - `seq_compiler` expands to multiple instances.
+
+### Timeline Examples
+
+**Single-layer CNN (full stylization):**
+```
+SEQUENCE 10.0 0
+  EFFECT + Hybrid3DEffect 0.00 5.00
+  EFFECT + CNNEffect 0.50 5.00 layers=1
+```
+
+**Multi-layer CNN with blend:**
+```
+SEQUENCE 10.0 0
+  EFFECT + Hybrid3DEffect 0.00 5.00
+  EFFECT + CNNEffect 0.50 5.00 layers=3 blend=0.7
+```
+
+Expands to:
+```cpp
+// Layer 0 (captures original, blend=1.0)
+{
+  CNNEffectParams p;
+  p.layer_index = 0;
+  p.total_layers = 3;
+  p.blend_amount = 1.0f;
+  seq->add_effect(std::make_shared<CNNEffect>(ctx, p), 0.50f, 5.00f, 1);
+}
+// Layer 1 (blend=1.0)
+{
+  CNNEffectParams p;
+  p.layer_index = 1;
+  p.total_layers = 3;
+  p.blend_amount = 1.0f;
+  seq->add_effect(std::make_shared<CNNEffect>(ctx, p), 0.50f, 5.00f, 2);
+}
+// Layer 2 (final blend=0.7)
+{
+  CNNEffectParams p;
+  p.layer_index = 2;
+  p.total_layers = 3;
+  p.blend_amount = 0.7f;
+  seq->add_effect(std::make_shared<CNNEffect>(ctx, p), 0.50f, 5.00f, 3);
+}
+```
+
+---
+
+## Shader Structure
+
+**Bindings:**
+```wgsl
+@group(0) @binding(0) var smplr: sampler;
+@group(0) @binding(1) var txt: texture_2d<f32>;              // Current layer input
+@group(0) @binding(2) var<uniform> uniforms: CommonUniforms;
+@group(0) @binding(3) var<uniform> params: CNNLayerParams;
+@group(0) @binding(4) var original_input: texture_2d<f32>;   // Layer 0 input (captured)
+```
+
+**Fragment shader logic:**
+```wgsl
+@fragment fn fs_main(@builtin(position) p: vec4<f32>) -> @location(0) vec4<f32> {
+    let uv = p.xy / uniforms.resolution;
+    let original_raw = textureSample(original_input, smplr, uv);
+    let original = (original_raw - 0.5) * 2.0;  // Normalize to [-1,1]
+    let gray = dot(original.rgb, vec3<f32>(0.2126, 0.7152, 0.0722));
+    var result = vec4<f32>(0.0);
+
+    if (params.layer_index == 0) {
+        result = cnn_conv3x3_7to4_src(txt, smplr, uv, uniforms.resolution,
+                                      weights_layer0);
+        result = cnn_tanh(result);
+    }
+    else if (params.layer_index == 1) {
+        result = cnn_conv5x5_7to4(txt, smplr, uv, uniforms.resolution,
+                                   gray, weights_layer1);
+        result = cnn_tanh(result);
+    }
+    // ... other layers
+
+    // Blend with ORIGINAL input (not previous layer)
+    return mix(original_raw, result, params.blend_amount);
+}
+```
+
+**Weight Storage (vec4-optimized):**
+
+**Inner layers (7→4 RGBD output):**
+```wgsl
+// Structure: array<vec4<f32>, 72>
+// 9 pos × 4 ch × 2 vec4 (8 floats per filter: [rgba][uv,gray,1])
+const weights_layer0: array<vec4<f32>, 72> = array(
+  vec4<f32>(w0_r, w0_g, w0_b, w0_d),        // pos0_ch0 (rgba weights)
+  vec4<f32>(w0_u, w0_v, w0_gray, bias0),    // pos0_ch0 (uv, gray, bias)
+  vec4<f32>(w1_r, w1_g, w1_b, w1_d),        // pos0_ch1 (rgba weights)
+  vec4<f32>(w1_u, w1_v, w1_gray, bias1),    // pos0_ch1 (uv, gray, bias)
+  // ... 68 more vec4s
+);
+```
+
+**Final layer (7→1 grayscale output):**
+```wgsl
+// Structure: array<vec4<f32>, 18>
+// 9 pos × 2 vec4 (8 floats per filter: [rgba][uv,gray,1])
+const weights_layerN: array<vec4<f32>, 18> = array(
+  vec4<f32>(w0_r, w0_g, w0_b, w0_d),        // pos0 (rgba weights)
+  vec4<f32>(w0_u, w0_v, w0_gray, bias0),    // pos0 (uv, gray, bias)
+  // ... 16 more vec4s
+);
+```
+
+**Optimization:** Bias integrated as 4th component via `vec4(uv, gray, 1.0)` input. Two dot4 operations replace 8 scalar MADs.
+
+---
+
+## Size Budget
+
+| Component | Size | Notes |
+|-----------|------|-------|
+| Activation functions | ~200 B | 4 functions |
+| Conv3x3 (standard + coord) | ~500 B | Both variants |
+| Conv5x5 (standard + coord) | ~700 B | Both variants |
+| Conv7x7 (standard + coord) | ~900 B | Both variants |
+| Main shader | ~800 B | Layer composition |
+| C++ implementation | ~300 B | Effect class |
+| **Coord weights** | **+32 B** | Per-layer overhead (layer 0 only) |
+| **RGBA weights** | **2-6 KB** | Depends on depth/kernel sizes |
+| **Total** | **5-9 KB** | Acceptable for 64k |
+
+**Optimization strategies:**
+- Quantize weights (float32 → int8)
+- Prune near-zero weights
+- Use separable convolutions
+
+---
+
+## Testing
+
+```bash
+./build/test_demo_effects  # CNN construction/shader tests
+./build/demo64k            # Visual test
+```
+
+---
+
+## Blend Parameter Behavior
+
+**blend_amount** controls final compositing with original:
+- `blend=0.0`: Pure original (no CNN effect)
+- `blend=0.5`: 50% original + 50% CNN
+- `blend=1.0`: Pure CNN output (full stylization)
+
+**Important:** Blend uses captured layer 0 input, not previous layer output.
+
+**Example use cases:**
+- `blend=1.0`: Full stylization (default)
+- `blend=0.7`: Subtle effect preserving original details
+- `blend=0.3`: Light artistic touch
+
+## Troubleshooting
+
+**Shader compilation fails:**
+- Check `cnn_weights_generated.wgsl` syntax
+- Verify snippets registered in `shaders.cc::InitShaderComposer()`
+- Ensure `cnn_layer.wgsl` has 5 bindings (including `original_input`)
+
+**Black/corrupted output:**
+- Weights untrained (identity placeholder)
+- Check `captured_frame` auxiliary texture is registered
+- Verify layer priorities in timeline are sequential
+
+**Wrong blend result:**
+- Ensure layer 0 has `needs_framebuffer_capture() == true`
+- Check MainSequence framebuffer capture logic
+- Verify `original_input` binding is populated
+
+**Training loss not decreasing:**
+- Lower learning rate (`--learning-rate 0.0001`)
+- More epochs (`--epochs 1000`)
+- Check input/target image alignment
+
+---
+
+## Vec4 Optimization
+
+**Architecture:** Weights stored as vec4 pairs for SIMD efficiency.
+
+**Input representation:**
+```wgsl
+let rgbd = textureSample(...);              // vec4: [r, g, b, d]
+let in1 = vec4<f32>(uv_norm, gray, 1.0);   // vec4: [u, v, gray, 1.0]
+```
+
+**Weight indexing:**
+```wgsl
+var pos = 0;  // Direct weight array index
+for (var dy = -1; dy <= 1; dy++) {
+  for (var dx = -1; dx <= 1; dx++) {
+    // Unrolled channel loop (4 output channels)
+    sum.r += dot(weights[pos+0], rgbd) + dot(weights[pos+1], in1);
+    sum.g += dot(weights[pos+2], rgbd) + dot(weights[pos+3], in1);
+    sum.b += dot(weights[pos+4], rgbd) + dot(weights[pos+5], in1);
+    sum.a += dot(weights[pos+6], rgbd) + dot(weights[pos+7], in1);
+    pos += 8;  // 4 channels × 2 vec4s per channel
+  }
+}
+```
+
+**Benefits:**
+- **SIMD-native:** GPU executes `dot(vec4, vec4)` as single instruction (4 parallel MADs)
+- **Memory bandwidth:** 2 vec4 loads vs 8 scalar loads (better cache alignment)
+- **Bias integration:** Free via `[..., 1.0]` component (no separate add)
+- **Code simplicity:** Eliminates inner loop, direct indexing with `pos`
+- **Performance:** 2-3× GPU throughput improvement over scalar version
+
+**Weight layout per filter (8 floats):**
+- vec4[0]: [w_r, w_g, w_b, w_d]     (rgba input weights)
+- vec4[1]: [w_u, w_v, w_gray, bias] (uv, grayscale, bias)
+
+**3×3 kernel sizes:**
+- Inner layer (7→4): 72 vec4s (9 pos × 4 ch × 2 vec4 = 2304 bytes)
+- Final layer (7→1): 18 vec4s (9 pos × 1 ch × 2 vec4 = 288 bytes)
+
+---
+
+## References
+
+- **Training Script:** `training/train_cnn.py`
+- **Shader Composition:** `doc/SEQUENCE.md`
+- **Effect System:** `src/gpu/effect.h`
diff --git a/cnn_v1/src/cnn_effect.cc b/cnn_v1/src/cnn_effect.cc
deleted file mode 100644
index 49c5239..0000000
--- a/cnn_v1/src/cnn_effect.cc
+++ /dev/null
@@ -1,129 +0,0 @@
-// CNN post-processing effect implementation
-// Neural network-based stylization with modular WGSL
-
-#include "effects/cnn_effect.h"
-#include "gpu/bind_group_builder.h"
-#include "gpu/effect.h"
-#include "gpu/pipeline_builder.h"
-#include "gpu/post_process_helper.h"
-#include "gpu/sampler_cache.h"
-#include "gpu/shader_composer.h"
-#include "gpu/shaders.h"
-
-// Create custom pipeline with 5 bindings (includes original texture)
-static WGPURenderPipeline create_cnn_pipeline(WGPUDevice device,
-                                              WGPUTextureFormat format,
-                                              const char* shader_code) {
-  WGPUBindGroupLayout bgl =
-      BindGroupLayoutBuilder()
-          .sampler(0, WGPUShaderStage_Fragment)
-          .texture(1, WGPUShaderStage_Fragment)
-          .uniform(2, WGPUShaderStage_Vertex | WGPUShaderStage_Fragment)
-          .uniform(3, WGPUShaderStage_Fragment)
-          .texture(4, WGPUShaderStage_Fragment)
-          .build(device);
-
-  WGPURenderPipeline pipeline = RenderPipelineBuilder(device)
-                                    .shader(shader_code)
-                                    .bind_group_layout(bgl)
-                                    .format(format)
-                                    .build();
-
-  wgpuBindGroupLayoutRelease(bgl);
-  return pipeline;
-}
-
-CNNEffect::CNNEffect(const GpuContext& ctx)
-    : PostProcessEffect(ctx), layer_index_(0), total_layers_(1),
-      blend_amount_(1.0f), input_view_(nullptr), original_view_(nullptr),
-      bind_group_(nullptr) {
-  pipeline_ =
-      create_cnn_pipeline(ctx_.device, ctx_.format, cnn_layer_shader_wgsl);
-}
-
-CNNEffect::CNNEffect(const GpuContext& ctx, const CNNEffectParams& params)
-    : PostProcessEffect(ctx), layer_index_(params.layer_index),
-      total_layers_(params.total_layers), blend_amount_(params.blend_amount),
-      input_view_(nullptr), original_view_(nullptr), bind_group_(nullptr) {
-  pipeline_ =
-      create_cnn_pipeline(ctx_.device, ctx_.format, cnn_layer_shader_wgsl);
-}
-
-void CNNEffect::init(MainSequence* demo) {
-  PostProcessEffect::init(demo);
-  demo_ = demo;
-  params_buffer_.init(ctx_.device);
-
-  // Register auxiliary texture for layer 0 (width_/height_ set by resize())
-  if (layer_index_ == 0) {
-    demo_->register_auxiliary_texture("captured_frame", width_, height_);
-  }
-
-  // Initialize uniforms BEFORE any bind group creation
-  uniforms_.update(ctx_.queue, get_common_uniforms());
-
-  CNNLayerParams params = {layer_index_, blend_amount_, {0.0f, 0.0f}};
-  params_buffer_.update(ctx_.queue, params);
-}
-
-void CNNEffect::resize(int width, int height) {
-  if (width == width_ && height == height_)
-    return;
-
-  PostProcessEffect::resize(width, height);
-
-  // Only layer 0 owns the captured_frame texture
-  if (layer_index_ == 0 && demo_) {
-    demo_->resize_auxiliary_texture("captured_frame", width, height);
-  }
-}
-
-void CNNEffect::render(WGPURenderPassEncoder pass,
-                       const CommonPostProcessUniforms& uniforms) {
-  if (!bind_group_) {
-    fprintf(stderr, "CNN render: no bind_group\n");
-    return;
-  }
-
-  float effective_blend = blend_amount_;
-  if (beat_modulated_) {
-    effective_blend = blend_amount_ * uniforms.beat_phase * beat_scale_;
-  }
-
-  CNNLayerParams params = {layer_index_, effective_blend, {0.0f, 0.0f}};
-  params_buffer_.update(ctx_.queue, params);
-
-  wgpuRenderPassEncoderSetPipeline(pass, pipeline_);
-  wgpuRenderPassEncoderSetBindGroup(pass, 0, bind_group_, 0, nullptr);
-  wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0);
-}
-
-void CNNEffect::update_bind_group(WGPUTextureView input_view) {
-  input_view_ = input_view;
-
-  // Update common uniforms (CRITICAL for UV calculation!)
-  uniforms_.update(ctx_.queue, get_common_uniforms());
-
-  // All layers: get captured frame (original input from layer 0)
-  if (demo_) {
-    original_view_ = demo_->get_auxiliary_view("captured_frame");
-  }
-
-  // Create bind group with original texture
-  if (bind_group_)
-    wgpuBindGroupRelease(bind_group_);
-
-  WGPUBindGroupLayout bgl = wgpuRenderPipelineGetBindGroupLayout(pipeline_, 0);
-  // Use clamp (not repeat) to match PyTorch Conv2d zero-padding behavior
-  WGPUSampler sampler =
-      SamplerCache::Get().get_or_create(ctx_.device, SamplerCache::clamp());
-
-  bind_group_ =
-      BindGroupBuilder()
-          .sampler(0, sampler)
-          .texture(1, input_view_)
-          .buffer(2, uniforms_.get().buffer, uniforms_.get().size)
-          .buffer(3, params_buffer_.get().buffer, params_buffer_.get().size)
-          .texture(4, original_view_ ? original_view_ : input_view_)
-          .build(ctx_.device, bgl);
-}
diff --git a/cnn_v1/src/cnn_effect.h b/cnn_v1/src/cnn_effect.h
deleted file mode 100644
index cdcd656..0000000
--- a/cnn_v1/src/cnn_effect.h
+++ /dev/null
@@ -1,53 +0,0 @@
-// CNN post-processing effect header
-// Multi-layer neural network stylization
-
-#pragma once
-#include "gpu/effect.h"
-#include "gpu/uniform_helper.h"
-
-struct CNNLayerParams {
-  int layer_index;
-  float blend_amount; // Blend: mix(input, output, blend_amount)
-  float _pad[2];
-};
-static_assert(sizeof(CNNLayerParams) == 16);
-
-struct CNNEffectParams {
-  int layer_index = 0;       // Which layer to render (0-based)
-  int total_layers = 1;      // Total number of layers in the CNN
-  float blend_amount = 1.0f; // Final blend with original input
-};
-
-class CNNEffect : public PostProcessEffect {
- public:
-  explicit CNNEffect(const GpuContext& ctx);
-  explicit CNNEffect(const GpuContext& ctx, const CNNEffectParams& params);
-
-  void init(MainSequence* demo) override;
-  void resize(int width, int height) override;
-  void render(WGPURenderPassEncoder pass,
-              const CommonPostProcessUniforms& uniforms) override;
-  void update_bind_group(WGPUTextureView input_view) override;
-
-  // Layer 0 needs framebuffer capture for original input
-  bool needs_framebuffer_capture() const override {
-    return layer_index_ == 0;
-  }
-
-  void set_beat_modulation(bool enabled, float scale = 1.0f) {
-    beat_modulated_ = enabled;
-    beat_scale_ = scale;
-  }
-
- private:
-  int layer_index_;
-  int total_layers_;
-  float blend_amount_;
-  bool beat_modulated_ = false;
-  float beat_scale_ = 1.0f;
-  WGPUTextureView input_view_;
-  WGPUTextureView original_view_;
-  UniformBuffer<CNNLayerParams> params_buffer_;
-  WGPUBindGroup bind_group_;
-  MainSequence* demo_ = nullptr;
-};
diff --git a/cnn_v1/src/cnn_v1_effect.cc b/cnn_v1/src/cnn_v1_effect.cc
new file mode 100644
index 0000000..1f44619
--- /dev/null
+++ b/cnn_v1/src/cnn_v1_effect.cc
@@ -0,0 +1,129 @@
+// CNN post-processing effect implementation
+// Neural network-based stylization with modular WGSL
+
+#include "cnn_v1_effect.h"
+#include "gpu/bind_group_builder.h"
+#include "gpu/effect.h"
+#include "gpu/pipeline_builder.h"
+#include "gpu/post_process_helper.h"
+#include "gpu/sampler_cache.h"
+#include "gpu/shader_composer.h"
+#include "gpu/shaders.h"
+
+// Create custom pipeline with 5 bindings (includes original texture)
+static WGPURenderPipeline create_cnn_pipeline(WGPUDevice device,
+                                              WGPUTextureFormat format,
+                                              const char* shader_code) {
+  WGPUBindGroupLayout bgl =
+      BindGroupLayoutBuilder()
+          .sampler(0, WGPUShaderStage_Fragment)
+          .texture(1, WGPUShaderStage_Fragment)
+          .uniform(2, WGPUShaderStage_Vertex | WGPUShaderStage_Fragment)
+          .uniform(3, WGPUShaderStage_Fragment)
+          .texture(4, WGPUShaderStage_Fragment)
+          .build(device);
+
+  WGPURenderPipeline pipeline = RenderPipelineBuilder(device)
+                                    .shader(shader_code)
+                                    .bind_group_layout(bgl)
+                                    .format(format)
+                                    .build();
+
+  wgpuBindGroupLayoutRelease(bgl);
+  return pipeline;
+}
+
+CNNv1Effect::CNNv1Effect(const GpuContext& ctx)
+    : PostProcessEffect(ctx), layer_index_(0), total_layers_(1),
+      blend_amount_(1.0f), input_view_(nullptr), original_view_(nullptr),
+      bind_group_(nullptr) {
+  pipeline_ =
+      create_cnn_pipeline(ctx_.device, ctx_.format, cnn_layer_shader_wgsl);
+}
+
+CNNv1Effect::CNNv1Effect(const GpuContext& ctx, const CNNv1EffectParams& params)
+    : PostProcessEffect(ctx), layer_index_(params.layer_index),
+      total_layers_(params.total_layers), blend_amount_(params.blend_amount),
+      input_view_(nullptr), original_view_(nullptr), bind_group_(nullptr) {
+  pipeline_ =
+      create_cnn_pipeline(ctx_.device, ctx_.format, cnn_layer_shader_wgsl);
+}
+
+void CNNv1Effect::init(MainSequence* demo) {
+  PostProcessEffect::init(demo);
+  demo_ = demo;
+  params_buffer_.init(ctx_.device);
+
+  // Register auxiliary texture for layer 0 (width_/height_ set by resize())
+  if (layer_index_ == 0) {
+    demo_->register_auxiliary_texture("captured_frame", width_, height_);
+  }
+
+  // Initialize uniforms BEFORE any bind group creation
+  uniforms_.update(ctx_.queue, get_common_uniforms());
+
+  CNNv1LayerParams params = {layer_index_, blend_amount_, {0.0f, 0.0f}};
+  params_buffer_.update(ctx_.queue, params);
+}
+
+void CNNv1Effect::resize(int width, int height) {
+  if (width == width_ && height == height_)
+    return;
+
+  PostProcessEffect::resize(width, height);
+
+  // Only layer 0 owns the captured_frame texture
+  if (layer_index_ == 0 && demo_) {
+    demo_->resize_auxiliary_texture("captured_frame", width, height);
+  }
+}
+
+void CNNv1Effect::render(WGPURenderPassEncoder pass,
+                       const CommonPostProcessUniforms& uniforms) {
+  if (!bind_group_) {
+    fprintf(stderr, "CNN render: no bind_group\n");
+    return;
+  }
+
+  float effective_blend = blend_amount_;
+  if (beat_modulated_) {
+    effective_blend = blend_amount_ * uniforms.beat_phase * beat_scale_;
+  }
+
+  CNNv1LayerParams params = {layer_index_, effective_blend, {0.0f, 0.0f}};
+  params_buffer_.update(ctx_.queue, params);
+
+  wgpuRenderPassEncoderSetPipeline(pass, pipeline_);
+  wgpuRenderPassEncoderSetBindGroup(pass, 0, bind_group_, 0, nullptr);
+  wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0);
+}
+
+void CNNv1Effect::update_bind_group(WGPUTextureView input_view) {
+  input_view_ = input_view;
+
+  // Update common uniforms (CRITICAL for UV calculation!)
+  uniforms_.update(ctx_.queue, get_common_uniforms());
+
+  // All layers: get captured frame (original input from layer 0)
+  if (demo_) {
+    original_view_ = demo_->get_auxiliary_view("captured_frame");
+  }
+
+  // Create bind group with original texture
+  if (bind_group_)
+    wgpuBindGroupRelease(bind_group_);
+
+  WGPUBindGroupLayout bgl = wgpuRenderPipelineGetBindGroupLayout(pipeline_, 0);
+  // Use clamp (not repeat) to match PyTorch Conv2d zero-padding behavior
+  WGPUSampler sampler =
+      SamplerCache::Get().get_or_create(ctx_.device, SamplerCache::clamp());
+
+  bind_group_ =
+      BindGroupBuilder()
+          .sampler(0, sampler)
+          .texture(1, input_view_)
+          .buffer(2, uniforms_.get().buffer, uniforms_.get().size)
+          .buffer(3, params_buffer_.get().buffer, params_buffer_.get().size)
+          .texture(4, original_view_ ? original_view_ : input_view_)
+          .build(ctx_.device, bgl);
+}
diff --git a/cnn_v1/src/cnn_v1_effect.h b/cnn_v1/src/cnn_v1_effect.h
new file mode 100644
index 0000000..e820275
--- /dev/null
+++ b/cnn_v1/src/cnn_v1_effect.h
@@ -0,0 +1,53 @@
+// CNN post-processing effect header
+// Multi-layer neural network stylization
+
+#pragma once
+#include "gpu/effect.h"
+#include "gpu/uniform_helper.h"
+
+struct CNNv1LayerParams {
+  int layer_index;
+  float blend_amount; // Blend: mix(input, output, blend_amount)
+  float _pad[2];
+};
+static_assert(sizeof(CNNv1LayerParams) == 16);
+
+struct CNNv1EffectParams {
+  int layer_index = 0;       // Which layer to render (0-based)
+  int total_layers = 1;      // Total number of layers in the CNN
+  float blend_amount = 1.0f; // Final blend with original input
+};
+
+class CNNv1Effect : public PostProcessEffect {
+ public:
+  explicit CNNv1Effect(const GpuContext& ctx);
+  explicit CNNv1Effect(const GpuContext& ctx, const CNNv1EffectParams& params);
+
+  void init(MainSequence* demo) override;
+  void resize(int width, int height) override;
+  void render(WGPURenderPassEncoder pass,
+              const CommonPostProcessUniforms& uniforms) override;
+  void update_bind_group(WGPUTextureView input_view) override;
+
+  // Layer 0 needs framebuffer capture for original input
+  bool needs_framebuffer_capture() const override {
+    return layer_index_ == 0;
+  }
+
+  void set_beat_modulation(bool enabled, float scale = 1.0f) {
+    beat_modulated_ = enabled;
+    beat_scale_ = scale;
+  }
+
+ private:
+  int layer_index_;
+  int total_layers_;
+  float blend_amount_;
+  bool beat_modulated_ = false;
+  float beat_scale_ = 1.0f;
+  WGPUTextureView input_view_;
+  WGPUTextureView original_view_;
+  UniformBuffer<CNNv1LayerParams> params_buffer_;
+  WGPUBindGroup bind_group_;
+  MainSequence* demo_ = nullptr;
+};
-- 
cgit v1.2.3