From 2f8810f303d06fe78dbec343553c3c97f93f9323 Mon Sep 17 00:00:00 2001 From: skal Date: Sun, 15 Feb 2026 18:55:42 +0100 Subject: refactor(cnn): rename cnn_effect to cnn_v1_effect for clarity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Renamed files and classes: - cnn_effect.{h,cc} → cnn_v1_effect.{h,cc} - CNNEffect → CNNv1Effect - CNNEffectParams → CNNv1EffectParams - CNNLayerParams → CNNv1LayerParams - CNN_EFFECT.md → CNN_V1_EFFECT.md Updated all references: - C++ includes and class usage - CMake source list - Timeline (workspaces/main/timeline.seq) - Test file (test_demo_effects.cc) - Documentation (CLAUDE.md, PROJECT_CONTEXT.md, READMEs) Tests: 34/34 passing (100%) --- cnn_v1/README.md | 2 +- cnn_v1/docs/CNN.md | 2 +- cnn_v1/docs/CNN_EFFECT.md | 400 ------------------------------------ cnn_v1/docs/CNN_FLATTEN_ANALYSIS.md | 6 +- cnn_v1/docs/CNN_V1_EFFECT.md | 400 ++++++++++++++++++++++++++++++++++++ cnn_v1/src/cnn_effect.cc | 129 ------------ cnn_v1/src/cnn_effect.h | 53 ----- cnn_v1/src/cnn_v1_effect.cc | 129 ++++++++++++ cnn_v1/src/cnn_v1_effect.h | 53 +++++ 9 files changed, 587 insertions(+), 587 deletions(-) delete mode 100644 cnn_v1/docs/CNN_EFFECT.md create mode 100644 cnn_v1/docs/CNN_V1_EFFECT.md delete mode 100644 cnn_v1/src/cnn_effect.cc delete mode 100644 cnn_v1/src/cnn_effect.h create mode 100644 cnn_v1/src/cnn_v1_effect.cc create mode 100644 cnn_v1/src/cnn_v1_effect.h (limited to 'cnn_v1') diff --git a/cnn_v1/README.md b/cnn_v1/README.md index ad9b8f3..052f22a 100644 --- a/cnn_v1/README.md +++ b/cnn_v1/README.md @@ -29,7 +29,7 @@ Original CNN implementation with per-layer WGSL shaders. Supports multiple kerne ## Documentation - [CNN.md](docs/CNN.md) - Architecture overview -- [CNN_EFFECT.md](docs/CNN_EFFECT.md) - Implementation details +- [CNN_V1_EFFECT.md](docs/CNN_V1_EFFECT.md) - Implementation details - [CNN_TEST_TOOL.md](docs/CNN_TEST_TOOL.md) - Testing guide - [CNN_DEBUG.md](docs/CNN_DEBUG.md) - Debugging notes diff --git a/cnn_v1/docs/CNN.md b/cnn_v1/docs/CNN.md index 2dc3362..5d9a667 100644 --- a/cnn_v1/docs/CNN.md +++ b/cnn_v1/docs/CNN.md @@ -8,7 +8,7 @@ Have the input 3d scene be processed by a multi-layer CNN trained on the side. Input: some rendered scene. Output: 'stylized' scene with CNN post-processing. -**See `doc/CNN_EFFECT.md` for implementation details, usage, and API reference.** +**See `CNN_V1_EFFECT.md` for implementation details, usage, and API reference.** ## Shader implementation diff --git a/cnn_v1/docs/CNN_EFFECT.md b/cnn_v1/docs/CNN_EFFECT.md deleted file mode 100644 index 40f095e..0000000 --- a/cnn_v1/docs/CNN_EFFECT.md +++ /dev/null @@ -1,400 +0,0 @@ -# CNN Post-Processing Effect - -Neural network-based stylization for rendered scenes. - ---- - -## Overview - -Trainable convolutional neural network layers for artistic stylization (painterly, sketch, cel-shaded effects) with minimal runtime overhead. - -**Key Features:** -- Position-aware layer 0 (coordinate input for vignetting, edge effects) -- Multi-layer convolutions (3×3, 5×5, 7×7 kernels) with automatic chaining -- Original input available to all layers via framebuffer capture -- Configurable final blend with original scene -- Modular WGSL shader architecture -- Hardcoded weights (trained offline via PyTorch) -- ~5-8 KB binary footprint - ---- - -## Architecture - -### RGBD → Grayscale Pipeline - -**Input:** RGBD (RGB + inverse depth D=1/z) -**Output:** Grayscale (1 channel) -**Layer Input:** 7 channels = [RGBD, UV coords, grayscale] all normalized to [-1,1] - -**Architecture:** -- **Inner layers (0..N-2):** Conv2d(7→4) - output RGBD -- **Final layer (N-1):** Conv2d(7→1) - output grayscale - -```wgsl -// Inner layers: 7→4 (RGBD output, vec4-optimized) -fn cnn_conv3x3_7to4( - tex: texture_2d, - samp: sampler, - uv: vec2, - resolution: vec2, - gray: f32, # Grayscale [-1,1] - weights: array, 72> # 9 pos × 4 ch × 2 vec4 (8 floats per filter) -) -> vec4 - -// Final layer: 7→1 (grayscale output, vec4-optimized) -fn cnn_conv3x3_7to1( - tex: texture_2d, - samp: sampler, - uv: vec2, - resolution: vec2, - gray: f32, - weights: array, 18> # 9 pos × 2 vec4 (8 floats per filter) -) -> f32 -``` - -**Input normalization:** -- **fs_main** normalizes textures once: `(tex - 0.5) * 2` → [-1,1] -- **Conv functions** normalize UV coords: `(uv - 0.5) * 2` → [-1,1] -- **Grayscale** computed once in fs_main using dot product: `dot(original.rgb, vec3(0.2126, 0.7152, 0.0722))` -- **Inter-layer data** stays in [-1,1] (no denormalization) -- **Final output** denormalized for display: `(result + 1.0) * 0.5` → [0,1] - -**Activation:** tanh for inner layers (output stays [-1,1]), none for final layer - -### Multi-Layer Architecture - -CNNEffect supports multi-layer networks via automatic effect chaining: - -1. **Timeline specifies total layers**: `CNNEffect layers=3 blend=0.7` -2. **Compiler expands to chain**: 3 separate CNNEffect instances (layer 0→1→2) -3. **Framebuffer capture**: Layer 0 captures original input to `"captured_frame"` -4. **Original input binding**: All layers access original via `@binding(4)` -5. **Final blend**: Last layer blends result with original: `mix(original, result, 0.7)` - -**Framebuffer Capture API:** -- `Effect::needs_framebuffer_capture()` - effect requests pre-capture -- MainSequence automatically blits input → `"captured_frame"` auxiliary texture -- Generic mechanism usable by any effect - -### File Structure - -``` -src/effects/ - cnn_effect.h/cc # CNNEffect class + framebuffer capture - -workspaces/main/shaders/cnn/ - cnn_activation.wgsl # tanh, ReLU, sigmoid, leaky_relu - cnn_conv3x3.wgsl # 3×3 convolution (standard + coord-aware) - cnn_conv5x5.wgsl # 5×5 convolution (standard + coord-aware) - cnn_conv7x7.wgsl # 7×7 convolution (standard + coord-aware) - cnn_weights_generated.wgsl # Weight arrays (auto-generated by train_cnn.py) - cnn_layer.wgsl # Main shader with layer switches (auto-generated by train_cnn.py) -``` - ---- - -## Training Workflow - -### 1. Prepare Training Data - -Input/target image pairs: -``` -training/input/img_000.png # RGBA (RGB + alpha) -training/output/img_000.png # Grayscale target -``` - -**Note:** Alpha channel can be depth (1/z) or constant (255). Network learns from RGB primarily. - -### 2. Train Network - -**Patch-based (Recommended)** - Preserves natural pixel scale: -```bash -python3 training/train_cnn.py \ - --input training/input --target training/output \ - --patch-size 32 --patches-per-image 64 --detector harris \ - --layers 3 --kernel-sizes 3,5,3 \ - --epochs 5000 --batch-size 16 --checkpoint-every 1000 -``` - -**Detectors:** `harris` (corners), `fast` (features), `shi-tomasi` (corners), `gradient` (edges) - -**Full-image (Legacy)** - Resizes to 256×256: -```bash -python3 training/train_cnn.py \ - --input training/input --target training/output \ - --layers 3 --kernel-sizes 3,5,3 \ - --epochs 10000 --batch-size 8 --checkpoint-every 1000 -``` - -**Auto-generates:** -- `cnn_weights_generated.wgsl` - Weight arrays -- `cnn_layer.wgsl` - Layer shader - -### 3. Export & Validate - -```bash -# Export shaders -./training/train_cnn.py --export-only checkpoints/checkpoint_epoch_5000.pth - -# Generate ground truth -./training/train_cnn.py --infer input.png \ - --export-only checkpoints/checkpoint_epoch_5000.pth --output ground_truth.png -``` - -### 4. Rebuild Demo - -```bash -cmake --build build -j4 && ./build/demo64k -``` - ---- - -## Usage - -### C++ Integration - -**Single layer (manual):** -```cpp -#include "effects/cnn_effect.h" - -CNNEffectParams p; -p.layer_index = 0; -p.total_layers = 1; -p.blend_amount = 1.0f; -auto cnn = std::make_shared(ctx, p); -timeline.add_effect(cnn, start_time, end_time); -``` - -**Multi-layer (automatic via timeline compiler):** - -Use timeline syntax - `seq_compiler` expands to multiple instances. - -### Timeline Examples - -**Single-layer CNN (full stylization):** -``` -SEQUENCE 10.0 0 - EFFECT + Hybrid3DEffect 0.00 5.00 - EFFECT + CNNEffect 0.50 5.00 layers=1 -``` - -**Multi-layer CNN with blend:** -``` -SEQUENCE 10.0 0 - EFFECT + Hybrid3DEffect 0.00 5.00 - EFFECT + CNNEffect 0.50 5.00 layers=3 blend=0.7 -``` - -Expands to: -```cpp -// Layer 0 (captures original, blend=1.0) -{ - CNNEffectParams p; - p.layer_index = 0; - p.total_layers = 3; - p.blend_amount = 1.0f; - seq->add_effect(std::make_shared(ctx, p), 0.50f, 5.00f, 1); -} -// Layer 1 (blend=1.0) -{ - CNNEffectParams p; - p.layer_index = 1; - p.total_layers = 3; - p.blend_amount = 1.0f; - seq->add_effect(std::make_shared(ctx, p), 0.50f, 5.00f, 2); -} -// Layer 2 (final blend=0.7) -{ - CNNEffectParams p; - p.layer_index = 2; - p.total_layers = 3; - p.blend_amount = 0.7f; - seq->add_effect(std::make_shared(ctx, p), 0.50f, 5.00f, 3); -} -``` - ---- - -## Shader Structure - -**Bindings:** -```wgsl -@group(0) @binding(0) var smplr: sampler; -@group(0) @binding(1) var txt: texture_2d; // Current layer input -@group(0) @binding(2) var uniforms: CommonUniforms; -@group(0) @binding(3) var params: CNNLayerParams; -@group(0) @binding(4) var original_input: texture_2d; // Layer 0 input (captured) -``` - -**Fragment shader logic:** -```wgsl -@fragment fn fs_main(@builtin(position) p: vec4) -> @location(0) vec4 { - let uv = p.xy / uniforms.resolution; - let original_raw = textureSample(original_input, smplr, uv); - let original = (original_raw - 0.5) * 2.0; // Normalize to [-1,1] - let gray = dot(original.rgb, vec3(0.2126, 0.7152, 0.0722)); - var result = vec4(0.0); - - if (params.layer_index == 0) { - result = cnn_conv3x3_7to4_src(txt, smplr, uv, uniforms.resolution, - weights_layer0); - result = cnn_tanh(result); - } - else if (params.layer_index == 1) { - result = cnn_conv5x5_7to4(txt, smplr, uv, uniforms.resolution, - gray, weights_layer1); - result = cnn_tanh(result); - } - // ... other layers - - // Blend with ORIGINAL input (not previous layer) - return mix(original_raw, result, params.blend_amount); -} -``` - -**Weight Storage (vec4-optimized):** - -**Inner layers (7→4 RGBD output):** -```wgsl -// Structure: array, 72> -// 9 pos × 4 ch × 2 vec4 (8 floats per filter: [rgba][uv,gray,1]) -const weights_layer0: array, 72> = array( - vec4(w0_r, w0_g, w0_b, w0_d), // pos0_ch0 (rgba weights) - vec4(w0_u, w0_v, w0_gray, bias0), // pos0_ch0 (uv, gray, bias) - vec4(w1_r, w1_g, w1_b, w1_d), // pos0_ch1 (rgba weights) - vec4(w1_u, w1_v, w1_gray, bias1), // pos0_ch1 (uv, gray, bias) - // ... 68 more vec4s -); -``` - -**Final layer (7→1 grayscale output):** -```wgsl -// Structure: array, 18> -// 9 pos × 2 vec4 (8 floats per filter: [rgba][uv,gray,1]) -const weights_layerN: array, 18> = array( - vec4(w0_r, w0_g, w0_b, w0_d), // pos0 (rgba weights) - vec4(w0_u, w0_v, w0_gray, bias0), // pos0 (uv, gray, bias) - // ... 16 more vec4s -); -``` - -**Optimization:** Bias integrated as 4th component via `vec4(uv, gray, 1.0)` input. Two dot4 operations replace 8 scalar MADs. - ---- - -## Size Budget - -| Component | Size | Notes | -|-----------|------|-------| -| Activation functions | ~200 B | 4 functions | -| Conv3x3 (standard + coord) | ~500 B | Both variants | -| Conv5x5 (standard + coord) | ~700 B | Both variants | -| Conv7x7 (standard + coord) | ~900 B | Both variants | -| Main shader | ~800 B | Layer composition | -| C++ implementation | ~300 B | Effect class | -| **Coord weights** | **+32 B** | Per-layer overhead (layer 0 only) | -| **RGBA weights** | **2-6 KB** | Depends on depth/kernel sizes | -| **Total** | **5-9 KB** | Acceptable for 64k | - -**Optimization strategies:** -- Quantize weights (float32 → int8) -- Prune near-zero weights -- Use separable convolutions - ---- - -## Testing - -```bash -./build/test_demo_effects # CNN construction/shader tests -./build/demo64k # Visual test -``` - ---- - -## Blend Parameter Behavior - -**blend_amount** controls final compositing with original: -- `blend=0.0`: Pure original (no CNN effect) -- `blend=0.5`: 50% original + 50% CNN -- `blend=1.0`: Pure CNN output (full stylization) - -**Important:** Blend uses captured layer 0 input, not previous layer output. - -**Example use cases:** -- `blend=1.0`: Full stylization (default) -- `blend=0.7`: Subtle effect preserving original details -- `blend=0.3`: Light artistic touch - -## Troubleshooting - -**Shader compilation fails:** -- Check `cnn_weights_generated.wgsl` syntax -- Verify snippets registered in `shaders.cc::InitShaderComposer()` -- Ensure `cnn_layer.wgsl` has 5 bindings (including `original_input`) - -**Black/corrupted output:** -- Weights untrained (identity placeholder) -- Check `captured_frame` auxiliary texture is registered -- Verify layer priorities in timeline are sequential - -**Wrong blend result:** -- Ensure layer 0 has `needs_framebuffer_capture() == true` -- Check MainSequence framebuffer capture logic -- Verify `original_input` binding is populated - -**Training loss not decreasing:** -- Lower learning rate (`--learning-rate 0.0001`) -- More epochs (`--epochs 1000`) -- Check input/target image alignment - ---- - -## Vec4 Optimization - -**Architecture:** Weights stored as vec4 pairs for SIMD efficiency. - -**Input representation:** -```wgsl -let rgbd = textureSample(...); // vec4: [r, g, b, d] -let in1 = vec4(uv_norm, gray, 1.0); // vec4: [u, v, gray, 1.0] -``` - -**Weight indexing:** -```wgsl -var pos = 0; // Direct weight array index -for (var dy = -1; dy <= 1; dy++) { - for (var dx = -1; dx <= 1; dx++) { - // Unrolled channel loop (4 output channels) - sum.r += dot(weights[pos+0], rgbd) + dot(weights[pos+1], in1); - sum.g += dot(weights[pos+2], rgbd) + dot(weights[pos+3], in1); - sum.b += dot(weights[pos+4], rgbd) + dot(weights[pos+5], in1); - sum.a += dot(weights[pos+6], rgbd) + dot(weights[pos+7], in1); - pos += 8; // 4 channels × 2 vec4s per channel - } -} -``` - -**Benefits:** -- **SIMD-native:** GPU executes `dot(vec4, vec4)` as single instruction (4 parallel MADs) -- **Memory bandwidth:** 2 vec4 loads vs 8 scalar loads (better cache alignment) -- **Bias integration:** Free via `[..., 1.0]` component (no separate add) -- **Code simplicity:** Eliminates inner loop, direct indexing with `pos` -- **Performance:** 2-3× GPU throughput improvement over scalar version - -**Weight layout per filter (8 floats):** -- vec4[0]: [w_r, w_g, w_b, w_d] (rgba input weights) -- vec4[1]: [w_u, w_v, w_gray, bias] (uv, grayscale, bias) - -**3×3 kernel sizes:** -- Inner layer (7→4): 72 vec4s (9 pos × 4 ch × 2 vec4 = 2304 bytes) -- Final layer (7→1): 18 vec4s (9 pos × 1 ch × 2 vec4 = 288 bytes) - ---- - -## References - -- **Training Script:** `training/train_cnn.py` -- **Shader Composition:** `doc/SEQUENCE.md` -- **Effect System:** `src/gpu/effect.h` diff --git a/cnn_v1/docs/CNN_FLATTEN_ANALYSIS.md b/cnn_v1/docs/CNN_FLATTEN_ANALYSIS.md index bf63c5d..8664157 100644 --- a/cnn_v1/docs/CNN_FLATTEN_ANALYSIS.md +++ b/cnn_v1/docs/CNN_FLATTEN_ANALYSIS.md @@ -183,7 +183,7 @@ These yield better size/performance than shader architecture changes. ## References -- `doc/CNN_EFFECT.md` - CNN implementation details -- `doc/CNN.md` - High-level CNN design -- `src/effects/cnn_effect.cc` - Current implementation +- `CNN_V1_EFFECT.md` - CNN implementation details +- `CNN.md` - High-level CNN design +- `../src/cnn_effect.cc` - Current implementation - `workspaces/main/shaders/cnn_*.wgsl` - Shader snippets diff --git a/cnn_v1/docs/CNN_V1_EFFECT.md b/cnn_v1/docs/CNN_V1_EFFECT.md new file mode 100644 index 0000000..40f095e --- /dev/null +++ b/cnn_v1/docs/CNN_V1_EFFECT.md @@ -0,0 +1,400 @@ +# CNN Post-Processing Effect + +Neural network-based stylization for rendered scenes. + +--- + +## Overview + +Trainable convolutional neural network layers for artistic stylization (painterly, sketch, cel-shaded effects) with minimal runtime overhead. + +**Key Features:** +- Position-aware layer 0 (coordinate input for vignetting, edge effects) +- Multi-layer convolutions (3×3, 5×5, 7×7 kernels) with automatic chaining +- Original input available to all layers via framebuffer capture +- Configurable final blend with original scene +- Modular WGSL shader architecture +- Hardcoded weights (trained offline via PyTorch) +- ~5-8 KB binary footprint + +--- + +## Architecture + +### RGBD → Grayscale Pipeline + +**Input:** RGBD (RGB + inverse depth D=1/z) +**Output:** Grayscale (1 channel) +**Layer Input:** 7 channels = [RGBD, UV coords, grayscale] all normalized to [-1,1] + +**Architecture:** +- **Inner layers (0..N-2):** Conv2d(7→4) - output RGBD +- **Final layer (N-1):** Conv2d(7→1) - output grayscale + +```wgsl +// Inner layers: 7→4 (RGBD output, vec4-optimized) +fn cnn_conv3x3_7to4( + tex: texture_2d, + samp: sampler, + uv: vec2, + resolution: vec2, + gray: f32, # Grayscale [-1,1] + weights: array, 72> # 9 pos × 4 ch × 2 vec4 (8 floats per filter) +) -> vec4 + +// Final layer: 7→1 (grayscale output, vec4-optimized) +fn cnn_conv3x3_7to1( + tex: texture_2d, + samp: sampler, + uv: vec2, + resolution: vec2, + gray: f32, + weights: array, 18> # 9 pos × 2 vec4 (8 floats per filter) +) -> f32 +``` + +**Input normalization:** +- **fs_main** normalizes textures once: `(tex - 0.5) * 2` → [-1,1] +- **Conv functions** normalize UV coords: `(uv - 0.5) * 2` → [-1,1] +- **Grayscale** computed once in fs_main using dot product: `dot(original.rgb, vec3(0.2126, 0.7152, 0.0722))` +- **Inter-layer data** stays in [-1,1] (no denormalization) +- **Final output** denormalized for display: `(result + 1.0) * 0.5` → [0,1] + +**Activation:** tanh for inner layers (output stays [-1,1]), none for final layer + +### Multi-Layer Architecture + +CNNEffect supports multi-layer networks via automatic effect chaining: + +1. **Timeline specifies total layers**: `CNNEffect layers=3 blend=0.7` +2. **Compiler expands to chain**: 3 separate CNNEffect instances (layer 0→1→2) +3. **Framebuffer capture**: Layer 0 captures original input to `"captured_frame"` +4. **Original input binding**: All layers access original via `@binding(4)` +5. **Final blend**: Last layer blends result with original: `mix(original, result, 0.7)` + +**Framebuffer Capture API:** +- `Effect::needs_framebuffer_capture()` - effect requests pre-capture +- MainSequence automatically blits input → `"captured_frame"` auxiliary texture +- Generic mechanism usable by any effect + +### File Structure + +``` +src/effects/ + cnn_effect.h/cc # CNNEffect class + framebuffer capture + +workspaces/main/shaders/cnn/ + cnn_activation.wgsl # tanh, ReLU, sigmoid, leaky_relu + cnn_conv3x3.wgsl # 3×3 convolution (standard + coord-aware) + cnn_conv5x5.wgsl # 5×5 convolution (standard + coord-aware) + cnn_conv7x7.wgsl # 7×7 convolution (standard + coord-aware) + cnn_weights_generated.wgsl # Weight arrays (auto-generated by train_cnn.py) + cnn_layer.wgsl # Main shader with layer switches (auto-generated by train_cnn.py) +``` + +--- + +## Training Workflow + +### 1. Prepare Training Data + +Input/target image pairs: +``` +training/input/img_000.png # RGBA (RGB + alpha) +training/output/img_000.png # Grayscale target +``` + +**Note:** Alpha channel can be depth (1/z) or constant (255). Network learns from RGB primarily. + +### 2. Train Network + +**Patch-based (Recommended)** - Preserves natural pixel scale: +```bash +python3 training/train_cnn.py \ + --input training/input --target training/output \ + --patch-size 32 --patches-per-image 64 --detector harris \ + --layers 3 --kernel-sizes 3,5,3 \ + --epochs 5000 --batch-size 16 --checkpoint-every 1000 +``` + +**Detectors:** `harris` (corners), `fast` (features), `shi-tomasi` (corners), `gradient` (edges) + +**Full-image (Legacy)** - Resizes to 256×256: +```bash +python3 training/train_cnn.py \ + --input training/input --target training/output \ + --layers 3 --kernel-sizes 3,5,3 \ + --epochs 10000 --batch-size 8 --checkpoint-every 1000 +``` + +**Auto-generates:** +- `cnn_weights_generated.wgsl` - Weight arrays +- `cnn_layer.wgsl` - Layer shader + +### 3. Export & Validate + +```bash +# Export shaders +./training/train_cnn.py --export-only checkpoints/checkpoint_epoch_5000.pth + +# Generate ground truth +./training/train_cnn.py --infer input.png \ + --export-only checkpoints/checkpoint_epoch_5000.pth --output ground_truth.png +``` + +### 4. Rebuild Demo + +```bash +cmake --build build -j4 && ./build/demo64k +``` + +--- + +## Usage + +### C++ Integration + +**Single layer (manual):** +```cpp +#include "effects/cnn_effect.h" + +CNNEffectParams p; +p.layer_index = 0; +p.total_layers = 1; +p.blend_amount = 1.0f; +auto cnn = std::make_shared(ctx, p); +timeline.add_effect(cnn, start_time, end_time); +``` + +**Multi-layer (automatic via timeline compiler):** + +Use timeline syntax - `seq_compiler` expands to multiple instances. + +### Timeline Examples + +**Single-layer CNN (full stylization):** +``` +SEQUENCE 10.0 0 + EFFECT + Hybrid3DEffect 0.00 5.00 + EFFECT + CNNEffect 0.50 5.00 layers=1 +``` + +**Multi-layer CNN with blend:** +``` +SEQUENCE 10.0 0 + EFFECT + Hybrid3DEffect 0.00 5.00 + EFFECT + CNNEffect 0.50 5.00 layers=3 blend=0.7 +``` + +Expands to: +```cpp +// Layer 0 (captures original, blend=1.0) +{ + CNNEffectParams p; + p.layer_index = 0; + p.total_layers = 3; + p.blend_amount = 1.0f; + seq->add_effect(std::make_shared(ctx, p), 0.50f, 5.00f, 1); +} +// Layer 1 (blend=1.0) +{ + CNNEffectParams p; + p.layer_index = 1; + p.total_layers = 3; + p.blend_amount = 1.0f; + seq->add_effect(std::make_shared(ctx, p), 0.50f, 5.00f, 2); +} +// Layer 2 (final blend=0.7) +{ + CNNEffectParams p; + p.layer_index = 2; + p.total_layers = 3; + p.blend_amount = 0.7f; + seq->add_effect(std::make_shared(ctx, p), 0.50f, 5.00f, 3); +} +``` + +--- + +## Shader Structure + +**Bindings:** +```wgsl +@group(0) @binding(0) var smplr: sampler; +@group(0) @binding(1) var txt: texture_2d; // Current layer input +@group(0) @binding(2) var uniforms: CommonUniforms; +@group(0) @binding(3) var params: CNNLayerParams; +@group(0) @binding(4) var original_input: texture_2d; // Layer 0 input (captured) +``` + +**Fragment shader logic:** +```wgsl +@fragment fn fs_main(@builtin(position) p: vec4) -> @location(0) vec4 { + let uv = p.xy / uniforms.resolution; + let original_raw = textureSample(original_input, smplr, uv); + let original = (original_raw - 0.5) * 2.0; // Normalize to [-1,1] + let gray = dot(original.rgb, vec3(0.2126, 0.7152, 0.0722)); + var result = vec4(0.0); + + if (params.layer_index == 0) { + result = cnn_conv3x3_7to4_src(txt, smplr, uv, uniforms.resolution, + weights_layer0); + result = cnn_tanh(result); + } + else if (params.layer_index == 1) { + result = cnn_conv5x5_7to4(txt, smplr, uv, uniforms.resolution, + gray, weights_layer1); + result = cnn_tanh(result); + } + // ... other layers + + // Blend with ORIGINAL input (not previous layer) + return mix(original_raw, result, params.blend_amount); +} +``` + +**Weight Storage (vec4-optimized):** + +**Inner layers (7→4 RGBD output):** +```wgsl +// Structure: array, 72> +// 9 pos × 4 ch × 2 vec4 (8 floats per filter: [rgba][uv,gray,1]) +const weights_layer0: array, 72> = array( + vec4(w0_r, w0_g, w0_b, w0_d), // pos0_ch0 (rgba weights) + vec4(w0_u, w0_v, w0_gray, bias0), // pos0_ch0 (uv, gray, bias) + vec4(w1_r, w1_g, w1_b, w1_d), // pos0_ch1 (rgba weights) + vec4(w1_u, w1_v, w1_gray, bias1), // pos0_ch1 (uv, gray, bias) + // ... 68 more vec4s +); +``` + +**Final layer (7→1 grayscale output):** +```wgsl +// Structure: array, 18> +// 9 pos × 2 vec4 (8 floats per filter: [rgba][uv,gray,1]) +const weights_layerN: array, 18> = array( + vec4(w0_r, w0_g, w0_b, w0_d), // pos0 (rgba weights) + vec4(w0_u, w0_v, w0_gray, bias0), // pos0 (uv, gray, bias) + // ... 16 more vec4s +); +``` + +**Optimization:** Bias integrated as 4th component via `vec4(uv, gray, 1.0)` input. Two dot4 operations replace 8 scalar MADs. + +--- + +## Size Budget + +| Component | Size | Notes | +|-----------|------|-------| +| Activation functions | ~200 B | 4 functions | +| Conv3x3 (standard + coord) | ~500 B | Both variants | +| Conv5x5 (standard + coord) | ~700 B | Both variants | +| Conv7x7 (standard + coord) | ~900 B | Both variants | +| Main shader | ~800 B | Layer composition | +| C++ implementation | ~300 B | Effect class | +| **Coord weights** | **+32 B** | Per-layer overhead (layer 0 only) | +| **RGBA weights** | **2-6 KB** | Depends on depth/kernel sizes | +| **Total** | **5-9 KB** | Acceptable for 64k | + +**Optimization strategies:** +- Quantize weights (float32 → int8) +- Prune near-zero weights +- Use separable convolutions + +--- + +## Testing + +```bash +./build/test_demo_effects # CNN construction/shader tests +./build/demo64k # Visual test +``` + +--- + +## Blend Parameter Behavior + +**blend_amount** controls final compositing with original: +- `blend=0.0`: Pure original (no CNN effect) +- `blend=0.5`: 50% original + 50% CNN +- `blend=1.0`: Pure CNN output (full stylization) + +**Important:** Blend uses captured layer 0 input, not previous layer output. + +**Example use cases:** +- `blend=1.0`: Full stylization (default) +- `blend=0.7`: Subtle effect preserving original details +- `blend=0.3`: Light artistic touch + +## Troubleshooting + +**Shader compilation fails:** +- Check `cnn_weights_generated.wgsl` syntax +- Verify snippets registered in `shaders.cc::InitShaderComposer()` +- Ensure `cnn_layer.wgsl` has 5 bindings (including `original_input`) + +**Black/corrupted output:** +- Weights untrained (identity placeholder) +- Check `captured_frame` auxiliary texture is registered +- Verify layer priorities in timeline are sequential + +**Wrong blend result:** +- Ensure layer 0 has `needs_framebuffer_capture() == true` +- Check MainSequence framebuffer capture logic +- Verify `original_input` binding is populated + +**Training loss not decreasing:** +- Lower learning rate (`--learning-rate 0.0001`) +- More epochs (`--epochs 1000`) +- Check input/target image alignment + +--- + +## Vec4 Optimization + +**Architecture:** Weights stored as vec4 pairs for SIMD efficiency. + +**Input representation:** +```wgsl +let rgbd = textureSample(...); // vec4: [r, g, b, d] +let in1 = vec4(uv_norm, gray, 1.0); // vec4: [u, v, gray, 1.0] +``` + +**Weight indexing:** +```wgsl +var pos = 0; // Direct weight array index +for (var dy = -1; dy <= 1; dy++) { + for (var dx = -1; dx <= 1; dx++) { + // Unrolled channel loop (4 output channels) + sum.r += dot(weights[pos+0], rgbd) + dot(weights[pos+1], in1); + sum.g += dot(weights[pos+2], rgbd) + dot(weights[pos+3], in1); + sum.b += dot(weights[pos+4], rgbd) + dot(weights[pos+5], in1); + sum.a += dot(weights[pos+6], rgbd) + dot(weights[pos+7], in1); + pos += 8; // 4 channels × 2 vec4s per channel + } +} +``` + +**Benefits:** +- **SIMD-native:** GPU executes `dot(vec4, vec4)` as single instruction (4 parallel MADs) +- **Memory bandwidth:** 2 vec4 loads vs 8 scalar loads (better cache alignment) +- **Bias integration:** Free via `[..., 1.0]` component (no separate add) +- **Code simplicity:** Eliminates inner loop, direct indexing with `pos` +- **Performance:** 2-3× GPU throughput improvement over scalar version + +**Weight layout per filter (8 floats):** +- vec4[0]: [w_r, w_g, w_b, w_d] (rgba input weights) +- vec4[1]: [w_u, w_v, w_gray, bias] (uv, grayscale, bias) + +**3×3 kernel sizes:** +- Inner layer (7→4): 72 vec4s (9 pos × 4 ch × 2 vec4 = 2304 bytes) +- Final layer (7→1): 18 vec4s (9 pos × 1 ch × 2 vec4 = 288 bytes) + +--- + +## References + +- **Training Script:** `training/train_cnn.py` +- **Shader Composition:** `doc/SEQUENCE.md` +- **Effect System:** `src/gpu/effect.h` diff --git a/cnn_v1/src/cnn_effect.cc b/cnn_v1/src/cnn_effect.cc deleted file mode 100644 index 49c5239..0000000 --- a/cnn_v1/src/cnn_effect.cc +++ /dev/null @@ -1,129 +0,0 @@ -// CNN post-processing effect implementation -// Neural network-based stylization with modular WGSL - -#include "effects/cnn_effect.h" -#include "gpu/bind_group_builder.h" -#include "gpu/effect.h" -#include "gpu/pipeline_builder.h" -#include "gpu/post_process_helper.h" -#include "gpu/sampler_cache.h" -#include "gpu/shader_composer.h" -#include "gpu/shaders.h" - -// Create custom pipeline with 5 bindings (includes original texture) -static WGPURenderPipeline create_cnn_pipeline(WGPUDevice device, - WGPUTextureFormat format, - const char* shader_code) { - WGPUBindGroupLayout bgl = - BindGroupLayoutBuilder() - .sampler(0, WGPUShaderStage_Fragment) - .texture(1, WGPUShaderStage_Fragment) - .uniform(2, WGPUShaderStage_Vertex | WGPUShaderStage_Fragment) - .uniform(3, WGPUShaderStage_Fragment) - .texture(4, WGPUShaderStage_Fragment) - .build(device); - - WGPURenderPipeline pipeline = RenderPipelineBuilder(device) - .shader(shader_code) - .bind_group_layout(bgl) - .format(format) - .build(); - - wgpuBindGroupLayoutRelease(bgl); - return pipeline; -} - -CNNEffect::CNNEffect(const GpuContext& ctx) - : PostProcessEffect(ctx), layer_index_(0), total_layers_(1), - blend_amount_(1.0f), input_view_(nullptr), original_view_(nullptr), - bind_group_(nullptr) { - pipeline_ = - create_cnn_pipeline(ctx_.device, ctx_.format, cnn_layer_shader_wgsl); -} - -CNNEffect::CNNEffect(const GpuContext& ctx, const CNNEffectParams& params) - : PostProcessEffect(ctx), layer_index_(params.layer_index), - total_layers_(params.total_layers), blend_amount_(params.blend_amount), - input_view_(nullptr), original_view_(nullptr), bind_group_(nullptr) { - pipeline_ = - create_cnn_pipeline(ctx_.device, ctx_.format, cnn_layer_shader_wgsl); -} - -void CNNEffect::init(MainSequence* demo) { - PostProcessEffect::init(demo); - demo_ = demo; - params_buffer_.init(ctx_.device); - - // Register auxiliary texture for layer 0 (width_/height_ set by resize()) - if (layer_index_ == 0) { - demo_->register_auxiliary_texture("captured_frame", width_, height_); - } - - // Initialize uniforms BEFORE any bind group creation - uniforms_.update(ctx_.queue, get_common_uniforms()); - - CNNLayerParams params = {layer_index_, blend_amount_, {0.0f, 0.0f}}; - params_buffer_.update(ctx_.queue, params); -} - -void CNNEffect::resize(int width, int height) { - if (width == width_ && height == height_) - return; - - PostProcessEffect::resize(width, height); - - // Only layer 0 owns the captured_frame texture - if (layer_index_ == 0 && demo_) { - demo_->resize_auxiliary_texture("captured_frame", width, height); - } -} - -void CNNEffect::render(WGPURenderPassEncoder pass, - const CommonPostProcessUniforms& uniforms) { - if (!bind_group_) { - fprintf(stderr, "CNN render: no bind_group\n"); - return; - } - - float effective_blend = blend_amount_; - if (beat_modulated_) { - effective_blend = blend_amount_ * uniforms.beat_phase * beat_scale_; - } - - CNNLayerParams params = {layer_index_, effective_blend, {0.0f, 0.0f}}; - params_buffer_.update(ctx_.queue, params); - - wgpuRenderPassEncoderSetPipeline(pass, pipeline_); - wgpuRenderPassEncoderSetBindGroup(pass, 0, bind_group_, 0, nullptr); - wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0); -} - -void CNNEffect::update_bind_group(WGPUTextureView input_view) { - input_view_ = input_view; - - // Update common uniforms (CRITICAL for UV calculation!) - uniforms_.update(ctx_.queue, get_common_uniforms()); - - // All layers: get captured frame (original input from layer 0) - if (demo_) { - original_view_ = demo_->get_auxiliary_view("captured_frame"); - } - - // Create bind group with original texture - if (bind_group_) - wgpuBindGroupRelease(bind_group_); - - WGPUBindGroupLayout bgl = wgpuRenderPipelineGetBindGroupLayout(pipeline_, 0); - // Use clamp (not repeat) to match PyTorch Conv2d zero-padding behavior - WGPUSampler sampler = - SamplerCache::Get().get_or_create(ctx_.device, SamplerCache::clamp()); - - bind_group_ = - BindGroupBuilder() - .sampler(0, sampler) - .texture(1, input_view_) - .buffer(2, uniforms_.get().buffer, uniforms_.get().size) - .buffer(3, params_buffer_.get().buffer, params_buffer_.get().size) - .texture(4, original_view_ ? original_view_ : input_view_) - .build(ctx_.device, bgl); -} diff --git a/cnn_v1/src/cnn_effect.h b/cnn_v1/src/cnn_effect.h deleted file mode 100644 index cdcd656..0000000 --- a/cnn_v1/src/cnn_effect.h +++ /dev/null @@ -1,53 +0,0 @@ -// CNN post-processing effect header -// Multi-layer neural network stylization - -#pragma once -#include "gpu/effect.h" -#include "gpu/uniform_helper.h" - -struct CNNLayerParams { - int layer_index; - float blend_amount; // Blend: mix(input, output, blend_amount) - float _pad[2]; -}; -static_assert(sizeof(CNNLayerParams) == 16); - -struct CNNEffectParams { - int layer_index = 0; // Which layer to render (0-based) - int total_layers = 1; // Total number of layers in the CNN - float blend_amount = 1.0f; // Final blend with original input -}; - -class CNNEffect : public PostProcessEffect { - public: - explicit CNNEffect(const GpuContext& ctx); - explicit CNNEffect(const GpuContext& ctx, const CNNEffectParams& params); - - void init(MainSequence* demo) override; - void resize(int width, int height) override; - void render(WGPURenderPassEncoder pass, - const CommonPostProcessUniforms& uniforms) override; - void update_bind_group(WGPUTextureView input_view) override; - - // Layer 0 needs framebuffer capture for original input - bool needs_framebuffer_capture() const override { - return layer_index_ == 0; - } - - void set_beat_modulation(bool enabled, float scale = 1.0f) { - beat_modulated_ = enabled; - beat_scale_ = scale; - } - - private: - int layer_index_; - int total_layers_; - float blend_amount_; - bool beat_modulated_ = false; - float beat_scale_ = 1.0f; - WGPUTextureView input_view_; - WGPUTextureView original_view_; - UniformBuffer params_buffer_; - WGPUBindGroup bind_group_; - MainSequence* demo_ = nullptr; -}; diff --git a/cnn_v1/src/cnn_v1_effect.cc b/cnn_v1/src/cnn_v1_effect.cc new file mode 100644 index 0000000..1f44619 --- /dev/null +++ b/cnn_v1/src/cnn_v1_effect.cc @@ -0,0 +1,129 @@ +// CNN post-processing effect implementation +// Neural network-based stylization with modular WGSL + +#include "cnn_v1_effect.h" +#include "gpu/bind_group_builder.h" +#include "gpu/effect.h" +#include "gpu/pipeline_builder.h" +#include "gpu/post_process_helper.h" +#include "gpu/sampler_cache.h" +#include "gpu/shader_composer.h" +#include "gpu/shaders.h" + +// Create custom pipeline with 5 bindings (includes original texture) +static WGPURenderPipeline create_cnn_pipeline(WGPUDevice device, + WGPUTextureFormat format, + const char* shader_code) { + WGPUBindGroupLayout bgl = + BindGroupLayoutBuilder() + .sampler(0, WGPUShaderStage_Fragment) + .texture(1, WGPUShaderStage_Fragment) + .uniform(2, WGPUShaderStage_Vertex | WGPUShaderStage_Fragment) + .uniform(3, WGPUShaderStage_Fragment) + .texture(4, WGPUShaderStage_Fragment) + .build(device); + + WGPURenderPipeline pipeline = RenderPipelineBuilder(device) + .shader(shader_code) + .bind_group_layout(bgl) + .format(format) + .build(); + + wgpuBindGroupLayoutRelease(bgl); + return pipeline; +} + +CNNv1Effect::CNNv1Effect(const GpuContext& ctx) + : PostProcessEffect(ctx), layer_index_(0), total_layers_(1), + blend_amount_(1.0f), input_view_(nullptr), original_view_(nullptr), + bind_group_(nullptr) { + pipeline_ = + create_cnn_pipeline(ctx_.device, ctx_.format, cnn_layer_shader_wgsl); +} + +CNNv1Effect::CNNv1Effect(const GpuContext& ctx, const CNNv1EffectParams& params) + : PostProcessEffect(ctx), layer_index_(params.layer_index), + total_layers_(params.total_layers), blend_amount_(params.blend_amount), + input_view_(nullptr), original_view_(nullptr), bind_group_(nullptr) { + pipeline_ = + create_cnn_pipeline(ctx_.device, ctx_.format, cnn_layer_shader_wgsl); +} + +void CNNv1Effect::init(MainSequence* demo) { + PostProcessEffect::init(demo); + demo_ = demo; + params_buffer_.init(ctx_.device); + + // Register auxiliary texture for layer 0 (width_/height_ set by resize()) + if (layer_index_ == 0) { + demo_->register_auxiliary_texture("captured_frame", width_, height_); + } + + // Initialize uniforms BEFORE any bind group creation + uniforms_.update(ctx_.queue, get_common_uniforms()); + + CNNv1LayerParams params = {layer_index_, blend_amount_, {0.0f, 0.0f}}; + params_buffer_.update(ctx_.queue, params); +} + +void CNNv1Effect::resize(int width, int height) { + if (width == width_ && height == height_) + return; + + PostProcessEffect::resize(width, height); + + // Only layer 0 owns the captured_frame texture + if (layer_index_ == 0 && demo_) { + demo_->resize_auxiliary_texture("captured_frame", width, height); + } +} + +void CNNv1Effect::render(WGPURenderPassEncoder pass, + const CommonPostProcessUniforms& uniforms) { + if (!bind_group_) { + fprintf(stderr, "CNN render: no bind_group\n"); + return; + } + + float effective_blend = blend_amount_; + if (beat_modulated_) { + effective_blend = blend_amount_ * uniforms.beat_phase * beat_scale_; + } + + CNNv1LayerParams params = {layer_index_, effective_blend, {0.0f, 0.0f}}; + params_buffer_.update(ctx_.queue, params); + + wgpuRenderPassEncoderSetPipeline(pass, pipeline_); + wgpuRenderPassEncoderSetBindGroup(pass, 0, bind_group_, 0, nullptr); + wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0); +} + +void CNNv1Effect::update_bind_group(WGPUTextureView input_view) { + input_view_ = input_view; + + // Update common uniforms (CRITICAL for UV calculation!) + uniforms_.update(ctx_.queue, get_common_uniforms()); + + // All layers: get captured frame (original input from layer 0) + if (demo_) { + original_view_ = demo_->get_auxiliary_view("captured_frame"); + } + + // Create bind group with original texture + if (bind_group_) + wgpuBindGroupRelease(bind_group_); + + WGPUBindGroupLayout bgl = wgpuRenderPipelineGetBindGroupLayout(pipeline_, 0); + // Use clamp (not repeat) to match PyTorch Conv2d zero-padding behavior + WGPUSampler sampler = + SamplerCache::Get().get_or_create(ctx_.device, SamplerCache::clamp()); + + bind_group_ = + BindGroupBuilder() + .sampler(0, sampler) + .texture(1, input_view_) + .buffer(2, uniforms_.get().buffer, uniforms_.get().size) + .buffer(3, params_buffer_.get().buffer, params_buffer_.get().size) + .texture(4, original_view_ ? original_view_ : input_view_) + .build(ctx_.device, bgl); +} diff --git a/cnn_v1/src/cnn_v1_effect.h b/cnn_v1/src/cnn_v1_effect.h new file mode 100644 index 0000000..e820275 --- /dev/null +++ b/cnn_v1/src/cnn_v1_effect.h @@ -0,0 +1,53 @@ +// CNN post-processing effect header +// Multi-layer neural network stylization + +#pragma once +#include "gpu/effect.h" +#include "gpu/uniform_helper.h" + +struct CNNv1LayerParams { + int layer_index; + float blend_amount; // Blend: mix(input, output, blend_amount) + float _pad[2]; +}; +static_assert(sizeof(CNNv1LayerParams) == 16); + +struct CNNv1EffectParams { + int layer_index = 0; // Which layer to render (0-based) + int total_layers = 1; // Total number of layers in the CNN + float blend_amount = 1.0f; // Final blend with original input +}; + +class CNNv1Effect : public PostProcessEffect { + public: + explicit CNNv1Effect(const GpuContext& ctx); + explicit CNNv1Effect(const GpuContext& ctx, const CNNv1EffectParams& params); + + void init(MainSequence* demo) override; + void resize(int width, int height) override; + void render(WGPURenderPassEncoder pass, + const CommonPostProcessUniforms& uniforms) override; + void update_bind_group(WGPUTextureView input_view) override; + + // Layer 0 needs framebuffer capture for original input + bool needs_framebuffer_capture() const override { + return layer_index_ == 0; + } + + void set_beat_modulation(bool enabled, float scale = 1.0f) { + beat_modulated_ = enabled; + beat_scale_ = scale; + } + + private: + int layer_index_; + int total_layers_; + float blend_amount_; + bool beat_modulated_ = false; + float beat_scale_ = 1.0f; + WGPUTextureView input_view_; + WGPUTextureView original_view_; + UniformBuffer params_buffer_; + WGPUBindGroup bind_group_; + MainSequence* demo_ = nullptr; +}; -- cgit v1.2.3