From 61104d5b9e1774c11f0dba3b6d6018dabc2bce8f Mon Sep 17 00:00:00 2001 From: skal Date: Tue, 10 Feb 2026 16:44:39 +0100 Subject: feat: CNN RGBD→grayscale with 7-channel augmented input MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upgrade CNN architecture to process RGBD input, output grayscale, with 7-channel layer inputs (RGBD + UV coords + grayscale). Architecture changes: - Inner layers: Conv2d(7→4) output RGBD - Final layer: Conv2d(7→1) output grayscale - All inputs normalized to [-1,1] for tanh activation - Removed CoordConv2d in favor of unified 7-channel input Training (train_cnn.py): - SimpleCNN: 7→4 (inner), 7→1 (final) architecture - Forward: Normalize RGBD/coords/gray to [-1,1] - Weight export: array<array<f32, 8>, 36> (inner), array<array<f32, 8>, 9> (final) - Dataset: Load RGBA (RGBD) input Shaders (cnn_conv3x3.wgsl): - Added cnn_conv3x3_7to4: 7-channel input → RGBD output - Added cnn_conv3x3_7to1: 7-channel input → grayscale output - Both normalize inputs and use flattened weight arrays Documentation: - CNN_EFFECT.md: Updated architecture, training, weight format - CNN_RGBD_GRAYSCALE_SUMMARY.md: Implementation summary - HOWTO.md: Added training command example Next: Train with RGBD input data Co-Authored-By: Claude Sonnet 4.5 --- doc/CNN_EFFECT.md | 75 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 28 deletions(-) (limited to 'doc/CNN_EFFECT.md') diff --git a/doc/CNN_EFFECT.md b/doc/CNN_EFFECT.md index ae0f38a..b7d157f 100644 --- a/doc/CNN_EFFECT.md +++ b/doc/CNN_EFFECT.md @@ -21,27 +21,44 @@ Trainable convolutional neural network layers for artistic stylization (painterl ## Architecture -### Coordinate-Aware Layer 0 +### RGBD → Grayscale Pipeline -Layer 0 accepts normalized (x,y) patch center coordinates alongside RGBA samples: +**Input:** RGBD (RGB + inverse depth D=1/z) +**Output:** Grayscale (1 channel) +**Layer Input:** 7 channels = [RGBD, UV coords, grayscale] all normalized to [-1,1] +
+**Architecture:** +- **Inner layers (0..N-2):** Conv2d(7→4) - output RGBD +- **Final layer (N-1):** Conv2d(7→1) - output grayscale ```wgsl -fn cnn_conv3x3_with_coord( +// Inner layers: 7→4 (RGBD output) +fn cnn_conv3x3_7to4( tex: texture_2d<f32>, samp: sampler, - uv: vec2<f32>, # Center position [0,1] + uv: vec2<f32>, resolution: vec2<f32>, - rgba_weights: array<mat4x4<f32>, 9>, # 9 samples × 4×4 matrix - coord_weights: mat2x4<f32>, # 2 coords → 4 outputs - bias: vec4<f32> + original: vec4<f32>, # Original RGBD [0,1] + weights: array<array<f32, 8>, 36> # 9 pos × 4 out × (7 weights + bias) ) -> vec4<f32> -``` -**Input structure:** 9 RGBA samples (36 values) + 1 xy coordinate (2 values) = 38 inputs → 4 outputs +// Final layer: 7→1 (grayscale output) +fn cnn_conv3x3_7to1( + tex: texture_2d<f32>, + samp: sampler, + uv: vec2<f32>, + resolution: vec2<f32>, + original: vec4<f32>, + weights: array<array<f32, 8>, 9> # 9 pos × (7 weights + bias) +) -> f32 +``` -**Size impact:** +32B coord weights, kernel-agnostic +**Input normalization (all to [-1,1]):** +- RGBD: `(rgbd - 0.5) * 2` +- UV coords: `(uv - 0.5) * 2` +- Grayscale: `(0.2126*R + 0.7152*G + 0.0722*B - 0.5) * 2` -**Use cases:** Position-dependent stylization (vignettes, corner darkening, radial gradients) +**Activation:** tanh for inner layers, none for final layer ### Multi-Layer Architecture @@ -80,18 +97,15 @@ workspaces/main/shaders/cnn/ ### 1. Prepare Training Data Collect input/target image pairs: -- **Input:** Raw 3D render -- **Target:** Artistic style (hand-painted, filtered, stylized) +- **Input:** RGBA (RGB + depth as alpha channel, D=1/z) +- **Target:** Grayscale stylized output ```bash -training/input/img_000.png # Raw render -training/output/img_000.png # Stylized target +training/input/img_000.png # RGBA render (RGB + depth) +training/output/img_000.png # Grayscale target ``` -Use `image_style_processor.py` to generate targets: -```bash -python3 training/image_style_processor.py input/ output/ pencil_sketch -``` +**Note:** Input images must be RGBA where alpha = inverse depth (1/z) ### 2.
Train Network @@ -245,20 +259,25 @@ Expands to: **Weight Storage:** -**Layer 0 (coordinate-aware):** +**Inner layers (7→4 RGBD output):** ```wgsl -const rgba_weights_layer0: array<mat4x4<f32>, 9> = array(...); -const coord_weights_layer0 = mat2x4( - 0.1, -0.2, 0.0, 0.0, # x-coord weights - -0.1, 0.0, 0.2, 0.0 # y-coord weights +// Structure: array<array<f32, 8>, 36> +// 9 positions × 4 output channels, each with 7 weights + bias +const weights_layer0: array<array<f32, 8>, 36> = array( + array(w0_r, w0_g, w0_b, w0_d, w0_u, w0_v, w0_gray, bias0), // pos0_ch0 + array(w1_r, w1_g, w1_b, w1_d, w1_u, w1_v, w1_gray, bias1), // pos0_ch1 + // ... 34 more entries ); -const bias_layer0 = vec4(0.0, 0.0, 0.0, 0.0); ``` -**Layers 1+ (standard):** +**Final layer (7→1 grayscale output):** ```wgsl -const weights_layer1: array<mat4x4<f32>, 9> = array(...); -const bias_layer1 = vec4(0.0, 0.0, 0.0, 0.0); +// Structure: array<array<f32, 8>, 9> +// 9 positions, each with 7 weights + bias +const weights_layerN: array<array<f32, 8>, 9> = array( + array(w0_r, w0_g, w0_b, w0_d, w0_u, w0_v, w0_gray, bias0), // pos0 + // ... 8 more entries +); ``` --- -- cgit v1.2.3