-rw-r--r--   doc/CNN_EFFECT.md                               75
-rw-r--r--   doc/CNN_RGBD_GRAYSCALE_SUMMARY.md              134
-rw-r--r--   doc/HOWTO.md                                     8
-rwxr-xr-x   training/train_cnn.py                          210
-rw-r--r--   workspaces/main/shaders/cnn/cnn_conv3x3.wgsl   100
5 files changed, 373 insertions, 154 deletions
diff --git a/doc/CNN_EFFECT.md b/doc/CNN_EFFECT.md
index ae0f38a..b7d157f 100644
--- a/doc/CNN_EFFECT.md
+++ b/doc/CNN_EFFECT.md
@@ -21,27 +21,44 @@ Trainable convolutional neural network layers for artistic stylization (painterl
 ## Architecture
 
-### Coordinate-Aware Layer 0
+### RGBD → Grayscale Pipeline
 
-Layer 0 accepts normalized (x,y) patch center coordinates alongside RGBA samples:
+**Input:** RGBD (RGB + inverse depth D=1/z)
+**Output:** Grayscale (1 channel)
+**Layer Input:** 7 channels = [RGBD, UV coords, grayscale] all normalized to [-1,1]
+
+**Architecture:**
+- **Inner layers (0..N-2):** Conv2d(7→4) - output RGBD
+- **Final layer (N-1):** Conv2d(7→1) - output grayscale
 
 ```wgsl
-fn cnn_conv3x3_with_coord(
+// Inner layers: 7→4 (RGBD output)
+fn cnn_conv3x3_7to4(
     tex: texture_2d<f32>,
     samp: sampler,
-    uv: vec2<f32>,           # Center position [0,1]
+    uv: vec2<f32>,
     resolution: vec2<f32>,
-    rgba_weights: array<mat4x4<f32>, 9>,  # 9 samples × 4×4 matrix
-    coord_weights: mat2x4<f32>,           # 2 coords → 4 outputs
-    bias: vec4<f32>
+    original: vec4<f32>,               // Original RGBD [0,1]
+    weights: array<array<f32, 8>, 36>  // 9 pos × 4 out × (7 weights + bias)
 ) -> vec4<f32>
-```
-
-**Input structure:** 9 RGBA samples (36 values) + 1 xy coordinate (2 values) = 38 inputs → 4 outputs
+
+// Final layer: 7→1 (grayscale output)
+fn cnn_conv3x3_7to1(
+    tex: texture_2d<f32>,
+    samp: sampler,
+    uv: vec2<f32>,
+    resolution: vec2<f32>,
+    original: vec4<f32>,
+    weights: array<array<f32, 8>, 9>   // 9 pos × (7 weights + bias)
+) -> f32
+```
 
-**Size impact:** +32B coord weights, kernel-agnostic
+**Input normalization (all to [-1,1]):**
+- RGBD: `(rgbd - 0.5) * 2`
+- UV coords: `(uv - 0.5) * 2`
+- Grayscale: `(0.2126*R + 0.7152*G + 0.0722*B - 0.5) * 2`
 
-**Use cases:** Position-dependent stylization (vignettes, corner darkening, radial gradients)
+**Activation:** tanh for inner layers, none for final layer
 
 ### Multi-Layer Architecture
 
@@ -80,18 +97,15 @@ workspaces/main/shaders/cnn/
 ### 1. Prepare Training Data
 
 Collect input/target image pairs:
-- **Input:** Raw 3D render
-- **Target:** Artistic style (hand-painted, filtered, stylized)
+- **Input:** RGBA (RGB + depth as alpha channel, D=1/z)
+- **Target:** Grayscale stylized output
 
 ```bash
-training/input/img_000.png   # Raw render
-training/output/img_000.png  # Stylized target
+training/input/img_000.png   # RGBA render (RGB + depth)
+training/output/img_000.png  # Grayscale target
 ```
 
-Use `image_style_processor.py` to generate targets:
-```bash
-python3 training/image_style_processor.py input/ output/ pencil_sketch
-```
+**Note:** Input images must be RGBA where alpha = inverse depth (1/z)
 
 ### 2. Train Network
 
@@ -245,20 +259,25 @@ Expands to:
 
 **Weight Storage:**
 
-**Layer 0 (coordinate-aware):**
+**Inner layers (7→4 RGBD output):**
 ```wgsl
-const rgba_weights_layer0: array<mat4x4<f32>, 9> = array(...);
-const coord_weights_layer0 = mat2x4<f32>(
-    0.1, -0.2, 0.0, 0.0,   # x-coord weights
-    -0.1, 0.0, 0.2, 0.0    # y-coord weights
+// Structure: array<array<f32, 8>, 36>
+// 9 positions × 4 output channels, each with 7 weights + bias
+const weights_layer0: array<array<f32, 8>, 36> = array(
+    array<f32, 8>(w0_r, w0_g, w0_b, w0_d, w0_u, w0_v, w0_gray, bias0),  // pos0_ch0
+    array<f32, 8>(w1_r, w1_g, w1_b, w1_d, w1_u, w1_v, w1_gray, bias1),  // pos0_ch1
+    // ... 34 more entries
 );
-const bias_layer0 = vec4<f32>(0.0, 0.0, 0.0, 0.0);
 ```
 
-**Layers 1+ (standard):**
+**Final layer (7→1 grayscale output):**
 ```wgsl
-const weights_layer1: array<mat4x4<f32>, 9> = array(...);
-const bias_layer1 = vec4<f32>(0.0, 0.0, 0.0, 0.0);
+// Structure: array<array<f32, 8>, 9>
+// 9 positions, each with 7 weights + bias
+const weights_layerN: array<array<f32, 8>, 9> = array(
+    array<f32, 8>(w0_r, w0_g, w0_b, w0_d, w0_u, w0_v, w0_gray, bias0),  // pos0
+    // ... 8 more entries
+);
 ```
 
 ---
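A quick size check of the flattened weight layout above, as a standalone Python sketch (the constant names and `flat_index` are illustrative, not identifiers from the repo):

```python
# Per-entry layout: 7 input weights + 1 bias, stored as array<f32, 8>.
KERNEL_POSITIONS = 9          # 3x3 neighborhood
PER_ENTRY = 7 + 1             # RGBD + uv.x + uv.y + gray, plus bias

inner_entries = KERNEL_POSITIONS * 4      # 36 rows for a 7→4 layer
final_entries = KERNEL_POSITIONS * 1      # 9 rows for the 7→1 layer
print(inner_entries * PER_ENTRY * 4)      # 1152 bytes of f32 per inner layer
print(final_entries * PER_ENTRY * 4)      # 288 bytes for the final layer

def flat_index(pos: int, out_c: int) -> int:
    """Row index shared by the exporter and cnn_conv3x3_7to4."""
    return pos * 4 + out_c
```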
diff --git a/doc/CNN_RGBD_GRAYSCALE_SUMMARY.md b/doc/CNN_RGBD_GRAYSCALE_SUMMARY.md
new file mode 100644
index 0000000..4c13693
--- /dev/null
+++ b/doc/CNN_RGBD_GRAYSCALE_SUMMARY.md
@@ -0,0 +1,134 @@
+# CNN RGBD→Grayscale Architecture Implementation
+
+## Summary
+
+Implemented CNN architecture upgrade: RGBD input → grayscale output with 7-channel augmented input.
+
+## Changes Made
+
+### Architecture
+
+**Input:** RGBD (4 channels: RGB + inverse depth D=1/z)
+**Output:** Grayscale (1 channel)
+**Layer Input:** 7 channels = [RGBD, UV coords, grayscale] all normalized to [-1,1]
+
+**Layer Configuration:**
+- Inner layers (0..N-2): Conv2d(7→4) - output RGBD with tanh activation
+- Final layer (N-1): Conv2d(7→1) - output grayscale, no activation
+
+### Input Normalization (all to [-1,1])
+
+- **RGBD:** `(rgbd - 0.5) * 2`
+- **UV coords:** `(uv - 0.5) * 2`
+- **Grayscale:** `(0.2126*R + 0.7152*G + 0.0722*B - 0.5) * 2`
+
+**Rationale:** Zero-centered inputs for tanh activation, better gradient flow.
+
+### Modified Files
+
+**Training (`/Users/skal/demo/training/train_cnn.py`):**
+1. Removed `CoordConv2d` class
+2. Updated `SimpleCNN`:
+   - Inner layers: `Conv2d(7, 4)` - RGBD output
+   - Final layer: `Conv2d(7, 1)` - grayscale output
+3. Updated `forward()`:
+   - Normalize RGBD/coords/gray to [-1,1]
+   - Concatenate 7-channel input for each layer
+   - Apply tanh (inner) or none (final)
+   - Denormalize final output
+4. Updated `export_weights_to_wgsl()`:
+   - Inner: `array<array<f32, 8>, 36>` (9 pos × 4 ch × 8 values)
+   - Final: `array<array<f32, 8>, 9>` (9 pos × 8 values)
+5. Updated `generate_layer_shader()`:
+   - Use `cnn_conv3x3_7to4` for inner layers
+   - Use `cnn_conv3x3_7to1` for final layer
+   - Denormalize outputs from [-1,1] to [0,1]
+6. Updated `ImagePairDataset`:
+   - Load RGBA input (was RGB)
+
+**Shaders (`/Users/skal/demo/workspaces/main/shaders/cnn/cnn_conv3x3.wgsl`):**
+1. Added `cnn_conv3x3_7to4()`:
+   - 7-channel input: [RGBD, uv_x, uv_y, gray]
+   - 4-channel output: RGBD
+   - Weights: `array<array<f32, 8>, 36>`
+2. Added `cnn_conv3x3_7to1()`:
+   - 7-channel input: [RGBD, uv_x, uv_y, gray]
+   - 1-channel output: grayscale
+   - Weights: `array<array<f32, 8>, 9>`
+
+**Documentation (`/Users/skal/demo/doc/CNN_EFFECT.md`):**
+1. Updated architecture section with RGBD→grayscale pipeline
+2. Updated training data requirements (RGBA input)
+3. Updated weight storage format
+
+### No C++ Changes
+
+CNNLayerParams and bind groups remain unchanged.
+
+## Data Flow
+
+1. Layer 0 captures original RGBD to `captured_frame`
+2. Each layer:
+   - Samples previous layer output (RGBD in [0,1])
+   - Normalizes RGBD to [-1,1]
+   - Computes UV coords and grayscale, normalizes to [-1,1]
+   - Concatenates 7-channel input
+   - Applies convolution with layer-specific weights
+   - Outputs RGBD (inner) or grayscale (final) in [-1,1]
+   - Applies tanh (inner only)
+   - Denormalizes to [0,1] for texture storage
+   - Blends with original
+
+## Next Steps
+
+1. **Prepare RGBD training data:**
+   - Input: RGBA images (RGB + depth in alpha)
+   - Target: Grayscale stylized output
+
+2. **Train network:**
+   ```bash
+   python3 training/train_cnn.py \
+     --input training/input \
+     --target training/output \
+     --layers 3 \
+     --epochs 1000
+   ```
+
+3. **Verify generated shaders:**
+   - Check `cnn_weights_generated.wgsl` structure
+   - Check `cnn_layer.wgsl` uses new conv functions
+
+4. **Test in demo:**
+   ```bash
+   cmake --build build -j4
+   ./build/demo64k
+   ```
+
+## Design Rationale
+
+**Why [-1,1] normalization?**
+- Centered inputs for tanh (operates best around 0)
+- Better gradient flow
+- Standard ML practice for normalized data
+
+**Why RGBD throughout vs RGB?**
+- Depth information propagates through network
+- Enables depth-aware stylization
+- Consistent 4-channel processing
+
+**Why 7-channel input?**
+- Coordinates: position-dependent effects (vignettes)
+- Grayscale: luminance-aware processing
+- RGBD: full color+depth information
+- Enables richer feature learning
+
+## Testing Checklist
+
+- [ ] Train network with RGBD input data
+- [ ] Verify `cnn_weights_generated.wgsl` structure
+- [ ] Verify `cnn_layer.wgsl` uses `7to4`/`7to1` functions
+- [ ] Build demo without errors
+- [ ] Visual test: inner layers show RGBD evolution
+- [ ] Visual test: final layer produces grayscale
+- [ ] Visual test: blending works correctly
+- [ ] Compare quality with previous RGB→RGB architecture
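Condensing the Data Flow steps above into a standalone PyTorch sketch of the 7-channel input assembly; `make_7ch_input` is a hypothetical helper, not a function in `train_cnn.py`:

```python
import torch

def make_7ch_input(rgbd_01: torch.Tensor, features: torch.Tensor) -> torch.Tensor:
    """Builds the per-layer 7-channel input described above.

    rgbd_01:  [B,4,H,W] original RGBD in [0,1] (D = 1/z)
    features: [B,4,H,W] current features, already in [-1,1]
    """
    B, _, H, W = rgbd_01.shape
    to_pm1 = lambda t: (t - 0.5) * 2.0
    y = torch.linspace(0, 1, H, device=rgbd_01.device).view(1, 1, H, 1).expand(B, 1, H, W)
    x = torch.linspace(0, 1, W, device=rgbd_01.device).view(1, 1, 1, W).expand(B, 1, H, W)
    # Rec.709 luma from the original RGB, matching the shader's gray channel
    gray = 0.2126 * rgbd_01[:, 0:1] + 0.7152 * rgbd_01[:, 1:2] + 0.0722 * rgbd_01[:, 2:3]
    return torch.cat([features, to_pm1(x), to_pm1(y), to_pm1(gray)], dim=1)  # [B,7,H,W]
```

At layer 0, `features` would be `to_pm1(rgbd_01)` itself; deeper layers pass the previous layer's tanh output.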
diff --git a/doc/HOWTO.md b/doc/HOWTO.md
index bdc0214..2c813f7 100644
--- a/doc/HOWTO.md
+++ b/doc/HOWTO.md
@@ -86,6 +86,14 @@ make run_util_tests   # Utility tests
 
 ---
 
+## Training
+
+```bash
+./training/train_cnn.py --layers 3 --kernel_sizes 3,5,3 --epochs 10000 --batch_size 8 --input training/input/ --target training/output/ --checkpoint-every 1000
+```
+
+---
+
 ## Timeline
 
 Edit `workspaces/main/timeline.seq`:
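A minimal smoke test of the model this command trains, assuming `training/` is on `PYTHONPATH`. Note that `generate_layer_shader()` below emits only `cnn_conv3x3_*` calls, so an all-3×3 configuration is the safe choice despite the `3,5,3` example above:

```python
import torch
from train_cnn import SimpleCNN  # assumes running from the training/ directory

model = SimpleCNN(num_layers=3, kernel_sizes=[3, 3, 3])
rgbd = torch.rand(1, 4, 64, 64)   # RGBD in [0,1], D = 1/z
out = model(rgbd)                 # [1,3,64,64]: grayscale expanded to RGB
print(out.shape, float(out.min()), float(out.max()))
```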
diff --git a/training/train_cnn.py b/training/train_cnn.py
index 1cd6579..0495c65 100755
--- a/training/train_cnn.py
+++ b/training/train_cnn.py
@@ -62,7 +62,8 @@ class ImagePairDataset(Dataset):
     def __getitem__(self, idx):
         input_path, target_path = self.image_pairs[idx]
 
-        input_img = Image.open(input_path).convert('RGB')
+        # Load RGBD input (4 channels: RGB + Depth)
+        input_img = Image.open(input_path).convert('RGBA')
         target_img = Image.open(target_path).convert('RGB')
 
         if self.transform:
@@ -72,27 +73,8 @@
         return input_img, target_img
 
 
-class CoordConv2d(nn.Module):
-    """Conv2d that accepts coordinate input separate from spatial patches"""
-
-    def __init__(self, in_channels, out_channels, kernel_size, padding=0):
-        super().__init__()
-        self.conv_rgba = nn.Conv2d(in_channels, out_channels, kernel_size, padding=padding, bias=False)
-        self.coord_weights = nn.Parameter(torch.randn(out_channels, 2) * 0.01)
-        self.bias = nn.Parameter(torch.zeros(out_channels))
-
-    def forward(self, x, coords):
-        # x: [B, C, H, W] image
-        # coords: [B, 2, H, W] coordinate grid
-        out = self.conv_rgba(x)
-        B, C, H, W = out.shape
-        coord_contrib = torch.einsum('bchw,oc->bohw', coords, self.coord_weights)
-        out = out + coord_contrib + self.bias.view(1, -1, 1, 1)
-        return out
-
-
 class SimpleCNN(nn.Module):
-    """Simple CNN for image-to-image transformation"""
+    """CNN for RGBD→grayscale with 7-channel input (RGBD + UV + gray)"""
 
     def __init__(self, num_layers=1, kernel_sizes=None):
         super(SimpleCNN, self).__init__()
@@ -107,26 +89,48 @@ class SimpleCNN(nn.Module):
 
         for i, kernel_size in enumerate(kernel_sizes):
             padding = kernel_size // 2
-            if i == 0:
-                self.layers.append(CoordConv2d(3, 3, kernel_size, padding=padding))
+            if i < num_layers - 1:
+                # Inner layers: 7→4 (RGBD output)
+                self.layers.append(nn.Conv2d(7, 4, kernel_size=kernel_size, padding=padding, bias=True))
             else:
-                self.layers.append(nn.Conv2d(3, 3, kernel_size=kernel_size, padding=padding, bias=True))
+                # Final layer: 7→1 (grayscale output)
+                self.layers.append(nn.Conv2d(7, 1, kernel_size=kernel_size, padding=padding, bias=True))
 
     def forward(self, x):
+        # x: [B,4,H,W] - RGBD input (D = 1/z)
         B, C, H, W = x.shape
+
+        # Normalize RGBD to [-1,1]
+        x_norm = (x - 0.5) * 2.0
+
+        # Compute coordinates [0,1] then normalize to [-1,1]
         y_coords = torch.linspace(0, 1, H, device=x.device).view(1,1,H,1).expand(B,1,H,W)
         x_coords = torch.linspace(0, 1, W, device=x.device).view(1,1,1,W).expand(B,1,H,W)
-        coords = torch.cat([x_coords, y_coords], dim=1)
+        y_coords = (y_coords - 0.5) * 2.0  # [-1,1]
+        x_coords = (x_coords - 0.5) * 2.0  # [-1,1]
 
-        out = self.layers[0](x, coords)
-        out = torch.tanh(out)
+        # Compute grayscale from original RGB (Rec.709) and normalize to [-1,1]
+        gray = 0.2126*x[:,0:1] + 0.7152*x[:,1:2] + 0.0722*x[:,2:3]  # [B,1,H,W] in [0,1]
+        gray = (gray - 0.5) * 2.0  # [-1,1]
 
-        for i in range(1, len(self.layers)):
-            out = self.layers[i](out)
-            if i < len(self.layers) - 1:
-                out = torch.tanh(out)
+        # Layer 0
+        layer0_input = torch.cat([x_norm, x_coords, y_coords, gray], dim=1)  # [B,7,H,W]
+        out = self.layers[0](layer0_input)  # [B,4,H,W]
+        out = torch.tanh(out)  # [-1,1]
 
-        return out
+        # Inner layers
+        for i in range(1, len(self.layers)-1):
+            layer_input = torch.cat([out, x_coords, y_coords, gray], dim=1)
+            out = self.layers[i](layer_input)
+            out = torch.tanh(out)
+
+        # Final layer (grayscale output)
+        final_input = torch.cat([out, x_coords, y_coords, gray], dim=1)
+        out = self.layers[-1](final_input)  # [B,1,H,W] in [-1,1]
+
+        # Denormalize to [0,1] and expand to RGB for visualization
+        out = (out + 1.0) * 0.5
+        return out.expand(-1, 3, -1, -1)
 
 
 def generate_layer_shader(output_path, num_layers, kernel_sizes):
@@ -169,25 +173,35 @@ def generate_layer_shader(output_path, num_layers, kernel_sizes):
 
     # Generate layer switches
     for layer_idx in range(num_layers):
-        ks = kernel_sizes[layer_idx]
+        is_final = layer_idx == num_layers - 1
 
         if layer_idx == 0:
-            f.write(f"    // Layer 0 uses coordinate-aware convolution\n")
+            f.write(f"    // Layer 0: 7→4 (RGBD output)\n")
             f.write(f"    if (params.layer_index == {layer_idx}) {{\n")
-            f.write(f"        result = cnn_conv{ks}x{ks}_with_coord(txt, smplr, uv, uniforms.resolution,\n")
-            f.write(f"            rgba_weights_layer{layer_idx}, coord_weights_layer{layer_idx}, bias_layer{layer_idx});\n")
-            f.write(f"        result = cnn_tanh(result);\n")
+            f.write(f"        result = cnn_conv3x3_7to4(txt, smplr, uv, uniforms.resolution,\n")
+            f.write(f"            original, weights_layer{layer_idx});\n")
+            f.write(f"        result = cnn_tanh(result);  // Output in [-1,1]\n")
+            f.write(f"        // Denormalize to [0,1] for texture storage\n")
+            f.write(f"        result = (result + 1.0) * 0.5;\n")
+            f.write(f"    }}\n")
+        elif not is_final:
+            f.write(f"    else if (params.layer_index == {layer_idx}) {{\n")
+            f.write(f"        result = cnn_conv3x3_7to4(txt, smplr, uv, uniforms.resolution,\n")
+            f.write(f"            original, weights_layer{layer_idx});\n")
+            f.write(f"        result = cnn_tanh(result);  // Output in [-1,1]\n")
+            f.write(f"        // Denormalize to [0,1] for texture storage\n")
+            f.write(f"        result = (result + 1.0) * 0.5;\n")
             f.write(f"    }}\n")
         else:
-            is_last = layer_idx == num_layers - 1
-            f.write(f"    {'else ' if layer_idx > 0 else ''}if (params.layer_index == {layer_idx}) {{\n")
-            f.write(f"        result = cnn_conv{ks}x{ks}(txt, smplr, uv, uniforms.resolution,\n")
-            f.write(f"            weights_layer{layer_idx}, bias_layer{layer_idx});\n")
-            if not is_last:
-                f.write(f"        result = cnn_tanh(result);\n")
+            f.write(f"    else if (params.layer_index == {layer_idx}) {{\n")
+            f.write(f"        let gray_out = cnn_conv3x3_7to1(txt, smplr, uv, uniforms.resolution,\n")
+            f.write(f"            original, weights_layer{layer_idx});\n")
+            f.write(f"        // Denormalize from [-1,1] to [0,1]\n")
+            f.write(f"        let gray_01 = (gray_out + 1.0) * 0.5;\n")
+            f.write(f"        result = vec4<f32>(gray_01, gray_01, gray_01, 1.0);  // Expand to RGB\n")
             f.write(f"    }}\n")
 
     # Add else clause for invalid layer index
-    if num_layers > 1:
+    if num_layers > 0:
         f.write(f"    else {{\n")
         f.write(f"        result = input;\n")
         f.write(f"    }}\n")
@@ -204,96 +218,40 @@ def export_weights_to_wgsl(model, output_path, kernel_sizes):
         f.write("// Auto-generated CNN weights\n")
         f.write("// DO NOT EDIT - Generated by train_cnn.py\n\n")
 
-        layer_idx = 0
         for i, layer in enumerate(model.layers):
-            if isinstance(layer, CoordConv2d):
-                # Export RGBA weights
-                weights = layer.conv_rgba.weight.data.cpu().numpy()
-                kernel_size = kernel_sizes[layer_idx]
-                out_ch, in_ch, kh, kw = weights.shape
-                num_positions = kh * kw
-
-                f.write(f"const rgba_weights_layer{layer_idx}: array<mat4x4<f32>, {num_positions}> = array(\n")
-                for pos in range(num_positions):
-                    row = pos // kw
-                    col = pos % kw
-                    f.write("    mat4x4<f32>(\n")
-                    for out_c in range(4):
-                        vals = []
-                        for in_c in range(4):
-                            if out_c < out_ch and in_c < in_ch:
-                                vals.append(f"{weights[out_c, in_c, row, col]:.6f}")
-                            else:
-                                vals.append("0.0")
-                        f.write(f"        {', '.join(vals)},\n")
-                    f.write("    )")
-                    if pos < num_positions - 1:
-                        f.write(",\n")
-                    else:
-                        f.write("\n")
-                f.write(");\n\n")
+            weights = layer.weight.data.cpu().numpy()
+            bias = layer.bias.data.cpu().numpy()
+            out_ch, in_ch, kh, kw = weights.shape
+            num_positions = kh * kw
 
-                # Export coordinate weights
-                coord_w = layer.coord_weights.data.cpu().numpy()
-                f.write(f"const coord_weights_layer{layer_idx} = mat2x4<f32>(\n")
-                for c in range(2):
-                    vals = []
-                    for out_c in range(4):
-                        if out_c < coord_w.shape[0]:
-                            vals.append(f"{coord_w[out_c, c]:.6f}")
-                        else:
-                            vals.append("0.0")
-                    f.write(f"    {', '.join(vals)}")
-                    if c < 1:
-                        f.write(",\n")
-                    else:
-                        f.write("\n")
-                f.write(");\n\n")
+            is_final = (i == len(model.layers) - 1)
 
-                # Export bias
-                bias = layer.bias.data.cpu().numpy()
-                bias_vals = [f"{bias[i]:.6f}" if i < len(bias) else "0.0" for i in range(4)]
-                f.write(f"const bias_layer{layer_idx} = vec4<f32>(")
-                f.write(", ".join(bias_vals))
+            if is_final:
+                # Final layer: 7→1, structure: array<array<f32, 8>, 9>
+                # [w0, w1, w2, w3, w4, w5, w6, bias]
+                f.write(f"const weights_layer{i}: array<array<f32, 8>, {num_positions}> = array(\n")
+                for pos in range(num_positions):
+                    row, col = pos // kw, pos % kw
+                    vals = [f"{weights[0, in_c, row, col]:.6f}" for in_c in range(7)]
+                    vals.append(f"{bias[0]:.6f}")  # Append bias as 8th element
+                    f.write(f"    array<f32, 8>({', '.join(vals)})")
+                    f.write(",\n" if pos < num_positions-1 else "\n")
                 f.write(");\n\n")
-
-                layer_idx += 1
-            elif isinstance(layer, nn.Conv2d):
-                # Standard conv layer
-                weights = layer.weight.data.cpu().numpy()
-                kernel_size = kernel_sizes[layer_idx]
-                out_ch, in_ch, kh, kw = weights.shape
-                num_positions = kh * kw
-
-                f.write(f"const weights_layer{layer_idx}: array<mat4x4<f32>, {num_positions}> = array(\n")
+            else:
+                # Inner layers: 7→4, structure: array<array<f32, 8>, 36>
+                # Flattened: [pos0_ch0[7w+bias], pos0_ch1[7w+bias], ..., pos8_ch3[7w+bias]]
+                num_entries = num_positions * 4
+                f.write(f"const weights_layer{i}: array<array<f32, 8>, {num_entries}> = array(\n")
                 for pos in range(num_positions):
-                    row = pos // kw
-                    col = pos % kw
-                    f.write("    mat4x4<f32>(\n")
+                    row, col = pos // kw, pos % kw
                     for out_c in range(4):
-                        vals = []
-                        for in_c in range(4):
-                            if out_c < out_ch and in_c < in_ch:
-                                vals.append(f"{weights[out_c, in_c, row, col]:.6f}")
-                            else:
-                                vals.append("0.0")
-                        f.write(f"        {', '.join(vals)},\n")
-                    f.write("    )")
-                    if pos < num_positions - 1:
-                        f.write(",\n")
-                    else:
-                        f.write("\n")
+                        vals = [f"{weights[out_c, in_c, row, col]:.6f}" for in_c in range(7)]
+                        vals.append(f"{bias[out_c]:.6f}")  # Append bias
+                        idx = pos * 4 + out_c
+                        f.write(f"    array<f32, 8>({', '.join(vals)})")
+                        f.write(",\n" if idx < num_entries-1 else "\n")
                 f.write(");\n\n")
 
-                # Export bias
-                bias = layer.bias.data.cpu().numpy()
-                bias_vals = [f"{bias[i]:.6f}" if i < len(bias) else "0.0" for i in range(4)]
-                f.write(f"const bias_layer{layer_idx} = vec4<f32>(")
-                f.write(", ".join(bias_vals))
-                f.write(");\n\n")
-
-                layer_idx += 1
-
 
 def train(args):
     """Main training loop"""
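To cross-check the exporter's flattened ordering against the shader's `pos * 4 + out_c` indexing, a self-contained numpy sketch over random data (not part of the repo):

```python
import numpy as np

weights = np.random.randn(4, 7, 3, 3)   # PyTorch Conv2d layout: [out_ch, in_ch, kh, kw]
bias = np.random.randn(4)

rows = []
for pos in range(9):                     # kernel position, row-major over the 3x3 window
    r, c = pos // 3, pos % 3
    for out_c in range(4):               # entry index = pos * 4 + out_c
        rows.append(list(weights[out_c, :, r, c]) + [bias[out_c]])

# Matches the inner-layer WGSL declaration: array<array<f32, 8>, 36>
assert len(rows) == 36 and len(rows[0]) == 8
```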
diff --git a/workspaces/main/shaders/cnn/cnn_conv3x3.wgsl b/workspaces/main/shaders/cnn/cnn_conv3x3.wgsl
index 168c9e2..df58b4d 100644
--- a/workspaces/main/shaders/cnn/cnn_conv3x3.wgsl
+++ b/workspaces/main/shaders/cnn/cnn_conv3x3.wgsl
@@ -51,3 +51,103 @@ fn cnn_conv3x3_with_coord(
 
     return sum;
 }
+
+// Inner layers: 7→4 channels (RGBD output)
+// weights: array<array<f32, 8>, 36> (9 positions × 4 channels, each with 7 weights + bias)
+fn cnn_conv3x3_7to4(
+    tex: texture_2d<f32>,
+    samp: sampler,
+    uv: vec2<f32>,
+    resolution: vec2<f32>,
+    original: vec4<f32>,
+    weights: array<array<f32, 8>, 36>
+) -> vec4<f32> {
+    let step = 1.0 / resolution;
+
+    // Compute grayscale from original and normalize to [-1,1]
+    let gray_01 = 0.2126*original.r + 0.7152*original.g + 0.0722*original.b;
+    let gray = (gray_01 - 0.5) * 2.0;
+
+    // Normalize UV to [-1,1]
+    let uv_norm = (uv - 0.5) * 2.0;
+
+    var sum = vec4<f32>(0.0);
+
+    var pos = 0;
+    for (var dy = -1; dy <= 1; dy++) {
+        for (var dx = -1; dx <= 1; dx++) {
+            let offset = vec2<f32>(f32(dx), f32(dy)) * step;
+            let rgbd_01 = textureSample(tex, samp, uv + offset);
+
+            // Normalize RGBD to [-1,1]
+            let rgbd = (rgbd_01 - 0.5) * 2.0;
+
+            // 7-channel input: [R,G,B,D, uv.x, uv.y, gray] all in [-1,1]
+            let inputs = array<f32, 7>(
+                rgbd.r, rgbd.g, rgbd.b, rgbd.a,
+                uv_norm.x, uv_norm.y, gray
+            );
+
+            // Accumulate for each output channel (RGBD)
+            for (var out_c = 0; out_c < 4; out_c++) {
+                let idx = pos * 4 + out_c;
+                var channel_sum = 0.0;
+                for (var in_c = 0; in_c < 7; in_c++) {
+                    channel_sum += weights[idx][in_c] * inputs[in_c];
+                }
+                sum[out_c] += channel_sum;
+            }
+
+            pos++;
+        }
+    }
+
+    // The exporter duplicates the trained bias into every position's 8th slot;
+    // apply it once (pos 0 entries) to match the PyTorch Conv2d bias.
+    for (var out_c = 0; out_c < 4; out_c++) {
+        sum[out_c] += weights[out_c][7];
+    }
+
+    return sum;  // Output in [-1,1] range
+}
+
+// Final layer: 7→1 channel (scalar output)
+// weights: array<array<f32, 8>, 9> (9 positions, each with 7 weights + bias)
+fn cnn_conv3x3_7to1(
+    tex: texture_2d<f32>,
+    samp: sampler,
+    uv: vec2<f32>,
+    resolution: vec2<f32>,
+    original: vec4<f32>,
+    weights: array<array<f32, 8>, 9>
+) -> f32 {
+    let step = 1.0 / resolution;
+
+    // Compute grayscale from original and normalize to [-1,1]
+    let gray_01 = 0.2126*original.r + 0.7152*original.g + 0.0722*original.b;
+    let gray = (gray_01 - 0.5) * 2.0;
+
+    // Normalize UV to [-1,1]
+    let uv_norm = (uv - 0.5) * 2.0;
+
+    var sum = 0.0;
+
+    var pos = 0;
+    for (var dy = -1; dy <= 1; dy++) {
+        for (var dx = -1; dx <= 1; dx++) {
+            let offset = vec2<f32>(f32(dx), f32(dy)) * step;
+            let rgbd_01 = textureSample(tex, samp, uv + offset);
+
+            // Normalize RGBD to [-1,1]
+            let rgbd = (rgbd_01 - 0.5) * 2.0;
+
+            // 7-channel input all in [-1,1]
+            sum += weights[pos][0] * rgbd.r;
+            sum += weights[pos][1] * rgbd.g;
+            sum += weights[pos][2] * rgbd.b;
+            sum += weights[pos][3] * rgbd.a;
+            sum += weights[pos][4] * uv_norm.x;
+            sum += weights[pos][5] * uv_norm.y;
+            sum += weights[pos][6] * gray;
+
+            pos++;
+        }
+    }
+
+    // Bias is duplicated per position by the exporter; apply it once.
+    sum += weights[0][7];
+
+    return sum;  // Output in [-1,1], needs denormalization
+}
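Finally, a CPU reference for `cnn_conv3x3_7to1`, useful for comparing shader output against the trained model. It is a sketch under simplifying assumptions: nearest-texel taps, `uv` approximated by `linspace` rather than exact pixel centers, and the sampled texture taken to coincide with `original` (true only at layer 0):

```python
import numpy as np

def conv3x3_7to1_ref(rgbd, weights):
    """rgbd: [H,W,4] array in [0,1]; weights: [9][8] rows as exported to WGSL.

    Returns the raw sum before denormalization; edge pixels are skipped.
    """
    H, W, _ = rgbd.shape
    gray = ((rgbd[..., :3] @ np.array([0.2126, 0.7152, 0.0722])) - 0.5) * 2.0
    u = (np.linspace(0.0, 1.0, W) - 0.5) * 2.0
    v = (np.linspace(0.0, 1.0, H) - 0.5) * 2.0
    out = np.zeros((H, W))
    for y in range(1, H - 1):
        for x in range(1, W - 1):
            acc = weights[0][7]  # bias applied once, as in the shader
            pos = 0
            for dy in (-1, 0, 1):        # same tap order as the WGSL loops
                for dx in (-1, 0, 1):
                    w = np.asarray(weights[pos])
                    px = (rgbd[y + dy, x + dx] - 0.5) * 2.0  # RGBD taps in [-1,1]
                    acc += float(px @ w[:4]) + w[4] * u[x] + w[5] * v[y] + w[6] * gray[y, x]
                    pos += 1
            out[y, x] = acc
    return out
```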
