From 561d1dc446db7d1d3e02b92b43abedf1a5017850 Mon Sep 17 00:00:00 2001
From: skal <pascal.massimino@gmail.com>
Date: Fri, 13 Feb 2026 12:32:36 +0100
Subject: CNN v2: Refactor to uniform 12D→4D architecture
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

**Architecture changes:**
- Static features (8D): p0-p3 (parametric) + uv_x, uv_y, sin(10×uv_x), bias
- Input RGBD (4D): kept separate from the static features; it fills the 4D "prev/input" slot when a layer has no previous layer output to read
- All layers: uniform 12D→4D (4 prev/input + 8 static → 4 output)
- Bias folded into the static features as a constant 1.0 channel (so PyTorch layers use bias=False)

**Weight calculations:**
- 3 layers × (12 in × 3×3 kernel × 4 out) = 3 × 432 = 1296 weights
- f16 (2 bytes per weight): 2592 bytes ≈ 2.6 KB (vs old variable arch: ~6.4 KB)

**Updated files:**

*Training (Python):*
- train_cnn_v2.py: Uniform model, takes input_rgbd + static_features
- export_cnn_v2_weights.py: Binary export for storage buffers
- export_cnn_v2_shader.py: Per-layer shader export (debugging)

*Shaders (WGSL):*
- cnn_v2_static.wgsl: p0-p3 parametric features (mips/gradients)
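- cnn_v2_compute.wgsl: 12D input, 4D output, vec4 packing; per-output-channel weight order is now prev/input channels (0-3) first, then static features (4-11)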

*Tools:*
- HTML tool (cnn_v2_test): Updated for 12D→4D, layer visualization

*Docs:*
- CNN_V2.md: Updated architecture, training, validation sections
- HOWTO.md: Reference HTML tool for validation

*Removed:*
- validate_cnn_v2.sh: Obsolete (used CNN v1 tool)

All code consistent with bias=False (bias in static features as 1.0).

handoff(Claude): CNN v2 architecture finalized and documented
---
 workspaces/main/shaders/cnn_v2/cnn_v2_compute.wgsl | 80 ++++++++++------------
 1 file changed, 36 insertions(+), 44 deletions(-)

(limited to 'workspaces/main/shaders/cnn_v2/cnn_v2_compute.wgsl')
diff --git a/workspaces/main/shaders/cnn_v2/cnn_v2_compute.wgsl b/workspaces/main/shaders/cnn_v2/cnn_v2_compute.wgsl
index 1e1704d..5c4b113 100644
--- a/workspaces/main/shaders/cnn_v2/cnn_v2_compute.wgsl
+++ b/workspaces/main/shaders/cnn_v2/cnn_v2_compute.wgsl
@@ -1,6 +1,6 @@
-// CNN v2 Compute Shader - Storage Buffer Version
-// Processes single layer per dispatch with weights from storage buffer
-// Multi-layer execution handled by C++ with ping-pong buffers
+// CNN v2 Compute Shader - Uniform 12D→4D Architecture
+// All layers: input/previous (4D) + static (8D) = 12D → 4 channels
+// Storage buffer weights, ping-pong execution
 
 // Push constants for layer parameters (passed per dispatch)
 struct LayerParams {
@@ -12,12 +12,12 @@ struct LayerParams {
   blend_amount: f32,      // [0,1] blend with original
 }
 
-@group(0) @binding(0) var static_features: texture_2d<u32>;       // 8-channel static features
-@group(0) @binding(1) var layer_input: texture_2d<u32>;           // Previous layer output (8-channel packed)
-@group(0) @binding(2) var output_tex: texture_storage_2d<rgba32uint, write>;  // Current layer output
+@group(0) @binding(0) var static_features: texture_2d<u32>;       // 8D static features (p0-p3 + spatial)
+@group(0) @binding(1) var layer_input: texture_2d<u32>;           // 4D previous/input (RGBD or prev layer)
+@group(0) @binding(2) var output_tex: texture_storage_2d<rgba32uint, write>;  // 4D output
 @group(0) @binding(3) var<storage, read> weights_buffer: array<u32>;  // Packed f16 weights
 @group(0) @binding(4) var<uniform> params: LayerParams;
-@group(0) @binding(5) var original_input: texture_2d<f32>;        // Original RGB input for blending
+@group(0) @binding(5) var original_input: texture_2d<f32>;        // Original RGB for blending
 
 fn unpack_static_features(coord: vec2<i32>) -> array<f32, 8> {
   let packed = textureLoad(static_features, coord, 0);
@@ -28,21 +28,19 @@ fn unpack_static_features(coord: vec2<i32>) -> array<f32, 8> {
   return array<f32, 8>(v0.x, v0.y, v1.x, v1.y, v2.x, v2.y, v3.x, v3.y);
 }
 
-fn unpack_layer_channels(coord: vec2<i32>) -> array<f32, 8> {
+fn unpack_layer_channels(coord: vec2<i32>) -> vec4<f32> {
   let packed = textureLoad(layer_input, coord, 0);
   let v0 = unpack2x16float(packed.x);
   let v1 = unpack2x16float(packed.y);
-  let v2 = unpack2x16float(packed.z);
-  let v3 = unpack2x16float(packed.w);
-  return array<f32, 8>(v0.x, v0.y, v1.x, v1.y, v2.x, v2.y, v3.x, v3.y);
+  return vec4<f32>(v0.x, v0.y, v1.x, v1.y);
 }
 
-fn pack_channels(values: array<f32, 8>) -> vec4<u32> {
+fn pack_channels(values: vec4<f32>) -> vec4<u32> {
   return vec4<u32>(
-    pack2x16float(vec2<f32>(values[0], values[1])),
-    pack2x16float(vec2<f32>(values[2], values[3])),
-    pack2x16float(vec2<f32>(values[4], values[5])),
-    pack2x16float(vec2<f32>(values[6], values[7]))
+    pack2x16float(vec2<f32>(values.x, values.y)),
+    pack2x16float(vec2<f32>(values.z, values.w)),
+    0u,  // Unused
+    0u   // Unused
   );
 }
 
@@ -68,19 +66,19 @@ fn main(@builtin(global_invocation_id) id: vec3<u32>) {
   }
 
   let kernel_size = params.kernel_size;
-  let in_channels = params.in_channels;
-  let out_channels = params.out_channels;
+  let in_channels = params.in_channels;  // Always 12 (4 prev + 8 static)
+  let out_channels = params.out_channels;  // Always 4
   let weight_offset = params.weight_offset;
   let is_output = params.is_output_layer != 0u;
 
   let kernel_radius = i32(kernel_size / 2u);
 
-  // Load static features (always 8D)
+  // Load static features (8D) and previous/input layer (4D)
   let static_feat = unpack_static_features(coord);
 
-  // Convolution per output channel
-  var output: array<f32, 8>;
-  for (var c: u32 = 0u; c < out_channels && c < 8u; c++) {
+  // Convolution: 12D input → 4D output
+  var output: vec4<f32> = vec4<f32>(0.0);
+  for (var c: u32 = 0u; c < 4u; c++) {
     var sum: f32 = 0.0;
 
     // Convolve over kernel
@@ -94,55 +92,49 @@ fn main(@builtin(global_invocation_id) id: vec3<u32>) {
           clamp(sample_coord.y, 0, i32(dims.y) - 1)
         );
 
-        // Load input features at this spatial location
+        // Load features at this spatial location
         let static_local = unpack_static_features(clamped);
-        let layer_local = unpack_layer_channels(clamped);
+        let layer_local = unpack_layer_channels(clamped);  // 4D
 
         // Weight index calculation
         let ky_idx = u32(ky + kernel_radius);
         let kx_idx = u32(kx + kernel_radius);
         let spatial_idx = ky_idx * kernel_size + kx_idx;
 
-        // Accumulate: static features (always 8 channels)
-        for (var i: u32 = 0u; i < 8u; i++) {
+        // Accumulate: previous/input channels (4D)
+        for (var i: u32 = 0u; i < 4u; i++) {
           let w_idx = weight_offset +
-                     c * in_channels * kernel_size * kernel_size +
+                     c * 12u * kernel_size * kernel_size +
                      i * kernel_size * kernel_size + spatial_idx;
-          sum += get_weight(w_idx) * static_local[i];
+          sum += get_weight(w_idx) * layer_local[i];
         }
 
-        // Accumulate: previous layer channels (in_channels - 8)
-        let prev_channels = in_channels - 8u;
-        for (var i: u32 = 0u; i < prev_channels && i < 8u; i++) {
+        // Accumulate: static features (8D)
+        for (var i: u32 = 0u; i < 8u; i++) {
           let w_idx = weight_offset +
-                     c * in_channels * kernel_size * kernel_size +
-                     (8u + i) * kernel_size * kernel_size + spatial_idx;
-          sum += get_weight(w_idx) * layer_local[i];
+                     c * 12u * kernel_size * kernel_size +
+                     (4u + i) * kernel_size * kernel_size + spatial_idx;
+          sum += get_weight(w_idx) * static_local[i];
         }
       }
     }
 
     // Activation
     if (is_output) {
-      output[c] = clamp(sum, 0.0, 1.0);  // Sigmoid approximation
+      output[c] = clamp(sum, 0.0, 1.0);
     } else {
       output[c] = max(0.0, sum);  // ReLU
     }
   }
 
-  // Zero unused channels
-  for (var c: u32 = out_channels; c < 8u; c++) {
-    output[c] = 0.0;
-  }
-
   // Blend with original on final layer
   if (is_output) {
     let original = textureLoad(original_input, coord, 0).rgb;
-    let result_rgb = vec3<f32>(output[0], output[1], output[2]);
+    let result_rgb = vec3<f32>(output.x, output.y, output.z);
     let blended = mix(original, result_rgb, params.blend_amount);
-    output[0] = blended.r;
-    output[1] = blended.g;
-    output[2] = blended.b;
+    output.x = blended.r;
+    output.y = blended.g;
+    output.z = blended.b;
   }
 
   textureStore(output_tex, coord, pack_channels(output));
-- 
cgit v1.2.3