From 561d1dc446db7d1d3e02b92b43abedf1a5017850 Mon Sep 17 00:00:00 2001 From: skal Date: Fri, 13 Feb 2026 12:32:36 +0100 Subject: CNN v2: Refactor to uniform 12D→4D architecture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **Architecture changes:** - Static features (8D): p0-p3 (parametric) + uv_x, uv_y, sin(10×uv_x), bias - Input RGBD (4D): fed separately to all layers - All layers: uniform 12D→4D (4 prev/input + 8 static → 4 output) - Bias integrated in static features (bias=False in PyTorch) **Weight calculations:** - 3 layers × (12 × 3×3 × 4) = 1296 weights - f16: 2.6 KB (vs old variable arch: ~6.4 KB) **Updated files:** *Training (Python):* - train_cnn_v2.py: Uniform model, takes input_rgbd + static_features - export_cnn_v2_weights.py: Binary export for storage buffers - export_cnn_v2_shader.py: Per-layer shader export (debugging) *Shaders (WGSL):* - cnn_v2_static.wgsl: p0-p3 parametric features (mips/gradients) - cnn_v2_compute.wgsl: 12D input, 4D output, vec4 packing *Tools:* - HTML tool (cnn_v2_test): Updated for 12D→4D, layer visualization *Docs:* - CNN_V2.md: Updated architecture, training, validation sections - HOWTO.md: Reference HTML tool for validation *Removed:* - validate_cnn_v2.sh: Obsolete (used CNN v1 tool) All code consistent with bias=False (bias in static features as 1.0). 
handoff(Claude): CNN v2 architecture finalized and documented --- workspaces/main/shaders/cnn_v2/cnn_v2_compute.wgsl | 80 ++++++++++------------ 1 file changed, 36 insertions(+), 44 deletions(-) (limited to 'workspaces/main/shaders/cnn_v2/cnn_v2_compute.wgsl') diff --git a/workspaces/main/shaders/cnn_v2/cnn_v2_compute.wgsl b/workspaces/main/shaders/cnn_v2/cnn_v2_compute.wgsl index 1e1704d..5c4b113 100644 --- a/workspaces/main/shaders/cnn_v2/cnn_v2_compute.wgsl +++ b/workspaces/main/shaders/cnn_v2/cnn_v2_compute.wgsl @@ -1,6 +1,6 @@ -// CNN v2 Compute Shader - Storage Buffer Version -// Processes single layer per dispatch with weights from storage buffer -// Multi-layer execution handled by C++ with ping-pong buffers +// CNN v2 Compute Shader - Uniform 12D→4D Architecture +// All layers: input/previous (4D) + static (8D) = 12D → 4 channels +// Storage buffer weights, ping-pong execution // Push constants for layer parameters (passed per dispatch) struct LayerParams { @@ -12,12 +12,12 @@ struct LayerParams { blend_amount: f32, // [0,1] blend with original } -@group(0) @binding(0) var static_features: texture_2d<u32>; // 8-channel static features -@group(0) @binding(1) var layer_input: texture_2d<u32>; // Previous layer output (8-channel packed) -@group(0) @binding(2) var output_tex: texture_storage_2d<rgba32uint, write>; // Current layer output +@group(0) @binding(0) var static_features: texture_2d<u32>; // 8D static features (p0-p3 + spatial) +@group(0) @binding(1) var layer_input: texture_2d<u32>; // 4D previous/input (RGBD or prev layer) +@group(0) @binding(2) var output_tex: texture_storage_2d<rgba32uint, write>; // 4D output @group(0) @binding(3) var<storage, read> weights_buffer: array<u32>; // Packed f16 weights @group(0) @binding(4) var<uniform> params: LayerParams; -@group(0) @binding(5) var original_input: texture_2d<f32>; // Original RGB input for blending +@group(0) @binding(5) var original_input: texture_2d<f32>; // Original RGB for blending fn unpack_static_features(coord: vec2<i32>) -> array<f32, 8> { let packed = textureLoad(static_features, coord, 0); @@ 
-28,21 +28,19 @@ fn unpack_static_features(coord: vec2<i32>) -> array<f32, 8> { return array(v0.x, v0.y, v1.x, v1.y, v2.x, v2.y, v3.x, v3.y); } -fn unpack_layer_channels(coord: vec2<i32>) -> array<f32, 8> { +fn unpack_layer_channels(coord: vec2<i32>) -> vec4<f32> { let packed = textureLoad(layer_input, coord, 0); let v0 = unpack2x16float(packed.x); let v1 = unpack2x16float(packed.y); - let v2 = unpack2x16float(packed.z); - let v3 = unpack2x16float(packed.w); - return array(v0.x, v0.y, v1.x, v1.y, v2.x, v2.y, v3.x, v3.y); + return vec4(v0.x, v0.y, v1.x, v1.y); } -fn pack_channels(values: array<f32, 8>) -> vec4<u32> { +fn pack_channels(values: vec4<f32>) -> vec4<u32> { return vec4( - pack2x16float(vec2(values[0], values[1])), - pack2x16float(vec2(values[2], values[3])), - pack2x16float(vec2(values[4], values[5])), - pack2x16float(vec2(values[6], values[7])) + pack2x16float(vec2(values.x, values.y)), + pack2x16float(vec2(values.z, values.w)), + 0u, // Unused + 0u // Unused ); } @@ -68,19 +66,19 @@ fn main(@builtin(global_invocation_id) id: vec3<u32>) { } let kernel_size = params.kernel_size; - let in_channels = params.in_channels; - let out_channels = params.out_channels; + let in_channels = params.in_channels; // Always 12 (4 prev + 8 static) + let out_channels = params.out_channels; // Always 4 let weight_offset = params.weight_offset; let is_output = params.is_output_layer != 0u; let kernel_radius = i32(kernel_size / 2u); - // Load static features (always 8D) + // Load static features (8D) and previous/input layer (4D) let static_feat = unpack_static_features(coord); - // Convolution per output channel - var output: array<f32, 8>; - for (var c: u32 = 0u; c < out_channels && c < 8u; c++) { + // Convolution: 12D input → 4D output + var output: vec4<f32> = vec4(0.0); + for (var c: u32 = 0u; c < 4u; c++) { var sum: f32 = 0.0; // Convolve over kernel @@ -94,55 +92,49 @@ fn main(@builtin(global_invocation_id) id: vec3<u32>) { clamp(sample_coord.y, 0, i32(dims.y) - 1) ); - // Load input features at this 
spatial location let static_local = unpack_static_features(clamped); - let layer_local = unpack_layer_channels(clamped); + let layer_local = unpack_layer_channels(clamped); // 4D // Weight index calculation let ky_idx = u32(ky + kernel_radius); let kx_idx = u32(kx + kernel_radius); let spatial_idx = ky_idx * kernel_size + kx_idx; - // Accumulate: static features (always 8 channels) - for (var i: u32 = 0u; i < 8u; i++) { + // Accumulate: previous/input channels (4D) + for (var i: u32 = 0u; i < 4u; i++) { let w_idx = weight_offset + - c * in_channels * kernel_size * kernel_size + + c * 12u * kernel_size * kernel_size + i * kernel_size * kernel_size + spatial_idx; - sum += get_weight(w_idx) * static_local[i]; + sum += get_weight(w_idx) * layer_local[i]; } - // Accumulate: previous layer channels (in_channels - 8) - let prev_channels = in_channels - 8u; - for (var i: u32 = 0u; i < prev_channels && i < 8u; i++) { + // Accumulate: static features (8D) + for (var i: u32 = 0u; i < 8u; i++) { let w_idx = weight_offset + - c * in_channels * kernel_size * kernel_size + - (8u + i) * kernel_size * kernel_size + spatial_idx; - sum += get_weight(w_idx) * layer_local[i]; + c * 12u * kernel_size * kernel_size + + (4u + i) * kernel_size * kernel_size + spatial_idx; + sum += get_weight(w_idx) * static_local[i]; } } } // Activation if (is_output) { - output[c] = clamp(sum, 0.0, 1.0); // Sigmoid approximation + output[c] = clamp(sum, 0.0, 1.0); } else { output[c] = max(0.0, sum); // ReLU } } - // Zero unused channels - for (var c: u32 = out_channels; c < 8u; c++) { - output[c] = 0.0; - } - // Blend with original on final layer if (is_output) { let original = textureLoad(original_input, coord, 0).rgb; - let result_rgb = vec3(output[0], output[1], output[2]); + let result_rgb = vec3(output.x, output.y, output.z); let blended = mix(original, result_rgb, params.blend_amount); - output[0] = blended.r; - output[1] = blended.g; - output[2] = blended.b; + output.x = blended.r; + output.y = 
blended.g; + output.z = blended.b; } textureStore(output_tex, coord, pack_channels(output)); -- cgit v1.2.3