6 files changed, 100 insertions, 136 deletions
diff --git a/training/diagnose_255_to_253.md b/training/diagnose_255_to_253.md
deleted file mode 100644
index 764d328..0000000
--- a/training/diagnose_255_to_253.md
+++ /dev/null
@@ -1,69 +0,0 @@
-# Diagnosis: 255 → 253 Loss (-2 LSBs)
-
-## Findings
-
-### F16 Precision
-✅ **No loss:** 1.0 → f16(0x3c00) → 1.0 (exact round-trip)
-
-### Visualization Scale
-⚠️ **Inconsistent:**
-- Layer 1 uses `vizScale = 0.5` (line 1530)
-- Should render as 128, not 253
-- **User seeing 253 suggests viewing Static Features (scale=1.0), not CNN output**
-
-### Suspected Issue: Input Alpha Channel
-
-**Code:** `tools/cnn_v2_test/index.html` line 1233
-```javascript
-depthData[i] = pixels[i * 4 + 3] / 255.0;  // Alpha from canvas
-```
-
-**Hypothesis:** Input PNG alpha channel = 253 (not 255)
-- Browsers may set alpha < 255 for certain images
-- Pre-multiplied alpha corrections
-- PNG encoder compression artifacts
-
-### Test
-
-**Check input alpha:**
-```javascript
-// In HTML tool console:
-const canvas = document.createElement('canvas');
-canvas.width = tester.image.width;
-canvas.height = tester.image.height;
-const ctx = canvas.getContext('2d');
-ctx.drawImage(tester.image, 0, 0);
-const imgData = ctx.getImageData(0, 0, canvas.width, canvas.height);
-const alpha = imgData.data[3];  // First pixel alpha
-console.log('First pixel alpha:', alpha);
-```
-
-### Alternative: C++ Reference
-
-Check if `cnn_test` tool produces same -2 loss:
-```bash
-# Generate solid white 8×8 test image with alpha=255
-python3 -c "
-from PIL import Image
-import numpy as np
-img = np.ones((8, 8, 4), dtype=np.uint8) * 255
-Image.fromarray(img, 'RGBA').save('test_white_255.png')
-print('Created test_white_255.png: all pixels RGBA=(255,255,255,255)')
-"
-
-# Test with HTML tool → check if p3 = 1.0 or 0.9921875
-# Test with cnn_test → compare output
-./build/cnn_test test_white_255.png output.png --cnn-version 2 --debug-hex
-```
-
-### Next Steps
-
-1. **Verify input:** Check alpha channel of user's input image
-2. **Add debug:** Log first pixel RGBA values in HTML tool
-3. **Compare:** Run same image through C++ cnn_test
-4. **Isolate:** Test with synthetic 255 alpha image
-
-## Conclusion
-
-**Most likely:** Input image alpha ≠ 255, already 253 before CNN processing.
-**Verify:** User should check input PNG metadata and alpha channel values.
diff --git a/training/export_cnn_v2_weights.py b/training/export_cnn_v2_weights.py
index 1086516..f64bd8d 100755
--- a/training/export_cnn_v2_weights.py
+++ b/training/export_cnn_v2_weights.py
@@ -12,7 +12,7 @@ import struct
 from pathlib import Path
 
 
-def export_weights_binary(checkpoint_path, output_path):
+def export_weights_binary(checkpoint_path, output_path, quiet=False):
     """Export CNN v2 weights to binary format.
 
     Binary format:
@@ -40,7 +40,8 @@ def export_weights_binary(checkpoint_path, output_path):
     Returns:
         config dict for shader generation
     """
-    print(f"Loading checkpoint: {checkpoint_path}")
+    if not quiet:
+        print(f"Loading checkpoint: {checkpoint_path}")
     checkpoint = torch.load(checkpoint_path, map_location='cpu')
 
     state_dict = checkpoint['model_state_dict']
@@ -59,11 +60,12 @@ def export_weights_binary(checkpoint_path, output_path):
     num_layers = config.get('num_layers', len(kernel_sizes))
     mip_level = config.get('mip_level', 0)
 
-    print(f"Configuration:")
-    print(f"  Kernel sizes: {kernel_sizes}")
-    print(f"  Layers: {num_layers}")
-    print(f"  Mip level: {mip_level} (p0-p3 features)")
-    print(f"  Architecture: uniform 12D→4D (bias=False)")
+    if not quiet:
+        print(f"Configuration:")
+        print(f"  Kernel sizes: {kernel_sizes}")
+        print(f"  Layers: {num_layers}")
+        print(f"  Mip level: {mip_level} (p0-p3 features)")
+        print(f"  Architecture: uniform 12D→4D (bias=False)")
 
     # Collect layer info - all layers uniform 12D→4D
     layers = []
@@ -89,7 +91,8 @@ def export_weights_binary(checkpoint_path, output_path):
         all_weights.extend(layer_flat)
         weight_offset += len(layer_flat)
 
-        print(f"  Layer {i}: 12D→4D, {kernel_size}×{kernel_size}, {len(layer_flat)} weights")
+        if not quiet:
+            print(f"  Layer {i}: 12D→4D, {kernel_size}×{kernel_size}, {len(layer_flat)} weights")
 
     # Convert to f16
     # TODO: Use 8-bit quantization for 2× size reduction
@@ -104,11 +107,13 @@ def export_weights_binary(checkpoint_path, output_path):
     # Pack pairs using numpy view
     weights_u32 = all_weights_f16.view(np.uint32)
 
-    print(f"\nWeight statistics:")
-    print(f"  Total layers: {len(layers)}")
-    print(f"  Total weights: {len(all_weights_f16)} (f16)")
-    print(f"  Packed: {len(weights_u32)} u32")
-    print(f"  Binary size: {20 + len(layers) * 20 + len(weights_u32) * 4} bytes")
+    binary_size = 20 + len(layers) * 20 + len(weights_u32) * 4
+    if not quiet:
+        print(f"\nWeight statistics:")
+        print(f"  Total layers: {len(layers)}")
+        print(f"  Total weights: {len(all_weights_f16)} (f16)")
+        print(f"  Packed: {len(weights_u32)} u32")
+        print(f"  Binary size: {binary_size} bytes")
 
     # Write binary file
     output_path = Path(output_path)
@@ -135,7 +140,10 @@ def export_weights_binary(checkpoint_path, output_path):
         # Weights (u32 packed f16 pairs)
         f.write(weights_u32.tobytes())
 
-    print(f"  → {output_path}")
+    if quiet:
+        print(f"  Exported {num_layers} layers, {len(all_weights_f16)} weights, {binary_size} bytes → {output_path}")
+    else:
+        print(f"  → {output_path}")
 
     return {
         'num_layers': len(layers),
@@ -257,15 +265,19 @@ def main():
                         help='Output binary weights file')
     parser.add_argument('--output-shader', type=str, default='workspaces/main/shaders',
                         help='Output directory for shader template')
+    parser.add_argument('--quiet', action='store_true',
+                        help='Suppress detailed output')
 
     args = parser.parse_args()
 
-    print("=== CNN v2 Weight Export ===\n")
-    config = export_weights_binary(args.checkpoint, args.output_weights)
-    print()
-    # Shader is manually maintained in cnn_v2_compute.wgsl
-    # export_shader_template(config, args.output_shader)
-    print("\nExport complete!")
+    if not args.quiet:
+        print("=== CNN v2 Weight Export ===\n")
+    config = export_weights_binary(args.checkpoint, args.output_weights, quiet=args.quiet)
+    if not args.quiet:
+        print()
+        # Shader is manually maintained in cnn_v2_compute.wgsl
+        # export_shader_template(config, args.output_shader)
+        print("\nExport complete!")
 
 
 if __name__ == '__main__':
diff --git a/training/gen_identity_weights.py b/training/gen_identity_weights.py
index a84ea87..7865d68 100755
--- a/training/gen_identity_weights.py
+++ b/training/gen_identity_weights.py
@@ -4,8 +4,16 @@
 Creates trivial .bin with 1 layer, 1×1 kernel, identity passthrough.
 Output Ch{0,1,2,3} = Input Ch{0,1,2,3} (ignores static features).
 
+With --mix: Output Ch{i} = 0.5*prev[i] + 0.5*static_p{4+i}
+  (50-50 blend of prev layer with uv_x, uv_y, sin20_y, bias)
+
+With --p47: Output Ch{i} = static p{4+i} (uv_x, uv_y, sin20_y, bias)
+  (p4/uv_x→ch0, p5/uv_y→ch1, p6/sin20_y→ch2, p7/bias→ch3)
+
 Usage:
   ./training/gen_identity_weights.py [output.bin]
+  ./training/gen_identity_weights.py --mix [output.bin]
+  ./training/gen_identity_weights.py --p47 [output.bin]
 """
 
 import argparse
@@ -14,9 +22,15 @@ import struct
 from pathlib import Path
 
 
-def generate_identity_weights(output_path, kernel_size=1, mip_level=0):
+def generate_identity_weights(output_path, kernel_size=1, mip_level=0, mix=False, p47=False):
     """Generate identity weights: output = input (ignores static features).
 
+    If mix=True, 50-50 blend: out[i] = 0.5*prev[i] + 0.5*p{4+i} for i in 0..3 (avoids overflow).
+    If p47=True, transfers static p4-p7 (uv_x, uv_y, sin20_y, bias) to output channels.
+
+    Input channel layout: [0-3: prev layer, 4-11: static (p0-p7)]
+    Static features: p0-p3 (RGB+D), p4 (uv_x), p5 (uv_y), p6 (sin20_y), p7 (bias)
+
     Binary format:
       Header (20 bytes):
         uint32 magic ('CNN2')
@@ -34,7 +48,8 @@ def generate_identity_weights(output_path, kernel_size=1, mip_level=0):
 
       Weights (u32 packed f16):
         Identity matrix for first 4 input channels
-        Zeros for static features (channels 4-11)
+        Zeros for static features (channels 4-11) OR
+        Blend matrix (0.5*prev[i] + 0.5*p{4+i}) if mix=True, or p{4+i}→ch{i} if p47=True
     """
     # Identity: 4 output channels, 12 input channels
     # Weight shape: [out_ch, in_ch, kernel_h, kernel_w]
@@ -47,19 +62,37 @@ def generate_identity_weights(output_path, kernel_size=1, mip_level=0):
     # Center position for kernel
     center = kernel_size // 2
 
-    # Set diagonal to 1.0 (output ch i = input ch i)
-    for i in range(out_channels):
-        weights[i, i, center, center] = 1.0
+    if p47:
+        # p47 mode: p4→ch0, p5→ch1, p6→ch2, p7→ch3 (static features only)
+        # Input channels: [0-3: prev layer, 4-11: static features (p0-p7)]
+        # p4-p7 are at input channels 8-11
+        for i in range(out_channels):
+            weights[i, i + 8, center, center] = 1.0
+    elif mix:
+        # Mix mode: 50-50 blend, out[i] = 0.5*prev[i] + 0.5*p{4+i}
+        # prev[0-3] are at input channels 0-3 (prev layer), static p4-p7 at channels 8-11
+        for i in range(out_channels):
+            weights[i, i, center, center] = 0.5       # 0.5*p{i} (prev layer)
+            weights[i, i + 8, center, center] = 0.5   # 0.5*p{i+4} (static)
+    else:
+        # Identity: output ch i = input ch i
+        for i in range(out_channels):
+            weights[i, i, center, center] = 1.0
 
     # Flatten
     weights_flat = weights.flatten()
     weight_count = len(weights_flat)
 
-    print(f"Generating identity weights:")
+    mode_name = 'p47' if p47 else ('mix' if mix else 'identity')
+    print(f"Generating {mode_name} weights:")
     print(f"  Kernel size: {kernel_size}×{kernel_size}")
     print(f"  Channels: 12D→4D")
     print(f"  Weights: {weight_count}")
     print(f"  Mip level: {mip_level}")
+    if mix:
+        print(f"  Mode: 0.5*prev[i] + 0.5*static_p{{4+i}} (blend with uv/sin/bias)")
+    elif p47:
+        print(f"  Mode: p4→ch0, p5→ch1, p6→ch2, p7→ch3")
 
     # Convert to f16
     weights_f16 = np.array(weights_flat, dtype=np.float16)
@@ -122,11 +155,15 @@ def main():
                        help='Kernel size (default: 1×1)')
     parser.add_argument('--mip-level', type=int, default=0,
                        help='Mip level for p0-p3 features (default: 0)')
+    parser.add_argument('--mix', action='store_true',
+                       help='Mix mode: 50-50 blend of p0-p3 and p4-p7')
+    parser.add_argument('--p47', action='store_true',
+                       help='Static features only: p4→ch0, p5→ch1, p6→ch2, p7→ch3')
 
     args = parser.parse_args()
 
     print("=== Identity Weight Generator ===\n")
-    generate_identity_weights(args.output, args.kernel_size, args.mip_level)
+    generate_identity_weights(args.output, args.kernel_size, args.mip_level, args.mix, args.p47)
     print("\nDone!")
 
 
diff --git a/training/layers/chk_10000_3x5x3x3.pt b/training/layers/chk_10000_3x5x3x3.pt
new file mode 100644
index 0000000..6e6750c
--- /dev/null
+++ b/training/layers/chk_10000_3x5x3x3.pt
diff --git a/training/test_viz_precision.py b/training/test_viz_precision.py
deleted file mode 100755
index 143f4ea..0000000
--- a/training/test_viz_precision.py
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/usr/bin/env python3
-"""Test WebGPU → Canvas → PNG precision loss
-
-Check if bgra8unorm → 2D canvas → PNG loses 2 LSBs.
-"""
-
-import numpy as np
-
-# Simulate WebGPU bgra8unorm conversion
-# Float [0, 1] → uint8 [0, 255]
-
-test_values = [
-    1.0,      # Perfect white
-    0.9999,   # Near-white
-    254.5/255,  # Exactly 254.5
-    253.5/255,  # Exactly 253.5
-]
-
-for val in test_values:
-    # WebGPU bgra8unorm: round(val * 255)
-    gpu_u8 = int(np.round(val * 255))
-
-    # Convert back to normalized
-    gpu_f32 = gpu_u8 / 255.0
-
-    # JavaScript canvas getImageData: uint8
-    canvas_u8 = int(np.round(gpu_f32 * 255))
-
-    print(f"Input: {val:.6f} → GPU u8: {gpu_u8} → Canvas: {canvas_u8}")
-    if canvas_u8 != 255:
-        print(f"  ⚠️ Lost {255 - canvas_u8} LSBs")
-
-print("\nConclusion:")
-print("If WebGPU stores 1.0 as 255, canvas should read 255.")
-print("If user sees 253, likely:")
-print("  a) Not viewing CNN layer (viewing static features at scale=1.0)")
-print("  b) Value in texture is already 253/255 = 0.9921875")
-print("  c) F16 storage or unpacking issue")
diff --git a/training/train_cnn_v2.py b/training/train_cnn_v2.py
index 134a5ae..9e5df2f 100755
--- a/training/train_cnn_v2.py
+++ b/training/train_cnn_v2.py
@@ -121,7 +121,7 @@ class CNNv2(nn.Module):
         # Layer 0: input RGBD (4D) + static (8D) = 12D
         x = torch.cat([input_rgbd, static_features], dim=1)
         x = self.layers[0](x)
-        x = torch.clamp(x, 0, 1)  # Output [0,1] for layer 0
+        x = torch.sigmoid(x)  # Soft [0,1] for layer 0
 
         # Layer 1+: previous (4D) + static (8D) = 12D
         for i in range(1, self.num_layers):
@@ -130,7 +130,7 @@ class CNNv2(nn.Module):
             if i < self.num_layers - 1:
                 x = F.relu(x)
             else:
-                x = torch.clamp(x, 0, 1)  # Final output [0,1]
+                x = torch.sigmoid(x)  # Soft [0,1] for final layer
 
         return x
 
@@ -329,6 +329,9 @@ def train(args):
     kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]
     if len(kernel_sizes) == 1:
         kernel_sizes = kernel_sizes * args.num_layers
+    else:
+        # When multiple kernel sizes provided, derive num_layers from list length
+        args.num_layers = len(kernel_sizes)
 
     # Create model
     model = CNNv2(kernel_sizes=kernel_sizes, num_layers=args.num_layers).to(device)
@@ -397,6 +400,25 @@ def train(args):
             }, checkpoint_path)
             print(f"  → Saved checkpoint: {checkpoint_path}")
 
+    # Always save final checkpoint
+    print()  # Newline after training
+    final_checkpoint = Path(args.checkpoint_dir) / f"checkpoint_epoch_{args.epochs}.pth"
+    final_checkpoint.parent.mkdir(parents=True, exist_ok=True)
+    torch.save({
+            'epoch': args.epochs,
+            'model_state_dict': model.state_dict(),
+            'optimizer_state_dict': optimizer.state_dict(),
+            'loss': avg_loss,
+            'config': {
+                'kernel_sizes': kernel_sizes,
+                'num_layers': args.num_layers,
+                'mip_level': args.mip_level,
+                'grayscale_loss': args.grayscale_loss,
+                'features': ['p0', 'p1', 'p2', 'p3', 'uv.x', 'uv.y', 'sin20_y', 'bias']
+            }
+    }, final_checkpoint)
+    print(f"  → Saved final checkpoint: {final_checkpoint}")
+
     print(f"\nTraining complete! Total time: {time.time() - start_time:.1f}s")
     return model