summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doc/CNN_TEST_TOOL.md174
-rw-r--r--doc/HOWTO.md40
-rw-r--r--tools/cnn_test.cc666
3 files changed, 753 insertions, 127 deletions
diff --git a/doc/CNN_TEST_TOOL.md b/doc/CNN_TEST_TOOL.md
index e7d679e..ee0d9c5 100644
--- a/doc/CNN_TEST_TOOL.md
+++ b/doc/CNN_TEST_TOOL.md
@@ -1,31 +1,37 @@
# CNN Shader Testing Tool
-Standalone tool for validating trained CNN shaders with GPU-to-CPU readback.
+Standalone tool for validating trained CNN shaders with GPU-to-CPU readback. Supports both CNN v1 (render pipeline) and v2 (compute, storage buffer).
---
## Purpose
-- Validate trained weights (`cnn_weights_generated.wgsl`) against ground truth
+- Validate trained weights against ground truth
- Debug CNN layer behavior in isolation
-- Generate test outputs for patch-based training workflow
-- Match Python training script's inference mode (`train_cnn.py --infer`)
+- Generate test outputs for training workflow
+- Match Python training script's inference mode
---
## Architecture
-**Two-part implementation:**
+**Two implementations:**
-1. **Core GPU utility:** `src/gpu/texture_readback.{h,cc}` (~150 lines)
- - Synchronous texture-to-CPU readback
- - Reusable for screenshots, validation, video export
- - Protected with STRIP_ALL (0 bytes in release builds)
+1. **CNN v1** (render pipeline, texture atlas weights)
+ - 3 fixed layers
+ - RGBA16Float intermediates
+ - BGRA8Unorm final output
-2. **Standalone tool:** `tools/cnn_test.cc` (~450 lines)
- - Custom CNN inference pipeline
- - No MainSequence dependency
- - Asset-based shader loading with automatic include resolution
+2. **CNN v2** (compute shaders, storage buffer weights)
+ - Dynamic layer count from binary
+ - 7D static features (RGBD + UV + sin + bias)
+ - RGBA32Uint packed f16 intermediates
+ - Storage buffer: ~3-5 KB weights
+
+**Core GPU utility:** `src/gpu/texture_readback.{h,cc}`
+- Synchronous texture-to-CPU readback
+- Supports RGBA16Float, RGBA32Uint, BGRA8Unorm
+- Protected with STRIP_ALL (0 bytes in release)
---
@@ -35,24 +41,28 @@ Standalone tool for validating trained CNN shaders with GPU-to-CPU readback.
cnn_test input.png output.png [OPTIONS]
OPTIONS:
- --blend F Final blend amount (0.0-1.0, default: 1.0)
- --format ppm|png Output format (default: png)
- --help Show usage
+ --cnn-version N CNN version: 1 (default) or 2
+ --blend F Final blend amount (0.0-1.0, default: 1.0)
+ --format ppm|png Output format (default: png)
+ --layers N Number of CNN layers (1-10, v1 only, default: 3)
+ --save-intermediates DIR Save intermediate layers to directory
+ --debug-hex Print first 8 pixels as hex (debug)
+ --help Show usage
```
**Examples:**
```bash
-# Full CNN processing
-./build/cnn_test input.png output.png
+# CNN v1 (render pipeline, 3 layers)
+./build/cnn_test input.png output.png --cnn-version 1
-# 50% blend with original
-./build/cnn_test input.png output.png --blend 0.5
+# CNN v2 (compute, storage buffer, dynamic layers)
+./build/cnn_test input.png output.png --cnn-version 2
-# No CNN effect (original passthrough)
-./build/cnn_test input.png output.png --blend 0.0
+# 50% blend with original (v2)
+./build/cnn_test input.png output.png --cnn-version 2 --blend 0.5
-# PPM output format
-./build/cnn_test input.png output.ppm --format ppm
+# Debug hex dump
+./build/cnn_test input.png output.png --cnn-version 2 --debug-hex
```
---
@@ -90,25 +100,24 @@ std::vector<uint8_t> OffscreenRenderTarget::read_pixels() {
}
```
-### CNN Processing Pipeline
+### CNN v1 Pipeline (Render)
-**Fixed 3-layer architecture** (matches trained CNN):
-1. Layer 0: Initial convolution
-2. Layer 1: Intermediate convolution
-3. Layer 2: Final convolution + blend with original
+**Fixed 3-layer architecture:**
+- Ping-pong RGBA16Float textures
+- CNNLayerParams (binding 3): layer_index, blend_amount
+- Shader composer resolves #include directives
-**Ping-pong textures:**
-- 2 intermediate render targets
-- 1 original input reference (binding 4)
+### CNN v2 Pipeline (Compute)
-**Uniforms:**
-- `CommonPostProcessUniforms` (binding 2): resolution, aspect_ratio, time, beat, audio_intensity
-- `CNNLayerParams` (binding 3): layer_index, blend_amount
+**Dynamic layer architecture:**
+1. **Static features compute:** Generate 7D features (RGBD + UV + sin + bias)
+2. **Layer computes:** N layers from binary weights (3-5 typically)
+ - Storage buffer weights (read-only)
+ - RGBA32Uint packed f16 textures (ping-pong)
+ - CNNv2LayerParams: kernel_size, channels, weight_offset, blend
+3. **Readback:** RGBA32Uint → f16 decode → u8 clamp
-**Shader composition:**
-- Uses `ShaderComposer::Get()` via `RenderPipelineBuilder`
-- Automatically resolves `#include` directives
-- Registers CNN snippets: activation, conv3×3, conv5×5, weights
+**Binary format:** Header (20B) + layer info (20B×N) + f16 weights
---
@@ -144,51 +153,34 @@ cmake --build build -j4
---
-## Validation Workflow
+## Validation Workflow (CNN v2)
-### 1. Ground Truth Generation
+### 1. Train and Export
```bash
-# Generate ground truth from Python
-./training/train_cnn.py --infer test.png \
- --export-only training/checkpoints/checkpoint_epoch_5000.pth \
- --output ground_truth.png
+# Train and export weights
+./scripts/train_cnn_v2_full.sh --epochs 200 --batch-size 16
```
### 2. Tool Inference
```bash
-# Run tool (always 3 layers, matching trained CNN)
-./build/cnn_test test.png tool_output.png --blend 1.0
+# Run tool with v2
+./build/cnn_test training/input/img_000.png output.png --cnn-version 2
```
-### 3. Comparison
-```bash
-# Compare (MSE should be low)
-python -c "
-import numpy as np
-from PIL import Image
-gt = np.array(Image.open('ground_truth.png'))
-out = np.array(Image.open('tool_output.png'))
-mse = np.mean((gt.astype(float) - out.astype(float)) ** 2)
-print(f'MSE: {mse:.4f}')
-assert mse < 10.0, f'MSE too high: {mse}'
-"
-```
+### 3. Visual Comparison
+Compare output.png with training/target_X/img_000.png
---
-## Known Issues
+## Status
-**BUG: CNN produces incorrect output (all white)**
-- Readback works correctly (see Technical Notes below)
-- Shader compiles and executes without errors
-- Output is all white (255) regardless of input or blend setting
-- **Likely causes:**
- - Uniform buffer layout mismatch between C++ and WGSL
- - Texture binding issue (input not sampled correctly)
- - Weight matrix initialization problem
-- CNNEffect works correctly in demo (visual validation confirms)
-- **Status:** Under investigation - rendering pipeline differs from demo's CNNEffect
-- **Workaround:** Use CNNEffect visual validation in demo until tool fixed
+**CNN v1:** Builds and runs, produces incorrect output (all white). Use CNNEffect in demo for visual validation.
+
+**CNN v2:** ✅ Fully functional. Tested and working.
+- Loads binary weights from `workspaces/main/weights/cnn_v2_weights.bin`
+- Matches CNNv2Effect architecture
+- Produces correct output
+- Recommended for validation
---
@@ -214,41 +206,25 @@ assert mse < 10.0, f'MSE too high: {mse}'
## Limitations
-- **Fixed layer count:** Cannot run partial networks (3 layers hardcoded)
+- **CNN v1:** Produces incorrect output, use for debugging only
- **Single image:** Batch processing requires shell loop
- **No real-time preview:** Offline processing only
-- **PNG input only:** Uses stb_image (JPEG/PNG/BMP/TGA supported)
-
----
-
-## Future Enhancements
-
-- Batch processing (directory input)
-- Interactive preview mode
-- Per-layer weight inspection
-- Checksum validation against training checkpoints
-- CUDA/Metal direct backends (bypass WebGPU overhead)
+- **Image input:** stb_image loader (PNG/JPEG/BMP/TGA supported)
---
## Technical Notes
-**Number of layers is fixed by trained CNN architecture:**
-- Defined in `cnn_weights_generated.wgsl`
-- Cannot meaningfully run partial networks (layer outputs have different formats/ranges)
-- Tool always processes full 3-layer stack
-
-**Blend parameter:**
-- Applied only to final layer (layer 2)
-- Intermediate layers always use blend=1.0
-- `mix(input, cnn_output, blend_amount)` in shader
+**CNN v2 f16 decoding:**
+- RGBA32Uint texture stores 8×f16 as 4×u32
+- Custom decoder: extract u16, decode f16→f32, clamp [0,1]→u8
+- Handles denormals, infinity, NaN
**Cross-platform:**
-- Tested on macOS (native WebGPU)
-- Builds on Windows via mingw-w64 cross-compile
-- Linux support via native WebGPU
+- macOS, Linux (native WebGPU)
+- Windows (mingw-w64 cross-compile)
**Size impact:**
-- Debug/STRIP_ALL=OFF: ~150 lines compiled
-- STRIP_ALL=ON: 0 bytes (entirely compiled out)
-- FINAL_STRIP=ON: 0 bytes (tool not built)
+- Debug/STRIP_ALL=OFF: readback utility compiled in
+- STRIP_ALL=ON: 0 bytes (compiled out)
+- FINAL_STRIP=ON: tool not built
diff --git a/doc/HOWTO.md b/doc/HOWTO.md
index 1e8b58b..85ce801 100644
--- a/doc/HOWTO.md
+++ b/doc/HOWTO.md
@@ -264,40 +264,24 @@ See `doc/ASSET_SYSTEM.md` and `doc/WORKSPACE_SYSTEM.md`.
### Offline Shader Validation
-**Note:** Tool builds and runs but produces incorrect output. Use CNNEffect visual validation in demo. See `doc/CNN_TEST_TOOL.md`.
-
```bash
-# Test trained CNN on PNG input
-./build/cnn_test input.png output.png
-
-# Adjust blend amount (0.0 = original, 1.0 = full CNN)
-./build/cnn_test input.png output.png --blend 0.5
-
-# PPM output format
-./build/cnn_test input.png output.ppm --format ppm
-```
+# CNN v2 (recommended, fully functional)
+./build/cnn_test input.png output.png --cnn-version 2
-### Ground Truth Comparison
-```bash
-# Generate Python ground truth
-./training/train_cnn.py --infer input.png \
- --export-only checkpoints/checkpoint_epoch_1000.pth \
- --output ground_truth.png
+# CNN v1 (produces incorrect output, debug only)
+./build/cnn_test input.png output.png --cnn-version 1
-# Run tool
-./build/cnn_test input.png tool_output.png
+# Adjust blend (0.0 = original, 1.0 = full CNN)
+./build/cnn_test input.png output.png --cnn-version 2 --blend 0.5
-# Compare (Python required)
-python3 -c "
-import numpy as np
-from PIL import Image
-gt = np.array(Image.open('ground_truth.png').convert('RGB'))
-out = np.array(Image.open('tool_output.png').convert('RGB'))
-mse = np.mean((gt.astype(float) - out.astype(float)) ** 2)
-print(f'MSE: {mse:.4f} (target: < 10.0)')
-"
+# Debug hex dump (first 8 pixels)
+./build/cnn_test input.png output.png --cnn-version 2 --debug-hex
```
+**Status:**
+- **CNN v2:** ✅ Fully functional, matches CNNv2Effect
+- **CNN v1:** ⚠️ Produces incorrect output, use CNNEffect in demo for validation
+
See `doc/CNN_TEST_TOOL.md` for full documentation.
---
diff --git a/tools/cnn_test.cc b/tools/cnn_test.cc
index c2983a9..5823110 100644
--- a/tools/cnn_test.cc
+++ b/tools/cnn_test.cc
@@ -28,6 +28,7 @@
#include <cstdlib>
#include <cstring>
#include <vector>
+#include <cmath>
// Helper to get asset string or empty string
static const char* SafeGetAsset(AssetId id) {
@@ -44,6 +45,7 @@ struct Args {
const char* save_intermediates = nullptr;
int num_layers = 3; // Default to 3 layers
bool debug_hex = false; // Print first 8 pixels as hex
+ int cnn_version = 1; // 1=CNNEffect, 2=CNNv2Effect
};
// Parse command-line arguments
@@ -83,6 +85,12 @@ static bool parse_args(int argc, char** argv, Args* args) {
}
} else if (strcmp(argv[i], "--debug-hex") == 0) {
args->debug_hex = true;
+ } else if (strcmp(argv[i], "--cnn-version") == 0 && i + 1 < argc) {
+ args->cnn_version = atoi(argv[++i]);
+ if (args->cnn_version < 1 || args->cnn_version > 2) {
+ fprintf(stderr, "Error: cnn-version must be 1 or 2\n");
+ return false;
+ }
} else if (strcmp(argv[i], "--help") == 0) {
return false;
} else {
@@ -103,6 +111,7 @@ static void print_usage(const char* prog) {
fprintf(stderr, " --layers N Number of CNN layers (1-10, default: 3)\n");
fprintf(stderr, " --save-intermediates DIR Save intermediate layers to directory\n");
fprintf(stderr, " --debug-hex Print first 8 pixels as hex (debug)\n");
+ fprintf(stderr, " --cnn-version N CNN version: 1 (default) or 2\n");
fprintf(stderr, " --help Show this help\n");
}
@@ -257,6 +266,650 @@ static bool save_ppm(const char* path, const std::vector<uint8_t>& pixels,
return true;
}
+// CNN v2 structures (matching CNNv2Effect)
+struct CNNv2LayerInfo {
+ uint32_t kernel_size;
+ uint32_t in_channels;
+ uint32_t out_channels;
+ uint32_t weight_offset;
+ uint32_t weight_count;
+};
+
+struct CNNv2LayerParams {
+ uint32_t kernel_size;
+ uint32_t in_channels;
+ uint32_t out_channels;
+ uint32_t weight_offset;
+ uint32_t is_output_layer;
+ float blend_amount;
+};
+
+struct CNNv2StaticFeatureParams {
+ uint32_t mip_level;
+ uint32_t padding[3];
+};
+
+// Convert RGBA32Uint (packed f16) texture to BGRA8Unorm
+static std::vector<uint8_t> readback_rgba32uint_to_bgra8(
+ WGPUDevice device, WGPUQueue queue, WGPUTexture texture,
+ int width, int height) {
+ // Create staging buffer
+ const uint32_t bytes_per_row = width * 16; // 4×u32 per pixel
+ const uint32_t padded_bytes_per_row = (bytes_per_row + 255) & ~255;
+ const size_t buffer_size = padded_bytes_per_row * height;
+
+ WGPUBufferDescriptor buffer_desc = {};
+ buffer_desc.size = buffer_size;
+ buffer_desc.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead;
+ buffer_desc.mappedAtCreation = false;
+
+ WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &buffer_desc);
+
+ // Copy texture to buffer
+ WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(device, nullptr);
+
+ WGPUTexelCopyTextureInfo src = {};
+ src.texture = texture;
+ src.mipLevel = 0;
+
+ WGPUTexelCopyBufferInfo dst = {};
+ dst.buffer = staging;
+ dst.layout.bytesPerRow = padded_bytes_per_row;
+ dst.layout.rowsPerImage = height;
+
+ WGPUExtent3D copy_size = {
+ static_cast<uint32_t>(width),
+ static_cast<uint32_t>(height),
+ 1};
+
+ wgpuCommandEncoderCopyTextureToBuffer(encoder, &src, &dst, &copy_size);
+
+ WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr);
+ wgpuQueueSubmit(queue, 1, &commands);
+ wgpuCommandBufferRelease(commands);
+ wgpuCommandEncoderRelease(encoder);
+
+ // Wait for copy to complete
+ wgpuDevicePoll(device, true, nullptr);
+
+ // Map and read buffer
+ struct MapState {
+ bool done = false;
+ };
+ MapState map_state;
+
+ auto map_cb = [](WGPUMapAsyncStatus status, WGPUStringView message,
+ void* userdata1, void* userdata2) {
+ (void)message;
+ (void)userdata2;
+ MapState* state = (MapState*)userdata1;
+ state->done = (status == WGPUMapAsyncStatus_Success);
+ };
+
+ WGPUBufferMapCallbackInfo map_info = {};
+ map_info.mode = WGPUCallbackMode_AllowProcessEvents;
+ map_info.callback = map_cb;
+ map_info.userdata1 = &map_state;
+
+ wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buffer_size, map_info);
+
+ // Wait for mapping to complete
+ for (int i = 0; i < 100 && !map_state.done; ++i) {
+ wgpuDevicePoll(device, true, nullptr);
+ }
+
+ if (!map_state.done) {
+ fprintf(stderr, "Error: Buffer mapping timed out\n");
+ wgpuBufferRelease(staging);
+ return std::vector<uint8_t>();
+ }
+
+ const uint32_t* mapped =
+ (const uint32_t*)wgpuBufferGetConstMappedRange(staging, 0, buffer_size);
+
+ std::vector<uint8_t> result(width * height * 4);
+
+ // Unpack f16 to u8 (BGRA)
+ for (int y = 0; y < height; ++y) {
+ const uint32_t* row =
+ (const uint32_t*)((const uint8_t*)mapped + y * padded_bytes_per_row);
+ for (int x = 0; x < width; ++x) {
+ // Read 4×u32 (8×f16)
+ uint32_t data[4];
+ data[0] = row[x * 4 + 0];
+ data[1] = row[x * 4 + 1];
+ data[2] = row[x * 4 + 2];
+ data[3] = row[x * 4 + 3];
+
+ // Extract RGBA channels (first 4 f16 values)
+ uint16_t r16 = data[0] & 0xFFFF;
+ uint16_t g16 = (data[0] >> 16) & 0xFFFF;
+ uint16_t b16 = data[1] & 0xFFFF;
+ uint16_t a16 = (data[1] >> 16) & 0xFFFF;
+
+ // Convert f16 to f32 (simple decode)
+ auto f16_to_f32 = [](uint16_t h) -> float {
+ uint32_t sign = (h >> 15) & 1;
+ uint32_t exp = (h >> 10) & 0x1F;
+ uint32_t frac = h & 0x3FF;
+
+ if (exp == 0) {
+ if (frac == 0) return sign ? -0.0f : 0.0f;
+ // Denormal
+ float val = frac / 1024.0f / 16384.0f;
+ return sign ? -val : val;
+ }
+ if (exp == 31) {
+ return frac ? NAN : (sign ? -INFINITY : INFINITY);
+ }
+
+ int32_t e = exp - 15;
+ float val = (1.0f + frac / 1024.0f) * powf(2.0f, e);
+ return sign ? -val : val;
+ };
+
+ float r = f16_to_f32(r16);
+ float g = f16_to_f32(g16);
+ float b = f16_to_f32(b16);
+ float a = f16_to_f32(a16);
+
+ // Clamp to [0,1] and convert to u8
+ auto clamp_u8 = [](float v) -> uint8_t {
+ if (v <= 0.0f) return 0;
+ if (v >= 1.0f) return 255;
+ return static_cast<uint8_t>(v * 255.0f + 0.5f);
+ };
+
+ result[(y * width + x) * 4 + 0] = clamp_u8(b);
+ result[(y * width + x) * 4 + 1] = clamp_u8(g);
+ result[(y * width + x) * 4 + 2] = clamp_u8(r);
+ result[(y * width + x) * 4 + 3] = clamp_u8(a);
+ }
+ }
+
+ wgpuBufferUnmap(staging);
+ wgpuBufferRelease(staging);
+
+ return result;
+}
+
+// Process image with CNN v2
+static bool process_cnn_v2(WGPUDevice device, WGPUQueue queue,
+ WGPUInstance instance, WGPUTexture input_texture,
+ int width, int height, const Args& args) {
+ printf("Using CNN v2 (storage buffer architecture)\n");
+
+ // Load weights
+ size_t weights_size = 0;
+ const uint8_t* weights_data =
+ (const uint8_t*)GetAsset(AssetId::ASSET_WEIGHTS_CNN_V2, &weights_size);
+
+ if (!weights_data || weights_size < 20) {
+ fprintf(stderr, "Error: CNN v2 weights not available\n");
+ return false;
+ }
+
+ // Parse header
+ const uint32_t* header = (const uint32_t*)weights_data;
+ uint32_t magic = header[0];
+ uint32_t version = header[1];
+ uint32_t num_layers = header[2];
+ uint32_t total_weights = header[3];
+
+ if (magic != 0x324e4e43) { // 'CNN2'
+ fprintf(stderr, "Error: Invalid CNN v2 weights magic\n");
+ return false;
+ }
+
+ uint32_t mip_level = 0;
+ if (version == 2) {
+ mip_level = header[4];
+ }
+
+ printf("Loaded CNN v2 weights: %u layers, %u weights, version %u\n",
+ num_layers, total_weights, version);
+
+ // Parse layer info
+ const uint32_t header_u32_count = (version == 1) ? 4 : 5;
+ const uint32_t* layer_data = header + header_u32_count;
+ std::vector<CNNv2LayerInfo> layer_info;
+
+ for (uint32_t i = 0; i < num_layers; ++i) {
+ CNNv2LayerInfo info;
+ info.kernel_size = layer_data[i * 5 + 0];
+ info.in_channels = layer_data[i * 5 + 1];
+ info.out_channels = layer_data[i * 5 + 2];
+ info.weight_offset = layer_data[i * 5 + 3];
+ info.weight_count = layer_data[i * 5 + 4];
+ layer_info.push_back(info);
+
+ printf(" Layer %u: %ux%u conv, %u→%u channels, %u weights\n", i,
+ info.kernel_size, info.kernel_size, info.in_channels,
+ info.out_channels, info.weight_count);
+ }
+
+ // Create weights storage buffer
+ WGPUBufferDescriptor weights_buffer_desc = {};
+ weights_buffer_desc.size = weights_size;
+ weights_buffer_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst;
+ weights_buffer_desc.mappedAtCreation = false;
+
+ WGPUBuffer weights_buffer =
+ wgpuDeviceCreateBuffer(device, &weights_buffer_desc);
+ wgpuQueueWriteBuffer(queue, weights_buffer, 0, weights_data, weights_size);
+
+ // Create input view
+ const WGPUTextureViewDescriptor view_desc = {
+ .format = WGPUTextureFormat_BGRA8Unorm,
+ .dimension = WGPUTextureViewDimension_2D,
+ .baseMipLevel = 0,
+ .mipLevelCount = 1,
+ .baseArrayLayer = 0,
+ .arrayLayerCount = 1,
+ };
+ WGPUTextureView input_view = wgpuTextureCreateView(input_texture, &view_desc);
+
+ // Create static features texture (RGBA32Uint)
+ const WGPUTextureDescriptor static_desc = {
+ .usage = WGPUTextureUsage_StorageBinding | WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopySrc,
+ .dimension = WGPUTextureDimension_2D,
+ .size = {static_cast<uint32_t>(width), static_cast<uint32_t>(height), 1},
+ .format = WGPUTextureFormat_RGBA32Uint,
+ .mipLevelCount = 1,
+ .sampleCount = 1,
+ };
+ WGPUTexture static_features_tex =
+ wgpuDeviceCreateTexture(device, &static_desc);
+ WGPUTextureView static_features_view =
+ wgpuTextureCreateView(static_features_tex, nullptr);
+
+ // Create layer textures (ping-pong)
+ WGPUTexture layer_textures[2] = {
+ wgpuDeviceCreateTexture(device, &static_desc),
+ wgpuDeviceCreateTexture(device, &static_desc),
+ };
+ WGPUTextureView layer_views[2] = {
+ wgpuTextureCreateView(layer_textures[0], nullptr),
+ wgpuTextureCreateView(layer_textures[1], nullptr),
+ };
+
+ // Load shaders
+ const char* static_shader =
+ SafeGetAsset(AssetId::ASSET_SHADER_CNN_V2_STATIC);
+ const char* layer_shader =
+ SafeGetAsset(AssetId::ASSET_SHADER_CNN_V2_COMPUTE);
+
+ if (!static_shader[0] || !layer_shader[0]) {
+ fprintf(stderr, "Error: CNN v2 shaders not available\n");
+ wgpuTextureViewRelease(static_features_view);
+ wgpuTextureRelease(static_features_tex);
+ wgpuTextureViewRelease(layer_views[0]);
+ wgpuTextureViewRelease(layer_views[1]);
+ wgpuTextureRelease(layer_textures[0]);
+ wgpuTextureRelease(layer_textures[1]);
+ wgpuBufferRelease(weights_buffer);
+ wgpuTextureViewRelease(input_view);
+ return false;
+ }
+
+ // Create static feature params buffer
+ WGPUBufferDescriptor static_params_desc = {};
+ static_params_desc.size = sizeof(CNNv2StaticFeatureParams);
+ static_params_desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst;
+ static_params_desc.mappedAtCreation = false;
+
+ WGPUBuffer static_params_buffer =
+ wgpuDeviceCreateBuffer(device, &static_params_desc);
+
+ CNNv2StaticFeatureParams static_params;
+ static_params.mip_level = mip_level;
+ static_params.padding[0] = 0;
+ static_params.padding[1] = 0;
+ static_params.padding[2] = 0;
+ wgpuQueueWriteBuffer(queue, static_params_buffer, 0, &static_params,
+ sizeof(static_params));
+
+ // Create static features compute pipeline
+ WGPUShaderSourceWGSL static_wgsl = {};
+ static_wgsl.chain.sType = WGPUSType_ShaderSourceWGSL;
+ static_wgsl.code = str_view(static_shader);
+
+ WGPUShaderModuleDescriptor static_module_desc = {};
+ static_module_desc.nextInChain = &static_wgsl.chain;
+
+ WGPUShaderModule static_module =
+ wgpuDeviceCreateShaderModule(device, &static_module_desc);
+
+ // Bind group layout: 0=input, 1=input_mip1, 2=input_mip2, 3=depth, 4=output,
+ // 5=params
+ WGPUBindGroupLayoutEntry static_bgl_entries[6] = {};
+ static_bgl_entries[0].binding = 0;
+ static_bgl_entries[0].visibility = WGPUShaderStage_Compute;
+ static_bgl_entries[0].texture.sampleType = WGPUTextureSampleType_Float;
+ static_bgl_entries[0].texture.viewDimension = WGPUTextureViewDimension_2D;
+
+ static_bgl_entries[1].binding = 1;
+ static_bgl_entries[1].visibility = WGPUShaderStage_Compute;
+ static_bgl_entries[1].texture.sampleType = WGPUTextureSampleType_Float;
+ static_bgl_entries[1].texture.viewDimension = WGPUTextureViewDimension_2D;
+
+ static_bgl_entries[2].binding = 2;
+ static_bgl_entries[2].visibility = WGPUShaderStage_Compute;
+ static_bgl_entries[2].texture.sampleType = WGPUTextureSampleType_Float;
+ static_bgl_entries[2].texture.viewDimension = WGPUTextureViewDimension_2D;
+
+ static_bgl_entries[3].binding = 3;
+ static_bgl_entries[3].visibility = WGPUShaderStage_Compute;
+ static_bgl_entries[3].texture.sampleType = WGPUTextureSampleType_Float;
+ static_bgl_entries[3].texture.viewDimension = WGPUTextureViewDimension_2D;
+
+ static_bgl_entries[4].binding = 4;
+ static_bgl_entries[4].visibility = WGPUShaderStage_Compute;
+ static_bgl_entries[4].storageTexture.access =
+ WGPUStorageTextureAccess_WriteOnly;
+ static_bgl_entries[4].storageTexture.format = WGPUTextureFormat_RGBA32Uint;
+ static_bgl_entries[4].storageTexture.viewDimension =
+ WGPUTextureViewDimension_2D;
+
+ static_bgl_entries[5].binding = 5;
+ static_bgl_entries[5].visibility = WGPUShaderStage_Compute;
+ static_bgl_entries[5].buffer.type = WGPUBufferBindingType_Uniform;
+ static_bgl_entries[5].buffer.minBindingSize =
+ sizeof(CNNv2StaticFeatureParams);
+
+ WGPUBindGroupLayoutDescriptor static_bgl_desc = {};
+ static_bgl_desc.entryCount = 6;
+ static_bgl_desc.entries = static_bgl_entries;
+
+ WGPUBindGroupLayout static_bgl =
+ wgpuDeviceCreateBindGroupLayout(device, &static_bgl_desc);
+
+ WGPUPipelineLayoutDescriptor static_pl_desc = {};
+ static_pl_desc.bindGroupLayoutCount = 1;
+ static_pl_desc.bindGroupLayouts = &static_bgl;
+
+ WGPUPipelineLayout static_pl =
+ wgpuDeviceCreatePipelineLayout(device, &static_pl_desc);
+
+ WGPUComputePipelineDescriptor static_pipeline_desc = {};
+ static_pipeline_desc.compute.module = static_module;
+ static_pipeline_desc.compute.entryPoint = str_view("main");
+ static_pipeline_desc.layout = static_pl;
+
+ WGPUComputePipeline static_pipeline =
+ wgpuDeviceCreateComputePipeline(device, &static_pipeline_desc);
+
+ wgpuShaderModuleRelease(static_module);
+ wgpuPipelineLayoutRelease(static_pl);
+
+ // Create static bind group (use input as all mips for simplicity)
+ WGPUBindGroupEntry static_bg_entries[6] = {};
+ static_bg_entries[0].binding = 0;
+ static_bg_entries[0].textureView = input_view;
+ static_bg_entries[1].binding = 1;
+ static_bg_entries[1].textureView = input_view;
+ static_bg_entries[2].binding = 2;
+ static_bg_entries[2].textureView = input_view;
+ static_bg_entries[3].binding = 3;
+ static_bg_entries[3].textureView = input_view; // Depth (use input)
+ static_bg_entries[4].binding = 4;
+ static_bg_entries[4].textureView = static_features_view;
+ static_bg_entries[5].binding = 5;
+ static_bg_entries[5].buffer = static_params_buffer;
+ static_bg_entries[5].size = sizeof(CNNv2StaticFeatureParams);
+
+ WGPUBindGroupDescriptor static_bg_desc = {};
+ static_bg_desc.layout = static_bgl;
+ static_bg_desc.entryCount = 6;
+ static_bg_desc.entries = static_bg_entries;
+
+ WGPUBindGroup static_bg = wgpuDeviceCreateBindGroup(device, &static_bg_desc);
+
+ wgpuBindGroupLayoutRelease(static_bgl);
+
+ // Create layer compute pipeline
+ WGPUShaderSourceWGSL layer_wgsl = {};
+ layer_wgsl.chain.sType = WGPUSType_ShaderSourceWGSL;
+ layer_wgsl.code = str_view(layer_shader);
+
+ WGPUShaderModuleDescriptor layer_module_desc = {};
+ layer_module_desc.nextInChain = &layer_wgsl.chain;
+
+ WGPUShaderModule layer_module =
+ wgpuDeviceCreateShaderModule(device, &layer_module_desc);
+
+ // Layer bind group layout:
+ // 0=static_features, 1=layer_input, 2=output, 3=weights, 4=params,
+ // 5=original
+ WGPUBindGroupLayoutEntry layer_bgl_entries[6] = {};
+ layer_bgl_entries[0].binding = 0;
+ layer_bgl_entries[0].visibility = WGPUShaderStage_Compute;
+ layer_bgl_entries[0].texture.sampleType = WGPUTextureSampleType_Uint;
+ layer_bgl_entries[0].texture.viewDimension = WGPUTextureViewDimension_2D;
+
+ layer_bgl_entries[1].binding = 1;
+ layer_bgl_entries[1].visibility = WGPUShaderStage_Compute;
+ layer_bgl_entries[1].texture.sampleType = WGPUTextureSampleType_Uint;
+ layer_bgl_entries[1].texture.viewDimension = WGPUTextureViewDimension_2D;
+
+ layer_bgl_entries[2].binding = 2;
+ layer_bgl_entries[2].visibility = WGPUShaderStage_Compute;
+ layer_bgl_entries[2].storageTexture.access =
+ WGPUStorageTextureAccess_WriteOnly;
+ layer_bgl_entries[2].storageTexture.format = WGPUTextureFormat_RGBA32Uint;
+ layer_bgl_entries[2].storageTexture.viewDimension =
+ WGPUTextureViewDimension_2D;
+
+ layer_bgl_entries[3].binding = 3;
+ layer_bgl_entries[3].visibility = WGPUShaderStage_Compute;
+ layer_bgl_entries[3].buffer.type = WGPUBufferBindingType_ReadOnlyStorage;
+
+ layer_bgl_entries[4].binding = 4;
+ layer_bgl_entries[4].visibility = WGPUShaderStage_Compute;
+ layer_bgl_entries[4].buffer.type = WGPUBufferBindingType_Uniform;
+ layer_bgl_entries[4].buffer.minBindingSize = sizeof(CNNv2LayerParams);
+
+ layer_bgl_entries[5].binding = 5;
+ layer_bgl_entries[5].visibility = WGPUShaderStage_Compute;
+ layer_bgl_entries[5].texture.sampleType = WGPUTextureSampleType_Float;
+ layer_bgl_entries[5].texture.viewDimension = WGPUTextureViewDimension_2D;
+
+ WGPUBindGroupLayoutDescriptor layer_bgl_desc = {};
+ layer_bgl_desc.entryCount = 6;
+ layer_bgl_desc.entries = layer_bgl_entries;
+
+ WGPUBindGroupLayout layer_bgl =
+ wgpuDeviceCreateBindGroupLayout(device, &layer_bgl_desc);
+
+ WGPUPipelineLayoutDescriptor layer_pl_desc = {};
+ layer_pl_desc.bindGroupLayoutCount = 1;
+ layer_pl_desc.bindGroupLayouts = &layer_bgl;
+
+ WGPUPipelineLayout layer_pl =
+ wgpuDeviceCreatePipelineLayout(device, &layer_pl_desc);
+
+ WGPUComputePipelineDescriptor layer_pipeline_desc = {};
+ layer_pipeline_desc.compute.module = layer_module;
+ layer_pipeline_desc.compute.entryPoint = str_view("main");
+ layer_pipeline_desc.layout = layer_pl;
+
+ WGPUComputePipeline layer_pipeline =
+ wgpuDeviceCreateComputePipeline(device, &layer_pipeline_desc);
+
+ wgpuShaderModuleRelease(layer_module);
+ wgpuPipelineLayoutRelease(layer_pl);
+
+ // Create layer params buffers
+ std::vector<WGPUBuffer> layer_params_buffers;
+ for (size_t i = 0; i < layer_info.size(); ++i) {
+ WGPUBufferDescriptor params_desc = {};
+ params_desc.size = sizeof(CNNv2LayerParams);
+ params_desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst;
+ params_desc.mappedAtCreation = false;
+
+ WGPUBuffer buf = wgpuDeviceCreateBuffer(device, &params_desc);
+ layer_params_buffers.push_back(buf);
+ }
+
+ // Execute compute passes
+ WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(device, nullptr);
+
+ // Pass 1: Static features
+ printf("Computing static features...\n");
+ WGPUComputePassEncoder static_pass =
+ wgpuCommandEncoderBeginComputePass(encoder, nullptr);
+ wgpuComputePassEncoderSetPipeline(static_pass, static_pipeline);
+ wgpuComputePassEncoderSetBindGroup(static_pass, 0, static_bg, 0, nullptr);
+
+ uint32_t workgroups_x = (width + 7) / 8;
+ uint32_t workgroups_y = (height + 7) / 8;
+ wgpuComputePassEncoderDispatchWorkgroups(static_pass, workgroups_x,
+ workgroups_y, 1);
+
+ wgpuComputePassEncoderEnd(static_pass);
+ wgpuComputePassEncoderRelease(static_pass);
+
+ // Pass 2-N: CNN layers
+ for (size_t i = 0; i < layer_info.size(); ++i) {
+ const CNNv2LayerInfo& info = layer_info[i];
+
+ printf("Processing layer %zu/%zu (%ux%u, %u→%u channels)...\n", i + 1,
+ layer_info.size(), info.kernel_size, info.kernel_size,
+ info.in_channels, info.out_channels);
+
+ // Update layer params
+ CNNv2LayerParams params;
+ params.kernel_size = info.kernel_size;
+ params.in_channels = info.in_channels;
+ params.out_channels = info.out_channels;
+ params.weight_offset = info.weight_offset;
+ params.is_output_layer = (i == layer_info.size() - 1) ? 1 : 0;
+ params.blend_amount = args.blend;
+
+ wgpuQueueWriteBuffer(queue, layer_params_buffers[i], 0, &params,
+ sizeof(params));
+
+ // Create bind group for this layer
+ WGPUBindGroupEntry layer_bg_entries[6] = {};
+ layer_bg_entries[0].binding = 0;
+ layer_bg_entries[0].textureView = static_features_view;
+
+ layer_bg_entries[1].binding = 1;
+ layer_bg_entries[1].textureView =
+ (i == 0) ? static_features_view : layer_views[i % 2];
+
+ layer_bg_entries[2].binding = 2;
+ layer_bg_entries[2].textureView = layer_views[(i + 1) % 2];
+
+ layer_bg_entries[3].binding = 3;
+ layer_bg_entries[3].buffer = weights_buffer;
+ layer_bg_entries[3].size = weights_size;
+
+ layer_bg_entries[4].binding = 4;
+ layer_bg_entries[4].buffer = layer_params_buffers[i];
+ layer_bg_entries[4].size = sizeof(CNNv2LayerParams);
+
+ layer_bg_entries[5].binding = 5;
+ layer_bg_entries[5].textureView = input_view;
+
+ WGPUBindGroupDescriptor layer_bg_desc = {};
+ layer_bg_desc.layout = layer_bgl;
+ layer_bg_desc.entryCount = 6;
+ layer_bg_desc.entries = layer_bg_entries;
+
+ WGPUBindGroup layer_bg =
+ wgpuDeviceCreateBindGroup(device, &layer_bg_desc);
+
+ WGPUComputePassEncoder layer_pass =
+ wgpuCommandEncoderBeginComputePass(encoder, nullptr);
+ wgpuComputePassEncoderSetPipeline(layer_pass, layer_pipeline);
+ wgpuComputePassEncoderSetBindGroup(layer_pass, 0, layer_bg, 0, nullptr);
+
+ wgpuComputePassEncoderDispatchWorkgroups(layer_pass, workgroups_x,
+ workgroups_y, 1);
+
+ wgpuComputePassEncoderEnd(layer_pass);
+ wgpuComputePassEncoderRelease(layer_pass);
+ wgpuBindGroupRelease(layer_bg);
+ }
+
+ WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr);
+ wgpuQueueSubmit(queue, 1, &commands);
+ wgpuCommandBufferRelease(commands);
+ wgpuCommandEncoderRelease(encoder);
+
+ wgpuDevicePoll(device, true, nullptr);
+
+ // Readback final result (from last layer's output texture)
+ printf("Reading pixels from GPU...\n");
+ size_t final_layer_idx = (layer_info.size()) % 2;
+ std::vector<uint8_t> pixels = readback_rgba32uint_to_bgra8(
+ device, queue, layer_textures[final_layer_idx], width, height);
+
+ if (pixels.empty()) {
+ fprintf(stderr, "Error: GPU readback failed\n");
+ for (auto buf : layer_params_buffers) wgpuBufferRelease(buf);
+ wgpuComputePipelineRelease(layer_pipeline);
+ wgpuBindGroupLayoutRelease(layer_bgl);
+ wgpuBindGroupRelease(static_bg);
+ wgpuComputePipelineRelease(static_pipeline);
+ wgpuBufferRelease(static_params_buffer);
+ wgpuTextureViewRelease(static_features_view);
+ wgpuTextureRelease(static_features_tex);
+ wgpuTextureViewRelease(layer_views[0]);
+ wgpuTextureViewRelease(layer_views[1]);
+ wgpuTextureRelease(layer_textures[0]);
+ wgpuTextureRelease(layer_textures[1]);
+ wgpuBufferRelease(weights_buffer);
+ wgpuTextureViewRelease(input_view);
+ return false;
+ }
+
+ // Debug hex dump
+ if (args.debug_hex) {
+ printf("First 8 pixels (BGRA hex):\n");
+ for (int i = 0; i < 8 && i < width * height; ++i) {
+ const uint8_t b = pixels[i * 4 + 0];
+ const uint8_t g = pixels[i * 4 + 1];
+ const uint8_t r = pixels[i * 4 + 2];
+ const uint8_t a = pixels[i * 4 + 3];
+ printf(" [%d] 0x%02X%02X%02X%02X (RGBA)\n", i, r, g, b, a);
+ }
+ }
+
+ // Save output
+ bool success;
+ if (args.output_png) {
+ printf("Saving PNG to '%s'...\n", args.output_path);
+ success = save_png(args.output_path, pixels, width, height);
+ } else {
+ printf("Saving PPM to '%s'...\n", args.output_path);
+ success = save_ppm(args.output_path, pixels, width, height);
+ }
+
+ if (success) {
+ printf("Done! Output saved to '%s'\n", args.output_path);
+ }
+
+ // Cleanup
+ for (auto buf : layer_params_buffers) wgpuBufferRelease(buf);
+ wgpuComputePipelineRelease(layer_pipeline);
+ wgpuBindGroupLayoutRelease(layer_bgl);
+ wgpuBindGroupRelease(static_bg);
+ wgpuComputePipelineRelease(static_pipeline);
+ wgpuBufferRelease(static_params_buffer);
+ wgpuTextureViewRelease(static_features_view);
+ wgpuTextureRelease(static_features_tex);
+ wgpuTextureViewRelease(layer_views[0]);
+ wgpuTextureViewRelease(layer_views[1]);
+ wgpuTextureRelease(layer_textures[0]);
+ wgpuTextureRelease(layer_textures[1]);
+ wgpuBufferRelease(weights_buffer);
+ wgpuTextureViewRelease(input_view);
+
+ return success;
+}
+
int main(int argc, char** argv) {
// Parse arguments
Args args;
@@ -292,6 +945,19 @@ int main(int argc, char** argv) {
printf("Loaded %dx%d image from '%s'\n", width, height, args.input_path);
+ // Branch based on CNN version
+ if (args.cnn_version == 2) {
+ bool success = process_cnn_v2(device, queue, instance, input_texture,
+ width, height, args);
+ wgpuTextureRelease(input_texture);
+ SamplerCache::Get().clear();
+ fixture.shutdown();
+ return success ? 0 : 1;
+ }
+
+ // CNN v1 processing below
+ printf("Using CNN v1 (render pipeline architecture)\n");
+
// Create input texture view
const WGPUTextureViewDescriptor view_desc = {
.format = WGPUTextureFormat_BGRA8Unorm,