-rw-r--r--  doc/CNN_TEST_TOOL.md |  11
-rw-r--r--  doc/HOWTO.md         |  39
-rw-r--r--  tools/cnn_test.cc    | 108
3 files changed, 122 insertions, 36 deletions
diff --git a/doc/CNN_TEST_TOOL.md b/doc/CNN_TEST_TOOL.md
index 7a970fe..09c55d4 100644
--- a/doc/CNN_TEST_TOOL.md
+++ b/doc/CNN_TEST_TOOL.md
@@ -178,11 +178,12 @@ assert mse < 10.0, f'MSE too high: {mse}'
## Known Issues
-**BUG: Black output (uninitialized input texture)**
-- Tool produces all-black output (MSE 64860 vs ground truth)
-- Root cause: First intermediate texture not initialized with input image
-- Multi-layer processing starts with uninitialized data
-- Fix required: Copy input_texture → intermediate_textures[0] before layer loop
+**BUG: Black output (unknown cause)**
+- Tool produces all-black output despite the correct architecture
+- Ping-pong logic, RGBA16Float intermediates, and separate intermediate/final pipelines are in place
+- The shader compiles and GPU commands execute without errors
+- Possible causes: shader execution issue, synchronization, or a binding mismatch
+- Status: under investigation (an all-black readback probe is sketched below)
---
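One way to localize the all-black output is to check, right after a layer's queue submit, whether anything non-zero was actually written. The helper below is a debugging sketch, not part of cnn_test.cc; it assumes BGRA8Unorm data (4 bytes per pixel), as the tool's read_texture_pixels() helper returns for the final output, so probing the RGBA16Float intermediates would additionally need a copy path that accounts for 8 bytes per pixel.

```cpp
// Debugging sketch, not part of cnn_test.cc: report whether a BGRA8Unorm
// readback contains any non-zero color data.
#include <cstddef>
#include <cstdint>
#include <vector>

static bool pixels_all_black(const std::vector<uint8_t>& pixels) {
  for (std::size_t i = 0; i + 3 < pixels.size(); i += 4) {
    // BGRA8Unorm layout: bytes 0..2 are the color channels, byte 3 is alpha.
    if (pixels[i] != 0 || pixels[i + 1] != 0 || pixels[i + 2] != 0) return false;
  }
  return true;
}

// Possible use after the layer loop (the final layer writes final_output_texture):
//   std::vector<uint8_t> probe =
//       read_texture_pixels(instance, device, final_output_texture, width, height);
//   printf("final output %s\n",
//          pixels_all_black(probe) ? "is all black" : "has data");
// Running the same check per layer would need a readback path that understands
// the RGBA16Float intermediates (8 bytes per pixel) rather than BGRA8Unorm.
```

If even the first layer writes nothing, the input binding or the shader itself is the first suspect; if only the final pass comes back black, the final pipeline or its bind group is the more likely culprit.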
diff --git a/doc/HOWTO.md b/doc/HOWTO.md
index ba550bb..c0e9363 100644
--- a/doc/HOWTO.md
+++ b/doc/HOWTO.md
@@ -162,6 +162,45 @@ See `doc/ASSET_SYSTEM.md` and `doc/WORKSPACE_SYSTEM.md`.
---
+## CNN Testing
+
+### Offline Shader Validation
+```bash
+# Test trained CNN on PNG input
+./build/cnn_test input.png output.png
+
+# Adjust blend amount (0.0 = original, 1.0 = full CNN)
+./build/cnn_test input.png output.png --blend 0.5
+
+# PPM output format
+./build/cnn_test input.png output.ppm --format ppm
+```
+
+### Ground Truth Comparison
+```bash
+# Generate Python ground truth
+./training/train_cnn.py --infer input.png \
+ --export-only checkpoints/checkpoint_epoch_1000.pth \
+ --output ground_truth.png
+
+# Run tool
+./build/cnn_test input.png tool_output.png
+
+# Compare (Python required)
+python3 -c "
+import numpy as np
+from PIL import Image
+gt = np.array(Image.open('ground_truth.png').convert('RGB'))
+out = np.array(Image.open('tool_output.png').convert('RGB'))
+mse = np.mean((gt.astype(float) - out.astype(float)) ** 2)
+print(f'MSE: {mse:.4f} (target: < 10.0)')
+"
+```
+
+See `doc/CNN_TEST_TOOL.md` for full documentation.
+
+---
+
## Additional Documentation
- **Build System:** `doc/BUILD.md` - Multi-platform, size optimization
diff --git a/tools/cnn_test.cc b/tools/cnn_test.cc
index 59f5d36..bb4a824 100644
--- a/tools/cnn_test.cc
+++ b/tools/cnn_test.cc
@@ -145,8 +145,10 @@ static WGPUTexture load_texture(WGPUDevice device, WGPUQueue queue,
}
// Create CNN render pipeline (5 bindings)
+// is_final_layer selects the output format: RGBA16Float or BGRA8Unorm
static WGPURenderPipeline create_cnn_pipeline(WGPUDevice device,
- WGPUTextureFormat format) {
+ WGPUTextureFormat format,
+ bool is_final_layer) {
const char* shader_code = SafeGetAsset(AssetId::ASSET_SHADER_CNN_LAYER);
WGPUBindGroupLayout bgl =
@@ -158,10 +160,14 @@ static WGPURenderPipeline create_cnn_pipeline(WGPUDevice device,
.texture(4, WGPUShaderStage_Fragment) // Original input
.build(device);
+ // Use appropriate format: RGBA16Float for intermediate, BGRA8Unorm for final
+ WGPUTextureFormat output_format =
+ is_final_layer ? WGPUTextureFormat_BGRA8Unorm : WGPUTextureFormat_RGBA16Float;
+
WGPURenderPipeline pipeline = RenderPipelineBuilder(device)
.shader(shader_code) // compose=true by default
.bind_group_layout(bgl)
- .format(format)
+ .format(output_format)
.build();
wgpuBindGroupLayoutRelease(bgl);
@@ -274,19 +280,24 @@ int main(int argc, char** argv) {
WGPUTextureView input_view = wgpuTextureCreateView(input_texture, &view_desc);
WGPUTextureView original_view = input_view; // Keep reference to original
- // Create CNN pipeline
- WGPURenderPipeline pipeline =
- create_cnn_pipeline(device, WGPUTextureFormat_BGRA8Unorm);
- if (!pipeline) {
- fprintf(stderr, "Error: failed to create CNN pipeline\n");
+ // Create CNN pipelines (different formats for intermediate vs final)
+ WGPURenderPipeline pipeline_intermediate =
+ create_cnn_pipeline(device, WGPUTextureFormat_RGBA16Float, false);
+ WGPURenderPipeline pipeline_final =
+ create_cnn_pipeline(device, WGPUTextureFormat_BGRA8Unorm, true);
+
+ if (!pipeline_intermediate || !pipeline_final) {
+ fprintf(stderr, "Error: failed to create CNN pipelines\n");
+ if (pipeline_intermediate) wgpuRenderPipelineRelease(pipeline_intermediate);
+ if (pipeline_final) wgpuRenderPipelineRelease(pipeline_final);
wgpuTextureViewRelease(input_view);
wgpuTextureRelease(input_texture);
fixture.shutdown();
return 1;
}
- // Get bind group layout from pipeline
- WGPUBindGroupLayout bgl = wgpuRenderPipelineGetBindGroupLayout(pipeline, 0);
+ // Get bind group layout from intermediate pipeline (same for both)
+ WGPUBindGroupLayout bgl = wgpuRenderPipelineGetBindGroupLayout(pipeline_intermediate, 0);
// Create uniform buffers
const WGPUBufferDescriptor common_uniform_desc = {
@@ -304,12 +315,13 @@ int main(int argc, char** argv) {
wgpuDeviceCreateBuffer(device, &layer_params_desc);
// Create intermediate textures for ping-pong (2 textures)
+ // Use RGBA16Float to preserve [-1,1] range from tanh activation
const WGPUTextureDescriptor intermediate_desc = {
.usage = WGPUTextureUsage_TextureBinding |
WGPUTextureUsage_RenderAttachment | WGPUTextureUsage_CopySrc,
.dimension = WGPUTextureDimension_2D,
.size = {static_cast<uint32_t>(width), static_cast<uint32_t>(height), 1},
- .format = WGPUTextureFormat_BGRA8Unorm,
+ .format = WGPUTextureFormat_RGBA16Float,
.mipLevelCount = 1,
.sampleCount = 1,
};
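For context on the format change above: a unorm attachment clamps written values to [0, 1] before quantizing, so the negative half of a tanh activation collapses to zero, which is exactly what the RGBA16Float intermediates avoid. A minimal standalone illustration of that clamp-and-quantize arithmetic (generic unorm behavior, not code from this tool):

```cpp
// Why BGRA8Unorm intermediates lose data: unorm storage clamps to [0, 1],
// so activations in [-1, 0) all become 0, while a float16 channel keeps them.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

static uint8_t write_to_unorm8(float v) {
  // What a write to an 8-bit unorm render target effectively does per channel.
  const float clamped = std::clamp(v, 0.0f, 1.0f);
  return static_cast<uint8_t>(std::lround(clamped * 255.0f));
}

int main() {
  const float activations[] = {-0.9f, -0.25f, 0.0f, 0.4f, 0.9f};  // tanh range
  for (float a : activations) {
    const uint8_t stored = write_to_unorm8(a);
    printf("activation % .2f -> unorm8 %3u (decodes to %.3f)\n",
           a, static_cast<unsigned>(stored), stored / 255.0f);
  }
  return 0;
}
```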
@@ -319,10 +331,39 @@ int main(int argc, char** argv) {
wgpuDeviceCreateTexture(device, &intermediate_desc),
};
+ // Create views for intermediate textures (RGBA16Float)
+ const WGPUTextureViewDescriptor intermediate_view_desc = {
+ .format = WGPUTextureFormat_RGBA16Float,
+ .dimension = WGPUTextureViewDimension_2D,
+ .baseMipLevel = 0,
+ .mipLevelCount = 1,
+ .baseArrayLayer = 0,
+ .arrayLayerCount = 1,
+ };
WGPUTextureView intermediate_views[2] = {
- wgpuTextureCreateView(intermediate_textures[0], &view_desc),
- wgpuTextureCreateView(intermediate_textures[1], &view_desc),
+ wgpuTextureCreateView(intermediate_textures[0], &intermediate_view_desc),
+ wgpuTextureCreateView(intermediate_textures[1], &intermediate_view_desc),
+ };
+
+ // Create final output texture (BGRA8Unorm for readback)
+ const WGPUTextureDescriptor final_desc = {
+ .usage = WGPUTextureUsage_RenderAttachment | WGPUTextureUsage_CopySrc,
+ .dimension = WGPUTextureDimension_2D,
+ .size = {static_cast<uint32_t>(width), static_cast<uint32_t>(height), 1},
+ .format = WGPUTextureFormat_BGRA8Unorm,
+ .mipLevelCount = 1,
+ .sampleCount = 1,
};
+ WGPUTexture final_output_texture = wgpuDeviceCreateTexture(device, &final_desc);
+ const WGPUTextureViewDescriptor final_view_desc = {
+ .format = WGPUTextureFormat_BGRA8Unorm,
+ .dimension = WGPUTextureViewDimension_2D,
+ .baseMipLevel = 0,
+ .mipLevelCount = 1,
+ .baseArrayLayer = 0,
+ .arrayLayerCount = 1,
+ };
+ WGPUTextureView final_output_view = wgpuTextureCreateView(final_output_texture, &final_view_desc);
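The final BGRA8Unorm texture exists only so the result can be copied back to the CPU. read_texture_pixels() is not shown in this diff, but any WebGPU texture-to-buffer copy must pad bytesPerRow to a multiple of 256; the sketch below covers just that arithmetic and assumes nothing else about the helper's implementation.

```cpp
// Row-stride arithmetic a texture -> buffer readback needs: WebGPU requires
// bytesPerRow of a buffer copy to be a multiple of 256.
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t width = 640, height = 480;  // example dimensions
  const uint32_t bytes_per_pixel = 4;        // BGRA8Unorm
  const uint32_t alignment = 256;            // required bytesPerRow alignment
  const uint32_t unpadded = width * bytes_per_pixel;
  const uint32_t padded = (unpadded + alignment - 1) / alignment * alignment;
  printf("unpadded row %u B, padded row %u B, staging buffer %u B\n",
         unpadded, padded, padded * height);
  return 0;
}
```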
// Get sampler
WGPUSampler sampler =
@@ -330,8 +371,7 @@ int main(int argc, char** argv) {
// Multi-layer processing (fixed 3 layers)
const int NUM_LAYERS = 3;
- int src_idx = 0; // Ping-pong index
- WGPUTexture final_texture = nullptr;
+ int dst_idx = 0; // Index of texture to render to
// First layer reads from input, subsequent layers read from previous output
WGPUTextureView current_input = input_view;
@@ -371,11 +411,14 @@ int main(int argc, char** argv) {
.texture(4, original_view)
.build(device, bgl);
- // Render to intermediate texture
- WGPUTextureView output_view = intermediate_views[src_idx];
+ // Render to appropriate output texture with correct pipeline
+ bool is_final = (layer == NUM_LAYERS - 1);
+ WGPUTextureView output_view = is_final ? final_output_view : intermediate_views[dst_idx];
+ WGPURenderPipeline current_pipeline = is_final ? pipeline_final : pipeline_intermediate;
+
WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(device, nullptr);
WGPURenderPassEncoder pass = begin_render_pass(encoder, output_view);
- wgpuRenderPassEncoderSetPipeline(pass, pipeline);
+ wgpuRenderPassEncoderSetPipeline(pass, current_pipeline);
wgpuRenderPassEncoderSetBindGroup(pass, 0, bind_group, 0, nullptr);
wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0); // Fullscreen triangle
wgpuRenderPassEncoderEnd(pass);
@@ -387,27 +430,25 @@ int main(int argc, char** argv) {
wgpuCommandEncoderRelease(encoder);
wgpuBindGroupRelease(bind_group);
- // Update for next layer
- if (layer == NUM_LAYERS - 1) {
- // Last layer: save final texture
- final_texture = intermediate_textures[src_idx];
- } else {
- // Switch to next intermediate for input
- current_input = intermediate_views[src_idx];
+ // Update for next layer: output becomes input
+ if (layer < NUM_LAYERS - 1) {
+ // Use this layer's output as next layer's input
+ current_input = intermediate_views[dst_idx];
+ dst_idx = 1 - dst_idx; // Flip ping-pong for next render
}
-
- src_idx = 1 - src_idx; // Flip ping-pong
}
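As a sanity check on the rewritten loop, here is a standalone sketch (mirroring the logic above, not code from cnn_test.cc) that prints which view each of the three layers reads from and renders to:

```cpp
// Trace of the ping-pong schedule for NUM_LAYERS = 3: layer 0 reads the input
// image, the last layer writes the BGRA8Unorm output, and the layers in
// between alternate across the two RGBA16Float intermediates.
#include <cstdio>
#include <string>

int main() {
  const int NUM_LAYERS = 3;
  int dst_idx = 0;
  std::string current_input = "input_view";
  for (int layer = 0; layer < NUM_LAYERS; ++layer) {
    const bool is_final = (layer == NUM_LAYERS - 1);
    const std::string output =
        is_final ? "final_output_view"
                 : "intermediate_views[" + std::to_string(dst_idx) + "]";
    printf("layer %d: reads %s, renders to %s\n",
           layer, current_input.c_str(), output.c_str());
    if (!is_final) {
      current_input = output;  // this layer's output feeds the next layer
      dst_idx = 1 - dst_idx;   // flip the ping-pong target
    }
  }
  return 0;
}
```

For three layers this prints input_view -> intermediate_views[0], intermediate_views[0] -> intermediate_views[1], and intermediate_views[1] -> final_output_view, matching the intent of the dst_idx flip.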
printf("Reading pixels from GPU...\n");
- // Read final output from GPU
+ // Read final output from GPU (always BGRA8Unorm)
std::vector<uint8_t> pixels =
- read_texture_pixels(instance, device, final_texture, width, height);
+ read_texture_pixels(instance, device, final_output_texture, width, height);
if (pixels.empty()) {
fprintf(stderr, "Error: failed to read pixels from GPU\n");
// Cleanup...
+ wgpuTextureViewRelease(final_output_view);
+ wgpuTextureRelease(final_output_texture);
wgpuTextureViewRelease(intermediate_views[0]);
wgpuTextureViewRelease(intermediate_views[1]);
wgpuTextureRelease(intermediate_textures[0]);
@@ -415,7 +456,8 @@ int main(int argc, char** argv) {
wgpuBufferRelease(layer_params_buffer);
wgpuBufferRelease(common_uniform_buffer);
wgpuBindGroupLayoutRelease(bgl);
- wgpuRenderPipelineRelease(pipeline);
+ wgpuRenderPipelineRelease(pipeline_intermediate);
+ wgpuRenderPipelineRelease(pipeline_final);
wgpuTextureViewRelease(input_view);
wgpuTextureRelease(input_texture);
fixture.shutdown();
@@ -433,6 +475,8 @@ int main(int argc, char** argv) {
}
if (!success) {
+ wgpuTextureViewRelease(final_output_view);
+ wgpuTextureRelease(final_output_texture);
wgpuTextureViewRelease(intermediate_views[0]);
wgpuTextureViewRelease(intermediate_views[1]);
wgpuTextureRelease(intermediate_textures[0]);
@@ -440,7 +484,8 @@ int main(int argc, char** argv) {
wgpuBufferRelease(layer_params_buffer);
wgpuBufferRelease(common_uniform_buffer);
wgpuBindGroupLayoutRelease(bgl);
- wgpuRenderPipelineRelease(pipeline);
+ wgpuRenderPipelineRelease(pipeline_intermediate);
+ wgpuRenderPipelineRelease(pipeline_final);
wgpuTextureViewRelease(input_view);
wgpuTextureRelease(input_texture);
fixture.shutdown();
@@ -457,7 +502,8 @@ int main(int argc, char** argv) {
wgpuBufferRelease(layer_params_buffer);
wgpuBufferRelease(common_uniform_buffer);
wgpuBindGroupLayoutRelease(bgl);
- wgpuRenderPipelineRelease(pipeline);
+ wgpuRenderPipelineRelease(pipeline_intermediate);
+ wgpuRenderPipelineRelease(pipeline_final);
wgpuTextureViewRelease(input_view);
wgpuTextureRelease(input_texture);
fixture.shutdown();