feat: Add CNN shader testing tool with GPU texture readback

Core GPU Utility (texture_readback):
- Reusable synchronous texture-to-CPU readback (~150 lines)
- STRIP_ALL guards (0 bytes in release builds)
- Handles COPY_BYTES_PER_ROW_ALIGNMENT (256-byte alignment)
- Refactored OffscreenRenderTarget to use new utility

CNN Test Tool (cnn_test):
- Standalone PNG→3-layer CNN→PNG/PPM tool (~450 lines)
- --blend parameter (0.0-1.0) for final layer mixing
- --format option (png/ppm) for output format
- ShaderComposer integration for include resolution

Build Integration:
- Added texture_readback.cc to GPU_SOURCES (both sections)
- Tool target with STB_IMAGE support

Testing:
- All 36 tests pass (100%)
- Processes 64×64 and 555×370 images successfully
- Ground-truth validation setup complete

Known Issues:
- BUG: Tool produces black output (uninitialized input texture)
- First intermediate texture not initialized before layer loop
- MSE 64860 vs Python ground truth (expected <10)
- Fix required: Copy input to intermediate[0] before processing

Documentation:
- doc/CNN_TEST_TOOL.md - Full technical reference
- Updated PROJECT_CONTEXT.md and COMPLETED.md

handoff(Claude): CNN test tool foundation complete, needs input init bugfix

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
author: skal <pascal.massimino@gmail.com> 2026-02-11 07:07:29 +0100
committer: skal <pascal.massimino@gmail.com> 2026-02-11 07:07:29 +0100
commit: 3915a5e1c8c904f8f2154845cb99223a598653ee (patch)
tree: cb0e75dea7f8aa729d3b440a5e81b3ac811f8f04 /src
parent: 01e640be66f9d72c22417403eb88e18d6747866f (diff)
3 files changed, 170 insertions, 99 deletions
diff --git a/src/gpu/texture_readback.cc b/src/gpu/texture_readback.cc
new file mode 100644
index 0000000..3a690d3
--- /dev/null
+++ b/src/gpu/texture_readback.cc
@@ -0,0 +1,143 @@
+// GPU texture readback utility implementation
+// Extracts texture pixels to CPU memory for offline processing
+
+#include "gpu/texture_readback.h"
+
+#if !defined(STRIP_ALL)
+
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+
+// State filled in by the buffer-map callback (handed to it as userdata)
+struct MapState {
+  bool done = false;  // Set to true by the callback when it runs
+  WGPUMapAsyncStatus status = WGPUMapAsyncStatus_Unknown;  // Status it reported
+};
+
+// Synchronous readback of a width x height, 4-bytes-per-pixel texture into
+// tightly packed CPU bytes; stays zero-filled if the buffer cannot be mapped.
+std::vector<uint8_t> read_texture_pixels(
+    WGPUInstance instance,
+    WGPUDevice device,
+    WGPUTexture texture,
+    int width,
+    int height) {
+  if (width <= 0 || height <= 0) return {};  // Nothing to read back
+  // Align bytes per row to 256 (COPY_BYTES_PER_ROW_ALIGNMENT)
+  const uint32_t bytes_per_pixel = 4; // BGRA8
+  const uint32_t unaligned_bytes_per_row = width * bytes_per_pixel;
+  const uint32_t aligned_bytes_per_row =
+      ((unaligned_bytes_per_row + 255) / 256) * 256;
+
+  const size_t rows = static_cast<size_t>(height);  // size_t: no 32-bit wrap
+  const size_t buffer_size = static_cast<size_t>(aligned_bytes_per_row) * rows;
+  std::vector<uint8_t> pixels(rows * unaligned_bytes_per_row);
+
+  // Create staging buffer for readback (with aligned size)
+  const WGPUBufferDescriptor buffer_desc = {
+      .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead,
+      .size = buffer_size,
+  };
+  WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &buffer_desc);
+  assert(staging && "Failed to create staging buffer");
+
+  // Create command encoder for copy operation
+  const WGPUCommandEncoderDescriptor enc_desc = {};
+  WGPUCommandEncoder encoder =
+      wgpuDeviceCreateCommandEncoder(device, &enc_desc);
+
+  // Copy texture to buffer (rows land at the padded pitch in the staging buffer)
+  const WGPUTexelCopyTextureInfo src = {
+      .texture = texture,
+      .mipLevel = 0,
+      .origin = {0, 0, 0},
+  };
+  const WGPUTexelCopyBufferInfo dst = {
+      .buffer = staging,
+      .layout =
+          {
+              .bytesPerRow = aligned_bytes_per_row,
+              .rowsPerImage = static_cast<uint32_t>(height),
+          },
+  };
+  const WGPUExtent3D copy_size = {static_cast<uint32_t>(width),
+                                  static_cast<uint32_t>(height), 1};
+  wgpuCommandEncoderCopyTextureToBuffer(encoder, &src, &dst, &copy_size);
+
+  // Submit commands
+  WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr);
+  WGPUQueue queue = wgpuDeviceGetQueue(device);
+  wgpuQueueSubmit(queue, 1, &commands);
+  wgpuCommandBufferRelease(commands);
+  wgpuCommandEncoderRelease(encoder);
+
+  // Map buffer for reading (API differs between Win32 and native)
+#if defined(DEMO_CROSS_COMPILE_WIN32)
+  // Win32: Old callback API
+  MapState map_state = {};
+  auto map_cb = [](WGPUBufferMapAsyncStatus status, void* userdata) {
+    MapState* state = static_cast<MapState*>(userdata);
+    state->status = status;
+    state->done = true;
+  };
+  wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buffer_size, map_cb,
+                     &map_state);
+#else
+  // Native: New callback info API
+  MapState map_state = {};
+  auto map_cb = [](WGPUMapAsyncStatus status, WGPUStringView message,
+                   void* userdata, void* user2) {
+    (void)message;
+    (void)user2;
+    MapState* state = static_cast<MapState*>(userdata);
+    state->status = status;
+    state->done = true;
+  };
+  WGPUBufferMapCallbackInfo map_info = {};
+  map_info.mode = WGPUCallbackMode_WaitAnyOnly;
+  map_info.callback = map_cb;
+  map_info.userdata1 = &map_state;
+  wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buffer_size, map_info);
+#endif
+
+  // Poll until the map callback fires; a timeout leaves status non-Success
+  for (int i = 0; i < 100 && !map_state.done; ++i) {
+#if defined(__EMSCRIPTEN__)
+    emscripten_sleep(10);
+#else
+    wgpuInstanceProcessEvents(instance);
+#endif
+  }
+
+  if (map_state.status != WGPUMapAsyncStatus_Success) {
+    fprintf(stderr, "Buffer mapping failed: %d\n", map_state.status);
+    wgpuBufferRelease(staging);
+    return pixels; // Zero-filled, not empty: nothing was read back
+  }
+
+  // Copy data from mapped buffer (handle row padding)
+  const uint8_t* mapped_data = static_cast<const uint8_t*>(
+      wgpuBufferGetConstMappedRange(staging, 0, buffer_size));
+  if (mapped_data) {
+    // If rows are padded, copy row by row to remove padding
+    if (aligned_bytes_per_row != unaligned_bytes_per_row) {
+      for (size_t y = 0; y < rows; ++y) {
+        memcpy(pixels.data() + y * unaligned_bytes_per_row,
+               mapped_data + y * aligned_bytes_per_row,
+               unaligned_bytes_per_row);
+      }
+    } else {
+      // No padding, direct copy
+      memcpy(pixels.data(), mapped_data, pixels.size());
+    }
+  }
+
+  // Cleanup
+  wgpuBufferUnmap(staging);
+  wgpuBufferRelease(staging);
+
+  return pixels;
+}
+
+#endif // !defined(STRIP_ALL)
diff --git a/src/gpu/texture_readback.h b/src/gpu/texture_readback.h
new file mode 100644
index 0000000..1bf770f
--- /dev/null
+++ b/src/gpu/texture_readback.h
@@ -0,0 +1,23 @@
+// GPU texture readback utility for offline processing
+// Synchronous blocking operation (waits for GPU completion)
+
+#pragma once
+
+// Protected with STRIP_ALL: only needed for dev tools, not final release
+#if !defined(STRIP_ALL)
+
+#include "platform/platform.h"
+#include <vector>
+#include <cstdint>
+
+// Read texture pixels to CPU memory (synchronous, blocking).
+// Format: BGRA8Unorm (4 bytes per pixel); rows are returned tightly packed.
+// Returns: width * height * 4 bytes (left zero-filled if mapping fails).
+std::vector<uint8_t> read_texture_pixels(
+    WGPUInstance instance,
+    WGPUDevice device,
+    WGPUTexture texture,
+    int width,
+    int height);
+
+#endif // !defined(STRIP_ALL)
diff --git a/src/tests/common/offscreen_render_target.cc b/src/tests/common/offscreen_render_target.cc
index 9f65e9a..10775a1 100644
--- a/src/tests/common/offscreen_render_target.cc
+++ b/src/tests/common/offscreen_render_target.cc
@@ -3,6 +3,7 @@
 // Provides pixel readback for validation.
 
 #include "offscreen_render_target.h"
+#include "gpu/texture_readback.h"
 #include <cassert>
 #include <cstdio>
 #include <cstring>
@@ -64,105 +65,9 @@ WGPUBuffer OffscreenRenderTarget::create_staging_buffer() {
 }
 
 std::vector<uint8_t> OffscreenRenderTarget::read_pixels() {
-  const size_t buffer_size = width_ * height_ * 4; // BGRA8
-  std::vector<uint8_t> pixels(buffer_size);
-
-  // Create staging buffer for readback
-  WGPUBuffer staging = create_staging_buffer();
-  assert(staging && "Failed to create staging buffer");
-
-  // Create command encoder for copy operation
-  const WGPUCommandEncoderDescriptor enc_desc = {};
-  WGPUCommandEncoder encoder =
-      wgpuDeviceCreateCommandEncoder(device_, &enc_desc);
-
-  // Copy texture to buffer
-  const WGPUTexelCopyTextureInfo src = {
-      .texture = texture_,
-      .mipLevel = 0,
-      .origin = {0, 0, 0},
-  };
-
-  const WGPUTexelCopyBufferInfo dst = {
-      .buffer = staging,
-      .layout =
-          {
-              .bytesPerRow = static_cast<uint32_t>(width_ * 4),
-              .rowsPerImage = static_cast<uint32_t>(height_),
-          },
-  };
-
-  const WGPUExtent3D copy_size = {static_cast<uint32_t>(width_),
-                                  static_cast<uint32_t>(height_), 1};
-
-  wgpuCommandEncoderCopyTextureToBuffer(encoder, &src, &dst, &copy_size);
-
-  // Submit commands
-  WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr);
-  WGPUQueue queue = wgpuDeviceGetQueue(device_);
-  wgpuQueueSubmit(queue, 1, &commands);
-  wgpuCommandBufferRelease(commands);
-  wgpuCommandEncoderRelease(encoder);
-
-  // CRITICAL: Wait for GPU work to complete before mapping
-  // Without this, buffer may be destroyed before copy finishes
-  // Note: Skipping wait for now - appears to be causing issues
-  // The buffer mapping will handle synchronization internally
-
-  // Map buffer for reading (API differs between Win32 and native)
-#if defined(DEMO_CROSS_COMPILE_WIN32)
-  // Win32: Old callback API
-  MapState map_state = {};
-  auto map_cb = [](WGPUBufferMapAsyncStatus status, void* userdata) {
-    MapState* state = static_cast<MapState*>(userdata);
-    state->status = status;
-    state->done = true;
-  };
-  wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buffer_size, map_cb,
-                     &map_state);
+#if !defined(STRIP_ALL)
+  return read_texture_pixels(instance_, device_, texture_, width_, height_);
 #else
-  // Native: New callback info API
-  MapState map_state = {};
-  auto map_cb = [](WGPUMapAsyncStatus status, WGPUStringView message,
-                   void* userdata, void* user2) {
-    (void)message;
-    (void)user2;
-    MapState* state = static_cast<MapState*>(userdata);
-    state->status = status;
-    state->done = true;
-  };
-  WGPUBufferMapCallbackInfo map_info = {};
-  map_info.mode = WGPUCallbackMode_WaitAnyOnly;
-  map_info.callback = map_cb;
-  map_info.userdata1 = &map_state;
-  wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buffer_size, map_info);
-#endif
-
-  // Wait for mapping to complete
-  for (int i = 0; i < 100 && !map_state.done; ++i) {
-#if defined(__EMSCRIPTEN__)
-    emscripten_sleep(10);
-#else
-    wgpuInstanceProcessEvents(instance_);
+  return std::vector<uint8_t>();  // No readback under STRIP_ALL: do not call
 #endif
-  }
-
-  if (map_state.status != WGPUMapAsyncStatus_Success) {
-    fprintf(stderr, "Buffer mapping failed: %d\n", map_state.status);
-    wgpuBufferRelease(staging);
-    return pixels; // Return empty
-  }
-
-  // Copy data from mapped buffer
-  const uint8_t* mapped_data = static_cast<const uint8_t*>(
-      wgpuBufferGetConstMappedRange(staging, 0, buffer_size));
-  if (mapped_data) {
-    memcpy(pixels.data(), mapped_data, buffer_size);
-  }
-
-  // Cleanup
-  wgpuBufferUnmap(staging);
-  wgpuBufferRelease(staging);
-
-  return pixels;
 }
author	skal <pascal.massimino@gmail.com>	2026-02-11 07:07:29 +0100
committer	skal <pascal.massimino@gmail.com>	2026-02-11 07:07:29 +0100
commit	3915a5e1c8c904f8f2154845cb99223a598653ee (patch)
tree	cb0e75dea7f8aa729d3b440a5e81b3ac811f8f04 /src
parent	01e640be66f9d72c22417403eb88e18d6747866f (diff)