add --save-intermediates to train.py and cnn_test

author: skal <pascal.massimino@gmail.com> 2026-02-11 10:51:06 +0100
committer: skal <pascal.massimino@gmail.com> 2026-02-11 10:51:06 +0100
commit: 4da0a3a5369142078fd7c681e3f0f1817bd6e2f3 (patch)
tree: d69429d6800dad0bb819f164122df634543796a5 /src/gpu/texture_readback.cc
parent: 7dd1ac57178055aa8407777d4fb03787e21e6f66 (diff)
1 files changed, 158 insertions, 0 deletions
diff --git a/src/gpu/texture_readback.cc b/src/gpu/texture_readback.cc
index 0eb63d7..f3e4056 100644
--- a/src/gpu/texture_readback.cc
+++ b/src/gpu/texture_readback.cc
@@ -142,4 +142,162 @@ std::vector<uint8_t> read_texture_pixels(
   return pixels;
 }
 
+// Expands an IEEE 754 binary16 (half-float) bit pattern into a 32-bit float.
+// Handles signed zero, subnormals, normals, infinities and NaNs; the sign,
+// exponent and mantissa fields are re-packed and copied out with memcpy.
+static float fp16_to_float(uint16_t h) {
+  const uint32_t sign_bit = static_cast<uint32_t>(h & 0x8000) << 16;
+  uint32_t exponent = (h >> 10) & 0x1F;
+  uint32_t fraction = h & 0x03FF;
+  uint32_t out_bits;
+
+  if (exponent == 31) {
+    // Inf or NaN: saturate the exponent and keep the payload bits.
+    out_bits = sign_bit | 0x7F800000 | (fraction << 13);
+  } else if (exponent == 0 && fraction == 0) {
+    // Signed zero.
+    out_bits = sign_bit;
+  } else {
+    if (exponent == 0) {
+      // Subnormal: shift the fraction up until the implicit leading one
+      // appears, lowering the exponent by one per shift. The unsigned
+      // wrap-around of `exponent` is undone by the +112 rebias below.
+      exponent = 1;
+      for (; (fraction & 0x400) == 0; fraction <<= 1) {
+        --exponent;
+      }
+      fraction &= 0x3FF;
+    }
+    // Rebias the exponent from 15 (half) to 127 (single): 127 - 15 = 112.
+    out_bits = sign_bit | ((exponent + 112) << 23) | (fraction << 13);
+  }
+
+  float value;
+  memcpy(&value, &out_bits, sizeof(float));
+  return value;
+}
+
+// Maps a value in [-1,1] to a byte in [0,255]; out-of-range input is clamped
+// and NaN yields 0, so the float-to-uint8_t cast below is always defined.
+static uint8_t fp16_snorm_to_u8(float v) {
+  const float unit = (v + 1.0f) * 0.5f;
+  if (!(unit > 0.0f)) return 0; // negative, zero or NaN
+  if (unit >= 1.0f) return 255;
+  return static_cast<uint8_t>(unit * 255.0f);
+}
+
+// Reads back mip level 0 of `texture`, treated as RGBA16Float, and converts
+// each channel from [-1,1] to an 8-bit value. Output is tightly packed, four
+// bytes per pixel in B,G,R,A order. Returns an empty vector when the dimensions
+// are not positive or when buffer creation, mapping or range retrieval fails.
+std::vector<uint8_t> texture_readback_fp16_to_u8(
+    WGPUDevice device, WGPUQueue queue, WGPUTexture texture, int width,
+    int height) {
+  if (width <= 0 || height <= 0) {
+    return {};
+  }
+
+  // Align bytes per row to 256
+  const uint32_t bytes_per_pixel = 8; // RGBA16Float = 4 × 2 bytes
+  const uint32_t unaligned_bytes_per_row = width * bytes_per_pixel;
+  const uint32_t aligned_bytes_per_row =
+      ((unaligned_bytes_per_row + 255) / 256) * 256;
+
+  // Widen before multiplying so large textures cannot wrap in 32 bits.
+  const size_t buffer_size =
+      static_cast<size_t>(aligned_bytes_per_row) * static_cast<size_t>(height);
+
+  // Create staging buffer
+  const WGPUBufferDescriptor buffer_desc = {
+      .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead,
+      .size = buffer_size,
+  };
+  WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &buffer_desc);
+  if (!staging) {
+    return {};
+  }
+
+  // Copy texture to buffer
+  WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(device, nullptr);
+  const WGPUTexelCopyTextureInfo src = {
+      .texture = texture,
+      .mipLevel = 0,
+      .origin = {0, 0, 0},
+  };
+  const WGPUTexelCopyBufferInfo dst = {
+      .buffer = staging,
+      .layout = {.bytesPerRow = aligned_bytes_per_row,
+                 .rowsPerImage = static_cast<uint32_t>(height)},
+  };
+  const WGPUExtent3D copy_size = {static_cast<uint32_t>(width),
+                                  static_cast<uint32_t>(height), 1};
+  wgpuCommandEncoderCopyTextureToBuffer(encoder, &src, &dst, &copy_size);
+
+  WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr);
+  wgpuQueueSubmit(queue, 1, &commands);
+  wgpuCommandBufferRelease(commands);
+  wgpuCommandEncoderRelease(encoder);
+  wgpuDevicePoll(device, true, nullptr);
+
+  // Map buffer (the async-map callback signature differs per build target)
+  MapState map_state = {};
+#if defined(DEMO_CROSS_COMPILE_WIN32)
+  auto map_cb = [](WGPUBufferMapAsyncStatus status, void* userdata) {
+    MapState* state = static_cast<MapState*>(userdata);
+    state->status = status;
+    state->done = true;
+  };
+  wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buffer_size, map_cb,
+                     &map_state);
+#else
+  auto map_cb = [](WGPUMapAsyncStatus status, WGPUStringView message,
+                   void* userdata, void* user2) {
+    (void)message;
+    (void)user2;
+    MapState* state = static_cast<MapState*>(userdata);
+    state->status = status;
+    state->done = true;
+  };
+  WGPUBufferMapCallbackInfo map_info = {};
+  map_info.mode = WGPUCallbackMode_AllowProcessEvents;
+  map_info.callback = map_cb;
+  map_info.userdata1 = &map_state;
+  wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buffer_size, map_info);
+#endif
+
+  // Poll a bounded number of times so a stuck map cannot hang the caller.
+  for (int i = 0; i < 100 && !map_state.done; ++i) {
+    wgpuDevicePoll(device, true, nullptr);
+  }
+
+  if (!map_state.done || map_state.status != WGPUMapAsyncStatus_Success) {
+    wgpuBufferRelease(staging);
+    return {};
+  }
+
+  // Convert FP16 to U8 ([-1,1] → [0,255]); stays empty if the range is null.
+  const uint8_t* mapped_bytes = static_cast<const uint8_t*>(
+      wgpuBufferGetConstMappedRange(staging, 0, buffer_size));
+  std::vector<uint8_t> pixels;
+  if (mapped_bytes) {
+    pixels.resize(static_cast<size_t>(width) * height * 4);
+    uint8_t* out = pixels.data();
+    for (int y = 0; y < height; ++y) {
+      // Source rows are padded to aligned_bytes_per_row; output rows are not.
+      const uint16_t* src_row = reinterpret_cast<const uint16_t*>(
+          mapped_bytes + static_cast<size_t>(y) * aligned_bytes_per_row);
+      for (int x = 0; x < width; ++x, out += 4) {
+        out[0] = fp16_snorm_to_u8(fp16_to_float(src_row[x * 4 + 2])); // B
+        out[1] = fp16_snorm_to_u8(fp16_to_float(src_row[x * 4 + 1])); // G
+        out[2] = fp16_snorm_to_u8(fp16_to_float(src_row[x * 4 + 0])); // R
+        out[3] = fp16_snorm_to_u8(fp16_to_float(src_row[x * 4 + 3])); // A
+      }
+    }
+  }
+
+  wgpuBufferUnmap(staging);
+  wgpuBufferRelease(staging);
+  return pixels;
+}
+
 #endif // !defined(STRIP_ALL)
author	skal <pascal.massimino@gmail.com>	2026-02-11 10:51:06 +0100
committer	skal <pascal.massimino@gmail.com>	2026-02-11 10:51:06 +0100
commit	4da0a3a5369142078fd7c681e3f0f1817bd6e2f3 (patch)
tree	d69429d6800dad0bb819f164122df634543796a5 /src/gpu/texture_readback.cc
parent	7dd1ac57178055aa8407777d4fb03787e21e6f66 (diff)