summary refs log tree commit diff
path: root/src/gpu/texture_readback.cc
diff options
context:
space:
mode:
author skal <pascal.massimino@gmail.com> 2026-02-11 10:51:06 +0100
committer skal <pascal.massimino@gmail.com> 2026-02-11 10:51:06 +0100
commit 4da0a3a5369142078fd7c681e3f0f1817bd6e2f3 (patch)
tree d69429d6800dad0bb819f164122df634543796a5 /src/gpu/texture_readback.cc
parent 7dd1ac57178055aa8407777d4fb03787e21e6f66 (diff)
add --save-intermediates to train.py and cnn_test
Diffstat (limited to 'src/gpu/texture_readback.cc')
-rw-r--r-- src/gpu/texture_readback.cc 158
1 files changed, 158 insertions, 0 deletions
diff --git a/src/gpu/texture_readback.cc b/src/gpu/texture_readback.cc
index 0eb63d7..f3e4056 100644
--- a/src/gpu/texture_readback.cc
+++ b/src/gpu/texture_readback.cc
@@ -142,4 +142,162 @@ std::vector<uint8_t> read_texture_pixels(
return pixels;
}
+// Expands a 16-bit half-float bit pattern (1 sign, 5 exponent, 10 fraction
+// bits) into the float with the same value. Signed zeros, subnormals,
+// infinities and NaNs (fraction bits kept) are all handled.
+static float fp16_to_float(uint16_t h) {
+  // Reinterprets a 32-bit pattern as a float via memcpy.
+  const auto as_float = [](uint32_t word) {
+    float value;
+    memcpy(&value, &word, sizeof(float));
+    return value;
+  };
+
+  const uint32_t sign_bit = static_cast<uint32_t>(h & 0x8000) << 16;
+  uint32_t exponent = (h >> 10) & 0x1F;
+  uint32_t fraction = h & 0x03FF;
+
+  if (exponent == 31) {
+    // All-ones exponent: infinity when the fraction is zero, NaN otherwise.
+    return as_float(sign_bit | 0x7F800000 | (fraction << 13));
+  }
+
+  if (exponent == 0) {
+    if (fraction == 0) {
+      // Positive or negative zero.
+      return as_float(sign_bit);
+    }
+    // Subnormal: shift the fraction left until its leading one reaches
+    // bit 10, lowering the exponent once per shift. The exponent is unsigned
+    // and may wrap below zero here; adding the re-bias constant below wraps
+    // it back to the intended small positive value.
+    for (exponent = 1; (fraction & 0x400) == 0; --exponent) {
+      fraction <<= 1;
+    }
+    fraction &= 0x3FF;  // drop the now-implicit leading one
+  }
+
+  // Re-bias the exponent (127 - 15 = 112) and widen the fraction from 10 to
+  // 23 bits.
+  return as_float(sign_bit | ((exponent + 112) << 23) | (fraction << 13));
+}
+
+// Maps a channel value nominally in [-1, 1] onto [0, 255], truncating toward
+// zero. Values outside that range saturate and NaN becomes 0: converting an
+// out-of-range or NaN float straight to uint8_t is undefined behavior.
+static uint8_t snorm_float_to_u8(float value) {
+  // [-1,1] -> [0,1] -> [0,255]
+  const float scaled = (value + 1.0f) * 0.5f * 255.0f;
+  if (!(scaled > 0.0f)) {
+    return 0;  // negative, zero, or NaN (every comparison with NaN is false)
+  }
+  if (scaled >= 255.0f) {
+    return 255;
+  }
+  return static_cast<uint8_t>(scaled);
+}
+
+// Reads back an RGBA16Float texture and converts it to 8 bits per channel.
+//
+// The texture is copied into a mappable staging buffer, the buffer is mapped
+// for reading, and every half-float channel is remapped from [-1, 1] to
+// [0, 255] (saturating outside that range).
+//
+// Parameters:
+//   device, queue  - WebGPU device and queue used for the copy and polling.
+//   texture        - source texture; mip level 0 is read from origin (0,0,0).
+//   width, height  - size in pixels of the region to read; must be positive.
+//
+// Returns width * height * 4 tightly packed bytes, with each pixel stored in
+// B, G, R, A order. Returns an empty vector if the dimensions are invalid or
+// too large, or if the staging buffer cannot be created, mapped, or read.
+std::vector<uint8_t> texture_readback_fp16_to_u8(
+    WGPUDevice device,
+    WGPUQueue queue,
+    WGPUTexture texture,
+    int width,
+    int height) {
+  if (width <= 0 || height <= 0) {
+    return {};
+  }
+  const uint32_t tex_width = static_cast<uint32_t>(width);
+  const uint32_t tex_height = static_cast<uint32_t>(height);
+
+  // Row stride in the staging buffer, padded up to a multiple of 256 bytes.
+  // Computed in 64 bits so that very wide textures cannot silently wrap.
+  const uint64_t bytes_per_pixel = 8;  // RGBA16Float = 4 channels x 2 bytes
+  const uint64_t unaligned_bytes_per_row = tex_width * bytes_per_pixel;
+  const uint64_t padded_bytes_per_row =
+      ((unaligned_bytes_per_row + 255) / 256) * 256;
+  const uint32_t aligned_bytes_per_row =
+      static_cast<uint32_t>(padded_bytes_per_row);
+  if (aligned_bytes_per_row != padded_bytes_per_row) {
+    return {};  // stride does not fit the 32-bit bytesPerRow field
+  }
+
+  const uint64_t total_bytes = padded_bytes_per_row * tex_height;
+  const size_t buffer_size = static_cast<size_t>(total_bytes);
+  if (buffer_size != total_bytes) {
+    return {};  // staging buffer would not be addressable on this platform
+  }
+
+  // Create staging buffer
+  const WGPUBufferDescriptor buffer_desc = {
+      .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead,
+      .size = buffer_size,
+  };
+  WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &buffer_desc);
+  if (!staging) {
+    return {};
+  }
+
+  // Copy texture to buffer
+  WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(device, nullptr);
+  const WGPUTexelCopyTextureInfo src = {
+      .texture = texture,
+      .mipLevel = 0,
+      .origin = {0, 0, 0},
+  };
+  const WGPUTexelCopyBufferInfo dst = {
+      .buffer = staging,
+      .layout =
+          {
+              .bytesPerRow = aligned_bytes_per_row,
+              .rowsPerImage = tex_height,
+          },
+  };
+  const WGPUExtent3D copy_size = {tex_width, tex_height, 1};
+  wgpuCommandEncoderCopyTextureToBuffer(encoder, &src, &dst, &copy_size);
+
+  WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr);
+  wgpuQueueSubmit(queue, 1, &commands);
+  wgpuCommandBufferRelease(commands);
+  wgpuCommandEncoderRelease(encoder);
+  wgpuDevicePoll(device, true, nullptr);
+
+  // Map buffer. The callback signature differs between the two supported
+  // WebGPU header versions; both variants just record the result.
+  MapState map_state = {};
+#if defined(DEMO_CROSS_COMPILE_WIN32)
+  auto map_cb = [](WGPUBufferMapAsyncStatus status, void* userdata) {
+    MapState* state = static_cast<MapState*>(userdata);
+    state->status = status;
+    state->done = true;
+  };
+  wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buffer_size, map_cb,
+                     &map_state);
+#else
+  auto map_cb = [](WGPUMapAsyncStatus status, WGPUStringView message,
+                   void* userdata, void* user2) {
+    (void)message;
+    (void)user2;
+    MapState* state = static_cast<MapState*>(userdata);
+    state->status = status;
+    state->done = true;
+  };
+  WGPUBufferMapCallbackInfo map_info = {};
+  map_info.mode = WGPUCallbackMode_AllowProcessEvents;
+  map_info.callback = map_cb;
+  map_info.userdata1 = &map_state;
+  wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buffer_size, map_info);
+#endif
+
+  // Poll the device, at most 100 times, until the map callback has fired.
+  for (int i = 0; i < 100 && !map_state.done; ++i) {
+    wgpuDevicePoll(device, true, nullptr);
+  }
+
+  if (!map_state.done || map_state.status != WGPUMapAsyncStatus_Success) {
+    wgpuBufferRelease(staging);
+    return {};
+  }
+
+  const uint8_t* mapped_bytes = static_cast<const uint8_t*>(
+      wgpuBufferGetConstMappedRange(staging, 0, buffer_size));
+  if (!mapped_bytes) {
+    // Report failure like the other error paths rather than handing back an
+    // all-zero image.
+    wgpuBufferUnmap(staging);
+    wgpuBufferRelease(staging);
+    return {};
+  }
+
+  // Convert FP16 to U8 ([-1,1] -> [0,255]), dropping the row padding.
+  const size_t out_width = tex_width;
+  std::vector<uint8_t> pixels(out_width * tex_height * 4);
+  for (size_t y = 0; y < tex_height; ++y) {
+    const uint16_t* src_row = reinterpret_cast<const uint16_t*>(
+        mapped_bytes + y * aligned_bytes_per_row);
+    uint8_t* dst_row = pixels.data() + y * out_width * 4;
+    for (size_t x = 0; x < out_width; ++x) {
+      const float r = fp16_to_float(src_row[x * 4 + 0]);
+      const float g = fp16_to_float(src_row[x * 4 + 1]);
+      const float b = fp16_to_float(src_row[x * 4 + 2]);
+      const float a = fp16_to_float(src_row[x * 4 + 3]);
+
+      // Source channels are R,G,B,A; the output is written as B,G,R,A.
+      dst_row[x * 4 + 0] = snorm_float_to_u8(b);  // B
+      dst_row[x * 4 + 1] = snorm_float_to_u8(g);  // G
+      dst_row[x * 4 + 2] = snorm_float_to_u8(r);  // R
+      dst_row[x * 4 + 3] = snorm_float_to_u8(a);  // A
+    }
+  }
+
+  wgpuBufferUnmap(staging);
+  wgpuBufferRelease(staging);
+  return pixels;
+}
+
#endif // !defined(STRIP_ALL)