// GPU texture readback utility implementation // Extracts texture pixels to CPU memory for offline processing #include "gpu/texture_readback.h" #if !defined(STRIP_ALL) #include #include #include // Callback state for async buffer mapping struct MapState { bool done = false; WGPUMapAsyncStatus status = WGPUMapAsyncStatus_Unknown; }; std::vector read_texture_pixels( WGPUInstance instance, WGPUDevice device, WGPUTexture texture, int width, int height) { // Align bytes per row to 256 (COPY_BYTES_PER_ROW_ALIGNMENT) const uint32_t bytes_per_pixel = 4; // BGRA8 const uint32_t unaligned_bytes_per_row = width * bytes_per_pixel; const uint32_t aligned_bytes_per_row = ((unaligned_bytes_per_row + 255) / 256) * 256; const size_t buffer_size = aligned_bytes_per_row * height; std::vector pixels(width * height * bytes_per_pixel); // Create staging buffer for readback (with aligned size) const WGPUBufferDescriptor buffer_desc = { .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead, .size = buffer_size, }; WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &buffer_desc); assert(staging && "Failed to create staging buffer"); // Create command encoder for copy operation const WGPUCommandEncoderDescriptor enc_desc = {}; WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(device, &enc_desc); // Copy texture to buffer const WGPUTexelCopyTextureInfo src = { .texture = texture, .mipLevel = 0, .origin = {0, 0, 0}, }; const WGPUTexelCopyBufferInfo dst = { .buffer = staging, .layout = { .bytesPerRow = aligned_bytes_per_row, .rowsPerImage = static_cast(height), }, }; const WGPUExtent3D copy_size = {static_cast(width), static_cast(height), 1}; wgpuCommandEncoderCopyTextureToBuffer(encoder, &src, &dst, ©_size); // Submit commands WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr); WGPUQueue queue = wgpuDeviceGetQueue(device); wgpuQueueSubmit(queue, 1, &commands); wgpuCommandBufferRelease(commands); wgpuCommandEncoderRelease(encoder); // Wait for copy to complete before mapping wgpuDevicePoll(device, true, nullptr); // Map buffer for reading (API differs between Win32 and native) #if defined(DEMO_CROSS_COMPILE_WIN32) // Win32: Old callback API MapState map_state = {}; auto map_cb = [](WGPUBufferMapAsyncStatus status, void* userdata) { MapState* state = static_cast(userdata); state->status = status; state->done = true; }; wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buffer_size, map_cb, &map_state); #else // Native: New callback info API MapState map_state = {}; auto map_cb = [](WGPUMapAsyncStatus status, WGPUStringView message, void* userdata, void* user2) { (void)message; (void)user2; MapState* state = static_cast(userdata); state->status = status; state->done = true; }; WGPUBufferMapCallbackInfo map_info = {}; map_info.mode = WGPUCallbackMode_AllowProcessEvents; // Fire during ProcessEvents map_info.callback = map_cb; map_info.userdata1 = &map_state; wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buffer_size, map_info); #endif // Wait for mapping to complete (synchronous blocking) for (int i = 0; i < 100 && !map_state.done; ++i) { #if defined(__EMSCRIPTEN__) emscripten_sleep(10); #else wgpuDevicePoll(device, true, nullptr); #endif } if (!map_state.done || map_state.status != WGPUMapAsyncStatus_Success) { wgpuBufferRelease(staging); return pixels; // Return empty on timeout or failure } // Copy data from mapped buffer (handle row padding) const uint8_t* mapped_data = static_cast( wgpuBufferGetConstMappedRange(staging, 0, buffer_size)); if (mapped_data) { // If rows are aligned, copy row by row to remove padding if (aligned_bytes_per_row != unaligned_bytes_per_row) { for (int y = 0; y < height; ++y) { memcpy(pixels.data() + y * unaligned_bytes_per_row, mapped_data + y * aligned_bytes_per_row, unaligned_bytes_per_row); } } else { // No padding, direct copy memcpy(pixels.data(), mapped_data, pixels.size()); } } // Cleanup wgpuBufferUnmap(staging); wgpuBufferRelease(staging); return pixels; } // Half-float (FP16) to float conversion static float fp16_to_float(uint16_t h) { uint32_t sign = (h & 0x8000) << 16; uint32_t exp = (h & 0x7C00) >> 10; uint32_t mant = (h & 0x03FF); if (exp == 0) { if (mant == 0) { // Zero uint32_t bits = sign; float result; memcpy(&result, &bits, sizeof(float)); return result; } // Denormalized exp = 1; while ((mant & 0x400) == 0) { mant <<= 1; exp--; } mant &= 0x3FF; } else if (exp == 31) { // Inf or NaN uint32_t bits = sign | 0x7F800000 | (mant << 13); float result; memcpy(&result, &bits, sizeof(float)); return result; } uint32_t bits = sign | ((exp + 112) << 23) | (mant << 13); float result; memcpy(&result, &bits, sizeof(float)); return result; } std::vector texture_readback_fp16_to_u8( WGPUDevice device, WGPUQueue queue, WGPUTexture texture, int width, int height) { // Align bytes per row to 256 const uint32_t bytes_per_pixel = 8; // RGBA16Float = 4 × 2 bytes const uint32_t unaligned_bytes_per_row = width * bytes_per_pixel; const uint32_t aligned_bytes_per_row = ((unaligned_bytes_per_row + 255) / 256) * 256; const size_t buffer_size = aligned_bytes_per_row * height; // Create staging buffer const WGPUBufferDescriptor buffer_desc = { .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead, .size = buffer_size, }; WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &buffer_desc); if (!staging) { return {}; } // Copy texture to buffer WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(device, nullptr); const WGPUTexelCopyTextureInfo src = { .texture = texture, .mipLevel = 0, .origin = {0, 0, 0}, }; const WGPUTexelCopyBufferInfo dst = { .buffer = staging, .layout = { .bytesPerRow = aligned_bytes_per_row, .rowsPerImage = static_cast(height), }, }; const WGPUExtent3D copy_size = {static_cast(width), static_cast(height), 1}; wgpuCommandEncoderCopyTextureToBuffer(encoder, &src, &dst, ©_size); WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr); wgpuQueueSubmit(queue, 1, &commands); wgpuCommandBufferRelease(commands); wgpuCommandEncoderRelease(encoder); wgpuDevicePoll(device, true, nullptr); // Map buffer #if defined(DEMO_CROSS_COMPILE_WIN32) MapState map_state = {}; auto map_cb = [](WGPUBufferMapAsyncStatus status, void* userdata) { MapState* state = static_cast(userdata); state->status = status; state->done = true; }; wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buffer_size, map_cb, &map_state); #else MapState map_state = {}; auto map_cb = [](WGPUMapAsyncStatus status, WGPUStringView message, void* userdata, void* user2) { (void)message; (void)user2; MapState* state = static_cast(userdata); state->status = status; state->done = true; }; WGPUBufferMapCallbackInfo map_info = {}; map_info.mode = WGPUCallbackMode_AllowProcessEvents; map_info.callback = map_cb; map_info.userdata1 = &map_state; wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buffer_size, map_info); #endif for (int i = 0; i < 100 && !map_state.done; ++i) { wgpuDevicePoll(device, true, nullptr); } if (!map_state.done || map_state.status != WGPUMapAsyncStatus_Success) { wgpuBufferRelease(staging); return {}; } // Convert FP16 to U8 ([-1,1] → [0,255]) const uint16_t* mapped_data = static_cast( wgpuBufferGetConstMappedRange(staging, 0, buffer_size)); std::vector pixels(width * height * 4); if (mapped_data) { for (int y = 0; y < height; ++y) { const uint16_t* src_row = reinterpret_cast( reinterpret_cast(mapped_data) + y * aligned_bytes_per_row); for (int x = 0; x < width; ++x) { float r = fp16_to_float(src_row[x * 4 + 0]); float g = fp16_to_float(src_row[x * 4 + 1]); float b = fp16_to_float(src_row[x * 4 + 2]); float a = fp16_to_float(src_row[x * 4 + 3]); // Convert [-1,1] → [0,1] → [0,255] r = (r + 1.0f) * 0.5f; g = (g + 1.0f) * 0.5f; b = (b + 1.0f) * 0.5f; a = (a + 1.0f) * 0.5f; int idx = (y * width + x) * 4; pixels[idx + 0] = static_cast(b * 255.0f); // B pixels[idx + 1] = static_cast(g * 255.0f); // G pixels[idx + 2] = static_cast(r * 255.0f); // R pixels[idx + 3] = static_cast(a * 255.0f); // A } } } wgpuBufferUnmap(staging); wgpuBufferRelease(staging); return pixels; } #endif // !defined(STRIP_ALL)