// CNN shader testing tool for offline validation // Tests trained CNN shaders on input PNG with GPU readback #if defined(STRIP_ALL) #error "cnn_test requires STRIP_ALL=OFF (tool builds only)" #endif #include "platform/platform.h" #include "gpu/gpu.h" #include "gpu/bind_group_builder.h" #include "gpu/pipeline_builder.h" #include "gpu/sampler_cache.h" #include "gpu/texture_readback.h" #include "gpu/effects/post_process_helper.h" #include "gpu/effects/cnn_effect.h" #include "gpu/effects/shader_composer.h" #include "gpu/effects/shaders.h" #include "tests/common/webgpu_test_fixture.h" #include "tests/common/offscreen_render_target.h" #include "generated/assets.h" #include "util/asset_manager.h" #include "util/mini_math.h" #include "stb_image.h" #include "wgpu-native/examples/capture/stb_image_write.h" #include #include #include #include #include // Helper to get asset string or empty string static const char* SafeGetAsset(AssetId id) { const uint8_t* data = GetAsset(id); return data ? (const char*)data : ""; } // Command-line arguments struct Args { const char* input_path = nullptr; const char* output_path = nullptr; float blend = 1.0f; bool output_png = true; // Default to PNG const char* save_intermediates = nullptr; int num_layers = 3; // Default to 3 layers bool debug_hex = false; // Print first 8 pixels as hex int cnn_version = 1; // 1=CNNEffect, 2=CNNv2Effect const char* weights_path = nullptr; // Optional .bin weights file bool cnn_version_explicit = false; // Track if --cnn-version was explicitly set }; // Parse command-line arguments static bool parse_args(int argc, char** argv, Args* args) { if (argc < 3) { return false; } args->input_path = argv[1]; args->output_path = argv[2]; for (int i = 3; i < argc; ++i) { if (strcmp(argv[i], "--blend") == 0 && i + 1 < argc) { args->blend = atof(argv[++i]); if (args->blend < 0.0f || args->blend > 1.0f) { fprintf(stderr, "Error: blend must be in range [0.0, 1.0]\n"); return false; } } else if (strcmp(argv[i], "--format") == 0 && i + 1 < argc) { ++i; if (strcmp(argv[i], "ppm") == 0) { args->output_png = false; } else if (strcmp(argv[i], "png") == 0) { args->output_png = true; } else { fprintf(stderr, "Error: unknown format '%s' (use 'png' or 'ppm')\n", argv[i]); return false; } } else if (strcmp(argv[i], "--save-intermediates") == 0 && i + 1 < argc) { args->save_intermediates = argv[++i]; } else if (strcmp(argv[i], "--layers") == 0 && i + 1 < argc) { args->num_layers = atoi(argv[++i]); if (args->num_layers < 1 || args->num_layers > 10) { fprintf(stderr, "Error: layers must be in range [1, 10]\n"); return false; } } else if (strcmp(argv[i], "--debug-hex") == 0) { args->debug_hex = true; } else if (strcmp(argv[i], "--cnn-version") == 0 && i + 1 < argc) { args->cnn_version = atoi(argv[++i]); args->cnn_version_explicit = true; if (args->cnn_version < 1 || args->cnn_version > 2) { fprintf(stderr, "Error: cnn-version must be 1 or 2\n"); return false; } } else if (strcmp(argv[i], "--weights") == 0 && i + 1 < argc) { args->weights_path = argv[++i]; } else if (strcmp(argv[i], "--help") == 0) { return false; } else { fprintf(stderr, "Error: unknown option '%s'\n", argv[i]); return false; } } // Force CNN v2 when --weights is specified if (args->weights_path) { if (args->cnn_version_explicit && args->cnn_version != 2) { fprintf(stderr, "WARNING: --cnn-version %d ignored (--weights forces CNN v2)\n", args->cnn_version); } args->cnn_version = 2; // Warn if --layers was specified (binary file config takes precedence) if (args->num_layers != 3) { // 3 is the default fprintf(stderr, "WARNING: --layers %d ignored (--weights loads layer config from .bin)\n", args->num_layers); } } return true; } // Print usage static void print_usage(const char* prog) { fprintf(stderr, "Usage: %s input.png output.png [OPTIONS]\n", prog); fprintf(stderr, "\nOPTIONS:\n"); fprintf(stderr, " --blend F Final blend amount (0.0-1.0, default: 1.0)\n"); fprintf(stderr, " --format ppm|png Output format (default: png)\n"); fprintf(stderr, " --layers N Number of CNN layers (1-10, default: 3, ignored with --weights)\n"); fprintf(stderr, " --save-intermediates DIR Save intermediate layers to directory\n"); fprintf(stderr, " --debug-hex Print first 8 pixels as hex (debug)\n"); fprintf(stderr, " --cnn-version N CNN version: 1 (default) or 2 (ignored with --weights)\n"); fprintf(stderr, " --weights PATH Load weights from .bin (forces CNN v2, overrides layer config)\n"); fprintf(stderr, " --help Show this help\n"); } // Load PNG and upload to GPU texture static WGPUTexture load_texture(WGPUDevice device, WGPUQueue queue, const char* path, int* out_width, int* out_height) { int width, height, channels; uint8_t* data = stbi_load(path, &width, &height, &channels, 4); if (!data) { fprintf(stderr, "Error: failed to load image '%s'\n", path); return nullptr; } *out_width = width; *out_height = height; // Create texture const WGPUTextureDescriptor texture_desc = { .usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst | WGPUTextureUsage_RenderAttachment, .dimension = WGPUTextureDimension_2D, .size = {static_cast(width), static_cast(height), 1}, .format = WGPUTextureFormat_BGRA8Unorm, .mipLevelCount = 1, .sampleCount = 1, }; WGPUTexture texture = wgpuDeviceCreateTexture(device, &texture_desc); if (!texture) { fprintf(stderr, "Error: failed to create texture\n"); stbi_image_free(data); return nullptr; } // Convert RGBA → BGRA std::vector bgra_data(width * height * 4); for (int i = 0; i < width * height; ++i) { bgra_data[i * 4 + 0] = data[i * 4 + 2]; // B bgra_data[i * 4 + 1] = data[i * 4 + 1]; // G bgra_data[i * 4 + 2] = data[i * 4 + 0]; // R bgra_data[i * 4 + 3] = data[i * 4 + 3]; // A } // Upload to GPU const WGPUTexelCopyTextureInfo dst = {.texture = texture, .mipLevel = 0}; const WGPUTexelCopyBufferLayout layout = { .bytesPerRow = static_cast(width * 4), .rowsPerImage = static_cast(height)}; const WGPUExtent3D size = {static_cast(width), static_cast(height), 1}; wgpuQueueWriteTexture(queue, &dst, bgra_data.data(), bgra_data.size(), &layout, &size); stbi_image_free(data); return texture; } // Load PNG alpha channel as depth texture (or 1.0 if no alpha) static WGPUTexture load_depth_from_alpha(WGPUDevice device, WGPUQueue queue, const char* path, int width, int height) { int w, h, channels; uint8_t* data = stbi_load(path, &w, &h, &channels, 4); if (!data || w != width || h != height) { fprintf(stderr, "Error: failed to load depth from '%s'\n", path); if (data) stbi_image_free(data); return nullptr; } // Extract alpha channel (or use 1.0 if original was RGB) std::vector depth_data(width * height); bool has_alpha = (channels == 4); for (int i = 0; i < width * height; ++i) { // Alpha is in data[i*4+3] (0-255), convert to float [0, 1] // If no alpha channel, default to 1.0 (far plane) depth_data[i] = has_alpha ? (data[i * 4 + 3] / 255.0f) : 1.0f; } stbi_image_free(data); // Create R32Float depth texture const WGPUTextureDescriptor depth_desc = { .usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst, .dimension = WGPUTextureDimension_2D, .size = {static_cast(width), static_cast(height), 1}, .format = WGPUTextureFormat_R32Float, .mipLevelCount = 1, .sampleCount = 1, }; WGPUTexture depth_texture = wgpuDeviceCreateTexture(device, &depth_desc); if (!depth_texture) { fprintf(stderr, "Error: failed to create depth texture\n"); return nullptr; } // Write depth data const WGPUTexelCopyTextureInfo dst = { .texture = depth_texture, .mipLevel = 0 }; const WGPUTexelCopyBufferLayout layout = { .bytesPerRow = static_cast(width * sizeof(float)), .rowsPerImage = static_cast(height) }; const WGPUExtent3D size = { static_cast(width), static_cast(height), 1 }; wgpuQueueWriteTexture(queue, &dst, depth_data.data(), depth_data.size() * sizeof(float), &layout, &size); printf("Loaded depth from alpha: %dx%d (%s alpha)\n", width, height, has_alpha ? "has" : "no"); return depth_texture; } // Create CNN render pipeline (5 bindings) // Takes both intermediate format (RGBA16Float) and final format (BGRA8Unorm) static WGPURenderPipeline create_cnn_pipeline(WGPUDevice device, WGPUTextureFormat format, bool is_final_layer) { const char* shader_code = SafeGetAsset(AssetId::ASSET_SHADER_CNN_LAYER); // Debug: check if shader loaded if (!shader_code || shader_code[0] == '\0') { fprintf(stderr, "ERROR: CNN shader asset not loaded!\n"); return nullptr; } printf("Loaded CNN shader: %zu bytes\n", strlen(shader_code)); WGPUBindGroupLayout bgl = BindGroupLayoutBuilder() .sampler(0, WGPUShaderStage_Fragment) .texture(1, WGPUShaderStage_Fragment) .uniform(2, WGPUShaderStage_Vertex | WGPUShaderStage_Fragment) .uniform(3, WGPUShaderStage_Fragment) .texture(4, WGPUShaderStage_Fragment) // Original input .build(device); // Use appropriate format: RGBA16Float for intermediate, BGRA8Unorm for final WGPUTextureFormat output_format = is_final_layer ? WGPUTextureFormat_BGRA8Unorm : WGPUTextureFormat_RGBA16Float; WGPURenderPipeline pipeline = RenderPipelineBuilder(device) .shader(shader_code) // compose=true by default .bind_group_layout(bgl) .format(output_format) .build(); wgpuBindGroupLayoutRelease(bgl); return pipeline; } // Begin render pass with clear static WGPURenderPassEncoder begin_render_pass(WGPUCommandEncoder encoder, WGPUTextureView view) { const WGPURenderPassColorAttachment color_attachment = { .view = view, .depthSlice = WGPU_DEPTH_SLICE_UNDEFINED, .loadOp = WGPULoadOp_Clear, .storeOp = WGPUStoreOp_Store, .clearValue = {0.0f, 0.0f, 0.0f, 1.0f}, }; const WGPURenderPassDescriptor pass_desc = { .colorAttachmentCount = 1, .colorAttachments = &color_attachment, }; return wgpuCommandEncoderBeginRenderPass(encoder, &pass_desc); } // Save PNG output static bool save_png(const char* path, const std::vector& pixels, int width, int height) { // Convert BGRA → RGBA std::vector rgba(width * height * 4); for (int i = 0; i < width * height; ++i) { rgba[i * 4 + 0] = pixels[i * 4 + 2]; // R rgba[i * 4 + 1] = pixels[i * 4 + 1]; // G rgba[i * 4 + 2] = pixels[i * 4 + 0]; // B rgba[i * 4 + 3] = pixels[i * 4 + 3]; // A } if (!stbi_write_png(path, width, height, 4, rgba.data(), width * 4)) { fprintf(stderr, "Error: failed to write PNG '%s'\n", path); return false; } return true; } // Create horizontal grayscale composite of layer outputs // Each layer is already 4x wide (showing 4 channels), stack them vertically static bool save_layer_composite(const char* dir, int width, int height, int num_layers) { // Each layer PNG is already 4x wide with 4 channels side-by-side int layer_width = width * 4; // Load all layer images (they're already grayscale) std::vector> layers(num_layers); for (int i = 0; i < num_layers; ++i) { char path[512]; snprintf(path, sizeof(path), "%s/layer_%d.png", dir, i); int w, h, channels; uint8_t* data = stbi_load(path, &w, &h, &channels, 1); // Load as grayscale if (!data || w != layer_width || h != height) { if (data) stbi_image_free(data); fprintf(stderr, "Warning: failed to load layer %d for composite (expected %dx%d, got %dx%d)\n", i, layer_width, height, w, h); return false; } layers[i].assign(data, data + (layer_width * height)); stbi_image_free(data); } // Stack layers vertically int composite_height = height * num_layers; std::vector composite(layer_width * composite_height); for (int layer = 0; layer < num_layers; ++layer) { for (int y = 0; y < height; ++y) { int src_row_offset = y * layer_width; int dst_row_offset = (layer * height + y) * layer_width; memcpy(&composite[dst_row_offset], &layers[layer][src_row_offset], layer_width); } } // Save as grayscale PNG (stacked vertically) char composite_path[512]; snprintf(composite_path, sizeof(composite_path), "%s/layers_composite.png", dir); if (!stbi_write_png(composite_path, layer_width, composite_height, 1, composite.data(), layer_width)) { fprintf(stderr, "Error: failed to write composite PNG\n"); return false; } printf("Saved layer composite to '%s' (%dx%d, 4 layers stacked vertically)\n", composite_path, layer_width, composite_height); return true; } // Save PPM output (fallback) static bool save_ppm(const char* path, const std::vector& pixels, int width, int height) { FILE* f = fopen(path, "wb"); if (!f) { fprintf(stderr, "Error: failed to open '%s' for writing\n", path); return false; } fprintf(f, "P6\n%d %d\n255\n", width, height); for (int i = 0; i < width * height; ++i) { const uint8_t rgb[3] = {pixels[i * 4 + 2], // R pixels[i * 4 + 1], // G pixels[i * 4 + 0]}; // B fwrite(rgb, 1, 3, f); } fclose(f); return true; } // CNN v2 structures (matching CNNv2Effect) struct CNNv2LayerInfo { uint32_t kernel_size; uint32_t in_channels; uint32_t out_channels; uint32_t weight_offset; uint32_t weight_count; }; struct CNNv2LayerParams { uint32_t kernel_size; uint32_t in_channels; uint32_t out_channels; uint32_t weight_offset; uint32_t is_output_layer; float blend_amount; uint32_t is_layer_0; }; struct CNNv2StaticFeatureParams { uint32_t mip_level; uint32_t padding[3]; }; // Convert RGBA32Uint (packed f16) texture to BGRA8Unorm static std::vector readback_rgba32uint_to_bgra8( WGPUDevice device, WGPUQueue queue, WGPUTexture texture, int width, int height) { // Create staging buffer const uint32_t bytes_per_row = width * 16; // 4×u32 per pixel const uint32_t padded_bytes_per_row = (bytes_per_row + 255) & ~255; const size_t buffer_size = padded_bytes_per_row * height; WGPUBufferDescriptor buffer_desc = {}; buffer_desc.size = buffer_size; buffer_desc.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead; buffer_desc.mappedAtCreation = false; WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &buffer_desc); // Copy texture to buffer WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(device, nullptr); WGPUTexelCopyTextureInfo src = {}; src.texture = texture; src.mipLevel = 0; WGPUTexelCopyBufferInfo dst = {}; dst.buffer = staging; dst.layout.bytesPerRow = padded_bytes_per_row; dst.layout.rowsPerImage = height; WGPUExtent3D copy_size = { static_cast(width), static_cast(height), 1}; wgpuCommandEncoderCopyTextureToBuffer(encoder, &src, &dst, ©_size); WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr); wgpuQueueSubmit(queue, 1, &commands); wgpuCommandBufferRelease(commands); wgpuCommandEncoderRelease(encoder); // Wait for copy to complete wgpuDevicePoll(device, true, nullptr); // Map and read buffer struct MapState { bool done = false; }; MapState map_state; auto map_cb = [](WGPUMapAsyncStatus status, WGPUStringView message, void* userdata1, void* userdata2) { (void)message; (void)userdata2; MapState* state = (MapState*)userdata1; state->done = (status == WGPUMapAsyncStatus_Success); }; WGPUBufferMapCallbackInfo map_info = {}; map_info.mode = WGPUCallbackMode_AllowProcessEvents; map_info.callback = map_cb; map_info.userdata1 = &map_state; wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buffer_size, map_info); // Wait for mapping to complete for (int i = 0; i < 100 && !map_state.done; ++i) { wgpuDevicePoll(device, true, nullptr); } if (!map_state.done) { fprintf(stderr, "Error: Buffer mapping timed out\n"); wgpuBufferRelease(staging); return std::vector(); } const uint32_t* mapped = (const uint32_t*)wgpuBufferGetConstMappedRange(staging, 0, buffer_size); std::vector result(width * height * 4); // Unpack f16 to u8 (BGRA) for (int y = 0; y < height; ++y) { const uint32_t* row = (const uint32_t*)((const uint8_t*)mapped + y * padded_bytes_per_row); for (int x = 0; x < width; ++x) { // Read 4×u32 (8×f16) uint32_t data[4]; data[0] = row[x * 4 + 0]; data[1] = row[x * 4 + 1]; data[2] = row[x * 4 + 2]; data[3] = row[x * 4 + 3]; // Extract RGBA channels (first 4 f16 values) uint16_t r16 = data[0] & 0xFFFF; uint16_t g16 = (data[0] >> 16) & 0xFFFF; uint16_t b16 = data[1] & 0xFFFF; uint16_t a16 = (data[1] >> 16) & 0xFFFF; // Convert f16 to f32 (simple decode) auto f16_to_f32 = [](uint16_t h) -> float { uint32_t sign = (h >> 15) & 1; uint32_t exp = (h >> 10) & 0x1F; uint32_t frac = h & 0x3FF; if (exp == 0) { if (frac == 0) return sign ? -0.0f : 0.0f; // Denormal float val = frac / 1024.0f / 16384.0f; return sign ? -val : val; } if (exp == 31) { return frac ? NAN : (sign ? -INFINITY : INFINITY); } int32_t e = exp - 15; float val = (1.0f + frac / 1024.0f) * powf(2.0f, e); return sign ? -val : val; }; float r = f16_to_f32(r16); float g = f16_to_f32(g16); float b = f16_to_f32(b16); float a = f16_to_f32(a16); // Clamp to [0,1] and convert to u8 auto clamp_u8 = [](float v) -> uint8_t { if (v <= 0.0f) return 0; if (v >= 1.0f) return 255; return static_cast(v * 255.0f + 0.5f); }; result[(y * width + x) * 4 + 0] = clamp_u8(b); result[(y * width + x) * 4 + 1] = clamp_u8(g); result[(y * width + x) * 4 + 2] = clamp_u8(r); result[(y * width + x) * 4 + 3] = clamp_u8(a); } } wgpuBufferUnmap(staging); wgpuBufferRelease(staging); return result; } // Read RGBA32Uint and create 4x wide grayscale composite (each channel side-by-side) static std::vector readback_rgba32uint_to_composite( WGPUDevice device, WGPUQueue queue, WGPUTexture texture, int width, int height) { // First get BGRA8 data std::vector bgra = readback_rgba32uint_to_bgra8(device, queue, texture, width, height); if (bgra.empty()) return {}; // Create 4x wide grayscale image (one channel per horizontal strip) int composite_width = width * 4; std::vector composite(composite_width * height); for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { int src_idx = (y * width + x) * 4; uint8_t b = bgra[src_idx + 0]; uint8_t g = bgra[src_idx + 1]; uint8_t r = bgra[src_idx + 2]; uint8_t a = bgra[src_idx + 3]; // Convert each channel to grayscale luminance auto to_gray = [](uint8_t val) -> uint8_t { return val; }; // Place each channel in its horizontal strip composite[y * composite_width + (0 * width + x)] = to_gray(r); // Channel 0 composite[y * composite_width + (1 * width + x)] = to_gray(g); // Channel 1 composite[y * composite_width + (2 * width + x)] = to_gray(b); // Channel 2 composite[y * composite_width + (3 * width + x)] = to_gray(a); // Channel 3 } } return composite; } // Process image with CNN v2 static bool process_cnn_v2(WGPUDevice device, WGPUQueue queue, WGPUInstance instance, WGPUTexture input_texture, int width, int height, const Args& args) { printf("Using CNN v2 (storage buffer architecture)\n"); // Load weights (from file or asset system) size_t weights_size = 0; const uint8_t* weights_data = nullptr; std::vector file_weights; // For file-based loading if (args.weights_path) { // Load from file printf("Loading weights from '%s'...\n", args.weights_path); FILE* f = fopen(args.weights_path, "rb"); if (!f) { fprintf(stderr, "Error: failed to open weights file '%s'\n", args.weights_path); return false; } fseek(f, 0, SEEK_END); weights_size = ftell(f); fseek(f, 0, SEEK_SET); file_weights.resize(weights_size); size_t read = fread(file_weights.data(), 1, weights_size, f); fclose(f); if (read != weights_size) { fprintf(stderr, "Error: failed to read weights file\n"); return false; } weights_data = file_weights.data(); } else { // Load from asset system weights_data = (const uint8_t*)GetAsset(AssetId::ASSET_WEIGHTS_CNN_V2, &weights_size); } if (!weights_data || weights_size < 20) { fprintf(stderr, "Error: CNN v2 weights not available\n"); return false; } // Parse header const uint32_t* header = (const uint32_t*)weights_data; uint32_t magic = header[0]; uint32_t version = header[1]; uint32_t num_layers = header[2]; uint32_t total_weights = header[3]; if (magic != 0x324e4e43) { // 'CNN2' fprintf(stderr, "Error: Invalid CNN v2 weights magic\n"); return false; } uint32_t mip_level = 0; if (version == 2) { mip_level = header[4]; } printf("Loaded CNN v2 weights: %u layers, %u weights, version %u\n", num_layers, total_weights, version); // Parse layer info const uint32_t header_u32_count = (version == 1) ? 4 : 5; const uint32_t* layer_data = header + header_u32_count; std::vector layer_info; for (uint32_t i = 0; i < num_layers; ++i) { CNNv2LayerInfo info; info.kernel_size = layer_data[i * 5 + 0]; info.in_channels = layer_data[i * 5 + 1]; info.out_channels = layer_data[i * 5 + 2]; info.weight_offset = layer_data[i * 5 + 3]; info.weight_count = layer_data[i * 5 + 4]; layer_info.push_back(info); printf(" Layer %u: %ux%u conv, %u→%u channels, %u weights\n", i, info.kernel_size, info.kernel_size, info.in_channels, info.out_channels, info.weight_count); } // Create weights storage buffer (skip header + layer info, upload only weights) size_t header_size = 20; // 5 u32 size_t layer_info_size = 20 * layer_info.size(); // 5 u32 per layer size_t weights_offset = header_size + layer_info_size; size_t weights_only_size = weights_size - weights_offset; WGPUBufferDescriptor weights_buffer_desc = {}; weights_buffer_desc.size = weights_only_size; weights_buffer_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst; weights_buffer_desc.mappedAtCreation = false; WGPUBuffer weights_buffer = wgpuDeviceCreateBuffer(device, &weights_buffer_desc); wgpuQueueWriteBuffer(queue, weights_buffer, 0, weights_data + weights_offset, weights_only_size); // Create input view const WGPUTextureViewDescriptor view_desc = { .format = WGPUTextureFormat_BGRA8Unorm, .dimension = WGPUTextureViewDimension_2D, .baseMipLevel = 0, .mipLevelCount = 1, .baseArrayLayer = 0, .arrayLayerCount = 1, }; WGPUTextureView input_view = wgpuTextureCreateView(input_texture, &view_desc); // Create static features texture (RGBA32Uint) const WGPUTextureDescriptor static_desc = { .usage = WGPUTextureUsage_StorageBinding | WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopySrc, .dimension = WGPUTextureDimension_2D, .size = {static_cast(width), static_cast(height), 1}, .format = WGPUTextureFormat_RGBA32Uint, .mipLevelCount = 1, .sampleCount = 1, }; WGPUTexture static_features_tex = wgpuDeviceCreateTexture(device, &static_desc); WGPUTextureView static_features_view = wgpuTextureCreateView(static_features_tex, nullptr); // Load depth from input alpha channel (or 1.0 if no alpha) WGPUTexture depth_texture = load_depth_from_alpha(device, queue, args.input_path, width, height); if (!depth_texture) { wgpuTextureViewRelease(static_features_view); wgpuTextureRelease(static_features_tex); wgpuBufferRelease(weights_buffer); wgpuTextureViewRelease(input_view); return false; } WGPUTextureView depth_view = wgpuTextureCreateView(depth_texture, nullptr); // Create layer textures (ping-pong) WGPUTexture layer_textures[2] = { wgpuDeviceCreateTexture(device, &static_desc), wgpuDeviceCreateTexture(device, &static_desc), }; WGPUTextureView layer_views[2] = { wgpuTextureCreateView(layer_textures[0], nullptr), wgpuTextureCreateView(layer_textures[1], nullptr), }; // Load shaders const char* static_shader = SafeGetAsset(AssetId::ASSET_SHADER_CNN_V2_STATIC); const char* layer_shader = SafeGetAsset(AssetId::ASSET_SHADER_CNN_V2_COMPUTE); if (!static_shader[0] || !layer_shader[0]) { fprintf(stderr, "Error: CNN v2 shaders not available\n"); wgpuTextureViewRelease(static_features_view); wgpuTextureRelease(static_features_tex); wgpuTextureViewRelease(depth_view); wgpuTextureRelease(depth_texture); wgpuTextureViewRelease(layer_views[0]); wgpuTextureViewRelease(layer_views[1]); wgpuTextureRelease(layer_textures[0]); wgpuTextureRelease(layer_textures[1]); wgpuBufferRelease(weights_buffer); wgpuTextureViewRelease(input_view); return false; } // Create static feature params buffer WGPUBufferDescriptor static_params_desc = {}; static_params_desc.size = sizeof(CNNv2StaticFeatureParams); static_params_desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst; static_params_desc.mappedAtCreation = false; WGPUBuffer static_params_buffer = wgpuDeviceCreateBuffer(device, &static_params_desc); CNNv2StaticFeatureParams static_params; static_params.mip_level = mip_level; static_params.padding[0] = 0; static_params.padding[1] = 0; static_params.padding[2] = 0; wgpuQueueWriteBuffer(queue, static_params_buffer, 0, &static_params, sizeof(static_params)); // Create static features compute pipeline WGPUShaderSourceWGSL static_wgsl = {}; static_wgsl.chain.sType = WGPUSType_ShaderSourceWGSL; static_wgsl.code = str_view(static_shader); WGPUShaderModuleDescriptor static_module_desc = {}; static_module_desc.nextInChain = &static_wgsl.chain; WGPUShaderModule static_module = wgpuDeviceCreateShaderModule(device, &static_module_desc); // Bind group layout: 0=input, 1=input_mip1, 2=input_mip2, 3=depth, 4=output, // 5=params WGPUBindGroupLayoutEntry static_bgl_entries[6] = {}; static_bgl_entries[0].binding = 0; static_bgl_entries[0].visibility = WGPUShaderStage_Compute; static_bgl_entries[0].texture.sampleType = WGPUTextureSampleType_Float; static_bgl_entries[0].texture.viewDimension = WGPUTextureViewDimension_2D; static_bgl_entries[1].binding = 1; static_bgl_entries[1].visibility = WGPUShaderStage_Compute; static_bgl_entries[1].texture.sampleType = WGPUTextureSampleType_Float; static_bgl_entries[1].texture.viewDimension = WGPUTextureViewDimension_2D; static_bgl_entries[2].binding = 2; static_bgl_entries[2].visibility = WGPUShaderStage_Compute; static_bgl_entries[2].texture.sampleType = WGPUTextureSampleType_Float; static_bgl_entries[2].texture.viewDimension = WGPUTextureViewDimension_2D; static_bgl_entries[3].binding = 3; static_bgl_entries[3].visibility = WGPUShaderStage_Compute; static_bgl_entries[3].texture.sampleType = WGPUTextureSampleType_UnfilterableFloat; static_bgl_entries[3].texture.viewDimension = WGPUTextureViewDimension_2D; static_bgl_entries[4].binding = 4; static_bgl_entries[4].visibility = WGPUShaderStage_Compute; static_bgl_entries[4].storageTexture.access = WGPUStorageTextureAccess_WriteOnly; static_bgl_entries[4].storageTexture.format = WGPUTextureFormat_RGBA32Uint; static_bgl_entries[4].storageTexture.viewDimension = WGPUTextureViewDimension_2D; static_bgl_entries[5].binding = 5; static_bgl_entries[5].visibility = WGPUShaderStage_Compute; static_bgl_entries[5].buffer.type = WGPUBufferBindingType_Uniform; static_bgl_entries[5].buffer.minBindingSize = sizeof(CNNv2StaticFeatureParams); WGPUBindGroupLayoutDescriptor static_bgl_desc = {}; static_bgl_desc.entryCount = 6; static_bgl_desc.entries = static_bgl_entries; WGPUBindGroupLayout static_bgl = wgpuDeviceCreateBindGroupLayout(device, &static_bgl_desc); WGPUPipelineLayoutDescriptor static_pl_desc = {}; static_pl_desc.bindGroupLayoutCount = 1; static_pl_desc.bindGroupLayouts = &static_bgl; WGPUPipelineLayout static_pl = wgpuDeviceCreatePipelineLayout(device, &static_pl_desc); WGPUComputePipelineDescriptor static_pipeline_desc = {}; static_pipeline_desc.compute.module = static_module; static_pipeline_desc.compute.entryPoint = str_view("main"); static_pipeline_desc.layout = static_pl; WGPUComputePipeline static_pipeline = wgpuDeviceCreateComputePipeline(device, &static_pipeline_desc); wgpuShaderModuleRelease(static_module); wgpuPipelineLayoutRelease(static_pl); // Create static bind group (use input as all mips for simplicity) WGPUBindGroupEntry static_bg_entries[6] = {}; static_bg_entries[0].binding = 0; static_bg_entries[0].textureView = input_view; static_bg_entries[1].binding = 1; static_bg_entries[1].textureView = input_view; static_bg_entries[2].binding = 2; static_bg_entries[2].textureView = input_view; static_bg_entries[3].binding = 3; static_bg_entries[3].textureView = depth_view; // Depth from alpha channel (matches training) static_bg_entries[4].binding = 4; static_bg_entries[4].textureView = static_features_view; static_bg_entries[5].binding = 5; static_bg_entries[5].buffer = static_params_buffer; static_bg_entries[5].size = sizeof(CNNv2StaticFeatureParams); WGPUBindGroupDescriptor static_bg_desc = {}; static_bg_desc.layout = static_bgl; static_bg_desc.entryCount = 6; static_bg_desc.entries = static_bg_entries; WGPUBindGroup static_bg = wgpuDeviceCreateBindGroup(device, &static_bg_desc); wgpuBindGroupLayoutRelease(static_bgl); // Create layer compute pipeline WGPUShaderSourceWGSL layer_wgsl = {}; layer_wgsl.chain.sType = WGPUSType_ShaderSourceWGSL; layer_wgsl.code = str_view(layer_shader); WGPUShaderModuleDescriptor layer_module_desc = {}; layer_module_desc.nextInChain = &layer_wgsl.chain; WGPUShaderModule layer_module = wgpuDeviceCreateShaderModule(device, &layer_module_desc); // Layer bind group layout: // 0=static_features, 1=layer_input, 2=output, 3=weights, 4=params, // 5=original WGPUBindGroupLayoutEntry layer_bgl_entries[6] = {}; layer_bgl_entries[0].binding = 0; layer_bgl_entries[0].visibility = WGPUShaderStage_Compute; layer_bgl_entries[0].texture.sampleType = WGPUTextureSampleType_Uint; layer_bgl_entries[0].texture.viewDimension = WGPUTextureViewDimension_2D; layer_bgl_entries[1].binding = 1; layer_bgl_entries[1].visibility = WGPUShaderStage_Compute; layer_bgl_entries[1].texture.sampleType = WGPUTextureSampleType_Uint; layer_bgl_entries[1].texture.viewDimension = WGPUTextureViewDimension_2D; layer_bgl_entries[2].binding = 2; layer_bgl_entries[2].visibility = WGPUShaderStage_Compute; layer_bgl_entries[2].storageTexture.access = WGPUStorageTextureAccess_WriteOnly; layer_bgl_entries[2].storageTexture.format = WGPUTextureFormat_RGBA32Uint; layer_bgl_entries[2].storageTexture.viewDimension = WGPUTextureViewDimension_2D; layer_bgl_entries[3].binding = 3; layer_bgl_entries[3].visibility = WGPUShaderStage_Compute; layer_bgl_entries[3].buffer.type = WGPUBufferBindingType_ReadOnlyStorage; layer_bgl_entries[4].binding = 4; layer_bgl_entries[4].visibility = WGPUShaderStage_Compute; layer_bgl_entries[4].buffer.type = WGPUBufferBindingType_Uniform; layer_bgl_entries[4].buffer.minBindingSize = sizeof(CNNv2LayerParams); layer_bgl_entries[5].binding = 5; layer_bgl_entries[5].visibility = WGPUShaderStage_Compute; layer_bgl_entries[5].texture.sampleType = WGPUTextureSampleType_Float; layer_bgl_entries[5].texture.viewDimension = WGPUTextureViewDimension_2D; WGPUBindGroupLayoutDescriptor layer_bgl_desc = {}; layer_bgl_desc.entryCount = 6; layer_bgl_desc.entries = layer_bgl_entries; WGPUBindGroupLayout layer_bgl = wgpuDeviceCreateBindGroupLayout(device, &layer_bgl_desc); WGPUPipelineLayoutDescriptor layer_pl_desc = {}; layer_pl_desc.bindGroupLayoutCount = 1; layer_pl_desc.bindGroupLayouts = &layer_bgl; WGPUPipelineLayout layer_pl = wgpuDeviceCreatePipelineLayout(device, &layer_pl_desc); WGPUComputePipelineDescriptor layer_pipeline_desc = {}; layer_pipeline_desc.compute.module = layer_module; layer_pipeline_desc.compute.entryPoint = str_view("main"); layer_pipeline_desc.layout = layer_pl; WGPUComputePipeline layer_pipeline = wgpuDeviceCreateComputePipeline(device, &layer_pipeline_desc); wgpuShaderModuleRelease(layer_module); wgpuPipelineLayoutRelease(layer_pl); // Create layer params buffers std::vector layer_params_buffers; for (size_t i = 0; i < layer_info.size(); ++i) { WGPUBufferDescriptor params_desc = {}; params_desc.size = sizeof(CNNv2LayerParams); params_desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst; params_desc.mappedAtCreation = false; WGPUBuffer buf = wgpuDeviceCreateBuffer(device, ¶ms_desc); layer_params_buffers.push_back(buf); } // Execute compute passes WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(device, nullptr); // Pass 1: Static features printf("Computing static features...\n"); WGPUComputePassEncoder static_pass = wgpuCommandEncoderBeginComputePass(encoder, nullptr); wgpuComputePassEncoderSetPipeline(static_pass, static_pipeline); wgpuComputePassEncoderSetBindGroup(static_pass, 0, static_bg, 0, nullptr); uint32_t workgroups_x = (width + 7) / 8; uint32_t workgroups_y = (height + 7) / 8; wgpuComputePassEncoderDispatchWorkgroups(static_pass, workgroups_x, workgroups_y, 1); wgpuComputePassEncoderEnd(static_pass); wgpuComputePassEncoderRelease(static_pass); // Save static features if requested if (args.save_intermediates) { // Submit and wait for static features to complete WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, nullptr); wgpuQueueSubmit(queue, 1, &cmd); wgpuCommandBufferRelease(cmd); wgpuDevicePoll(device, true, nullptr); // Create new encoder for layers encoder = wgpuDeviceCreateCommandEncoder(device, nullptr); char layer_path[512]; snprintf(layer_path, sizeof(layer_path), "%s/static_features.png", args.save_intermediates); printf("Saving static features to '%s'...\n", layer_path); // Read back RGBA32Uint and create 8-channel grayscale composite // Static features has 8 channels (packed as 4×u32), create 8x wide composite std::vector bgra = readback_rgba32uint_to_bgra8( device, queue, static_features_tex, width, height); if (!bgra.empty()) { // Static features: 8 f16 values packed in 4×u32 // For now, just show first 4 channels (like layers) // TODO: Show all 8 channels in 8x wide composite std::vector composite = readback_rgba32uint_to_composite( device, queue, static_features_tex, width, height); if (!composite.empty()) { int composite_width = width * 4; if (!stbi_write_png(layer_path, composite_width, height, 1, composite.data(), composite_width)) { fprintf(stderr, "Error: failed to write static features PNG\n"); } } } } // Pass 2-N: CNN layers for (size_t i = 0; i < layer_info.size(); ++i) { const CNNv2LayerInfo& info = layer_info[i]; printf("Processing layer %zu/%zu (%ux%u, %u→%u channels)...\n", i + 1, layer_info.size(), info.kernel_size, info.kernel_size, info.in_channels, info.out_channels); // Update layer params CNNv2LayerParams params; params.kernel_size = info.kernel_size; params.in_channels = info.in_channels; params.out_channels = info.out_channels; params.weight_offset = info.weight_offset; params.is_output_layer = (i == layer_info.size() - 1) ? 1 : 0; params.blend_amount = args.blend; params.is_layer_0 = (i == 0) ? 1 : 0; wgpuQueueWriteBuffer(queue, layer_params_buffers[i], 0, ¶ms, sizeof(params)); // Create bind group for this layer WGPUBindGroupEntry layer_bg_entries[6] = {}; layer_bg_entries[0].binding = 0; layer_bg_entries[0].textureView = static_features_view; layer_bg_entries[1].binding = 1; layer_bg_entries[1].textureView = (i == 0) ? static_features_view : layer_views[i % 2]; layer_bg_entries[2].binding = 2; layer_bg_entries[2].textureView = layer_views[(i + 1) % 2]; layer_bg_entries[3].binding = 3; layer_bg_entries[3].buffer = weights_buffer; layer_bg_entries[3].size = weights_only_size; layer_bg_entries[4].binding = 4; layer_bg_entries[4].buffer = layer_params_buffers[i]; layer_bg_entries[4].size = sizeof(CNNv2LayerParams); layer_bg_entries[5].binding = 5; layer_bg_entries[5].textureView = input_view; WGPUBindGroupDescriptor layer_bg_desc = {}; layer_bg_desc.layout = layer_bgl; layer_bg_desc.entryCount = 6; layer_bg_desc.entries = layer_bg_entries; WGPUBindGroup layer_bg = wgpuDeviceCreateBindGroup(device, &layer_bg_desc); WGPUComputePassEncoder layer_pass = wgpuCommandEncoderBeginComputePass(encoder, nullptr); wgpuComputePassEncoderSetPipeline(layer_pass, layer_pipeline); wgpuComputePassEncoderSetBindGroup(layer_pass, 0, layer_bg, 0, nullptr); wgpuComputePassEncoderDispatchWorkgroups(layer_pass, workgroups_x, workgroups_y, 1); wgpuComputePassEncoderEnd(layer_pass); wgpuComputePassEncoderRelease(layer_pass); wgpuBindGroupRelease(layer_bg); // Save intermediate layer if requested if (args.save_intermediates) { // Submit and wait for layer to complete WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, nullptr); wgpuQueueSubmit(queue, 1, &cmd); wgpuCommandBufferRelease(cmd); wgpuDevicePoll(device, true, nullptr); // Create new encoder for next layer encoder = wgpuDeviceCreateCommandEncoder(device, nullptr); char layer_path[512]; snprintf(layer_path, sizeof(layer_path), "%s/layer_%zu.png", args.save_intermediates, i); printf("Saving intermediate layer %zu to '%s'...\n", i, layer_path); // Read back RGBA32Uint and create 4-channel grayscale composite WGPUTexture output_tex = layer_textures[(i + 1) % 2]; std::vector composite = readback_rgba32uint_to_composite( device, queue, output_tex, width, height); if (!composite.empty()) { int composite_width = width * 4; if (!stbi_write_png(layer_path, composite_width, height, 1, composite.data(), composite_width)) { fprintf(stderr, "Error: failed to write layer PNG\n"); } } } } WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr); wgpuQueueSubmit(queue, 1, &commands); wgpuCommandBufferRelease(commands); wgpuCommandEncoderRelease(encoder); wgpuDevicePoll(device, true, nullptr); // Create layer composite if intermediates were saved if (args.save_intermediates) { save_layer_composite(args.save_intermediates, width, height, layer_info.size()); } // Readback final result (from last layer's output texture) printf("Reading pixels from GPU...\n"); size_t final_layer_idx = (layer_info.size()) % 2; std::vector pixels = readback_rgba32uint_to_bgra8( device, queue, layer_textures[final_layer_idx], width, height); if (pixels.empty()) { fprintf(stderr, "Error: GPU readback failed\n"); for (auto buf : layer_params_buffers) wgpuBufferRelease(buf); wgpuComputePipelineRelease(layer_pipeline); wgpuBindGroupLayoutRelease(layer_bgl); wgpuBindGroupRelease(static_bg); wgpuComputePipelineRelease(static_pipeline); wgpuBufferRelease(static_params_buffer); wgpuTextureViewRelease(static_features_view); wgpuTextureRelease(static_features_tex); wgpuTextureViewRelease(depth_view); wgpuTextureRelease(depth_texture); wgpuTextureViewRelease(layer_views[0]); wgpuTextureViewRelease(layer_views[1]); wgpuTextureRelease(layer_textures[0]); wgpuTextureRelease(layer_textures[1]); wgpuBufferRelease(weights_buffer); wgpuTextureViewRelease(input_view); return false; } // Debug hex dump if (args.debug_hex) { printf("First 8 pixels (BGRA hex):\n"); for (int i = 0; i < 8 && i < width * height; ++i) { const uint8_t b = pixels[i * 4 + 0]; const uint8_t g = pixels[i * 4 + 1]; const uint8_t r = pixels[i * 4 + 2]; const uint8_t a = pixels[i * 4 + 3]; printf(" [%d] 0x%02X%02X%02X%02X (RGBA)\n", i, r, g, b, a); } } // Save output bool success; if (args.output_png) { printf("Saving PNG to '%s'...\n", args.output_path); success = save_png(args.output_path, pixels, width, height); } else { printf("Saving PPM to '%s'...\n", args.output_path); success = save_ppm(args.output_path, pixels, width, height); } if (success) { printf("Done! Output saved to '%s'\n", args.output_path); } // Cleanup for (auto buf : layer_params_buffers) wgpuBufferRelease(buf); wgpuComputePipelineRelease(layer_pipeline); wgpuBindGroupLayoutRelease(layer_bgl); wgpuBindGroupRelease(static_bg); wgpuComputePipelineRelease(static_pipeline); wgpuBufferRelease(static_params_buffer); wgpuTextureViewRelease(static_features_view); wgpuTextureRelease(static_features_tex); wgpuTextureViewRelease(layer_views[0]); wgpuTextureViewRelease(layer_views[1]); wgpuTextureRelease(layer_textures[0]); wgpuTextureRelease(layer_textures[1]); wgpuBufferRelease(weights_buffer); wgpuTextureViewRelease(input_view); return success; } int main(int argc, char** argv) { // Parse arguments Args args; if (!parse_args(argc, argv, &args)) { print_usage(argv[0]); return 1; } // Initialize shader composer (required for #include resolution) InitShaderComposer(); // Initialize WebGPU WebGPUTestFixture fixture; if (!fixture.init()) { fprintf(stderr, "Error: GPU unavailable\n"); return 1; } GpuContext ctx = fixture.ctx(); WGPUDevice device = ctx.device; WGPUQueue queue = ctx.queue; WGPUInstance instance = fixture.instance(); // Load input texture int width, height; WGPUTexture input_texture = load_texture(device, queue, args.input_path, &width, &height); if (!input_texture) { SamplerCache::Get().clear(); fixture.shutdown(); return 1; } printf("Loaded %dx%d image from '%s'\n", width, height, args.input_path); // Branch based on CNN version if (args.cnn_version == 2) { bool success = process_cnn_v2(device, queue, instance, input_texture, width, height, args); wgpuTextureRelease(input_texture); SamplerCache::Get().clear(); fixture.shutdown(); return success ? 0 : 1; } // CNN v1 processing below printf("Using CNN v1 (render pipeline architecture)\n"); // Create input texture view const WGPUTextureViewDescriptor view_desc = { .format = WGPUTextureFormat_BGRA8Unorm, .dimension = WGPUTextureViewDimension_2D, .baseMipLevel = 0, .mipLevelCount = 1, .baseArrayLayer = 0, .arrayLayerCount = 1, }; WGPUTextureView input_view = wgpuTextureCreateView(input_texture, &view_desc); WGPUTextureView original_view = input_view; // Keep reference to original // Create CNN pipelines (different formats for intermediate vs final) WGPURenderPipeline pipeline_intermediate = create_cnn_pipeline(device, WGPUTextureFormat_RGBA16Float, false); WGPURenderPipeline pipeline_final = create_cnn_pipeline(device, WGPUTextureFormat_BGRA8Unorm, true); if (!pipeline_intermediate || !pipeline_final) { fprintf(stderr, "Error: failed to create CNN pipelines\n"); if (pipeline_intermediate) wgpuRenderPipelineRelease(pipeline_intermediate); if (pipeline_final) wgpuRenderPipelineRelease(pipeline_final); wgpuTextureViewRelease(input_view); wgpuTextureRelease(input_texture); SamplerCache::Get().clear(); fixture.shutdown(); return 1; } // Get bind group layout from intermediate pipeline (same for both) WGPUBindGroupLayout bgl = wgpuRenderPipelineGetBindGroupLayout(pipeline_intermediate, 0); // Create uniform buffers const WGPUBufferDescriptor common_uniform_desc = { .usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst, .size = sizeof(CommonPostProcessUniforms), }; WGPUBuffer common_uniform_buffer = wgpuDeviceCreateBuffer(device, &common_uniform_desc); const WGPUBufferDescriptor layer_params_desc = { .usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst, .size = sizeof(CNNLayerParams), }; WGPUBuffer layer_params_buffer = wgpuDeviceCreateBuffer(device, &layer_params_desc); // Create intermediate textures for ping-pong (2 textures) // Use RGBA16Float to preserve [-1,1] range from tanh activation const WGPUTextureDescriptor intermediate_desc = { .usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_RenderAttachment | WGPUTextureUsage_CopySrc, .dimension = WGPUTextureDimension_2D, .size = {static_cast(width), static_cast(height), 1}, .format = WGPUTextureFormat_RGBA16Float, .mipLevelCount = 1, .sampleCount = 1, }; WGPUTexture intermediate_textures[2] = { wgpuDeviceCreateTexture(device, &intermediate_desc), wgpuDeviceCreateTexture(device, &intermediate_desc), }; // Create views for intermediate textures (RGBA16Float) const WGPUTextureViewDescriptor intermediate_view_desc = { .format = WGPUTextureFormat_RGBA16Float, .dimension = WGPUTextureViewDimension_2D, .baseMipLevel = 0, .mipLevelCount = 1, .baseArrayLayer = 0, .arrayLayerCount = 1, }; WGPUTextureView intermediate_views[2] = { wgpuTextureCreateView(intermediate_textures[0], &intermediate_view_desc), wgpuTextureCreateView(intermediate_textures[1], &intermediate_view_desc), }; // Get sampler WGPUSampler sampler = SamplerCache::Get().get_or_create(device, SamplerCache::clamp()); // Multi-layer processing const int NUM_LAYERS = args.num_layers; int dst_idx = 0; // Index of texture to render to // First layer reads from input, subsequent layers read from previous output WGPUTextureView current_input = input_view; for (int layer = 0; layer < NUM_LAYERS; ++layer) { printf("Processing layer %d/%d...\n", layer + 1, NUM_LAYERS); // Update uniforms CommonPostProcessUniforms common_u = { .resolution = {static_cast(width), static_cast(height)}, .aspect_ratio = static_cast(width) / static_cast(height), .time = 0.0f, .beat_time = 0.0f, .beat_phase = 0.0f, .audio_intensity = 0.0f, ._pad = 0.0f, }; wgpuQueueWriteBuffer(queue, common_uniform_buffer, 0, &common_u, sizeof(common_u)); CNNLayerParams layer_params = { .layer_index = layer, .blend_amount = (layer == NUM_LAYERS - 1) ? args.blend : 1.0f, // Only final layer ._pad = {0.0f, 0.0f}, }; wgpuQueueWriteBuffer(queue, layer_params_buffer, 0, &layer_params, sizeof(layer_params)); // Build bind group WGPUBindGroup bind_group = BindGroupBuilder() .sampler(0, sampler) .texture(1, current_input) .buffer(2, common_uniform_buffer, sizeof(CommonPostProcessUniforms)) .buffer(3, layer_params_buffer, sizeof(CNNLayerParams)) .texture(4, original_view) .build(device, bgl); // Render to appropriate output texture with correct pipeline bool is_final = (layer == NUM_LAYERS - 1); if (is_final) { // Final layer: use OffscreenRenderTarget (known working readback) OffscreenRenderTarget rt(instance, device, width, height); WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(device, nullptr); WGPURenderPassEncoder pass = begin_render_pass(encoder, rt.view()); wgpuRenderPassEncoderSetPipeline(pass, pipeline_final); wgpuRenderPassEncoderSetBindGroup(pass, 0, bind_group, 0, nullptr); wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0); wgpuRenderPassEncoderEnd(pass); WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr); wgpuQueueSubmit(queue, 1, &commands); wgpuDevicePoll(device, true, nullptr); wgpuCommandBufferRelease(commands); wgpuRenderPassEncoderRelease(pass); wgpuCommandEncoderRelease(encoder); wgpuBindGroupRelease(bind_group); // Read pixels immediately printf("Reading pixels from GPU...\n"); std::vector pixels = rt.read_pixels(); // Debug: print first 8 pixels as hex if (args.debug_hex && !pixels.empty()) { printf("First 8 pixels (BGRA hex):\n"); for (int i = 0; i < 8 && i < width * height; ++i) { const uint8_t b = pixels[i * 4 + 0]; const uint8_t g = pixels[i * 4 + 1]; const uint8_t r = pixels[i * 4 + 2]; const uint8_t a = pixels[i * 4 + 3]; printf(" [%d] 0x%02X%02X%02X%02X (RGBA)\n", i, r, g, b, a); } } if (pixels.empty()) { fprintf(stderr, "Error: GPU readback failed\n"); wgpuTextureViewRelease(intermediate_views[0]); wgpuTextureViewRelease(intermediate_views[1]); wgpuTextureRelease(intermediate_textures[0]); wgpuTextureRelease(intermediate_textures[1]); wgpuTextureViewRelease(input_view); wgpuTextureRelease(input_texture); wgpuBufferRelease(layer_params_buffer); wgpuBufferRelease(common_uniform_buffer); wgpuBindGroupLayoutRelease(bgl); wgpuRenderPipelineRelease(pipeline_final); wgpuRenderPipelineRelease(pipeline_intermediate); SamplerCache::Get().clear(); fixture.shutdown(); return 1; } // Save output bool success; if (args.output_png) { printf("Saving PNG to '%s'...\n", args.output_path); success = save_png(args.output_path, pixels, width, height); } else { printf("Saving PPM to '%s'...\n", args.output_path); success = save_ppm(args.output_path, pixels, width, height); } if (!success) { wgpuTextureViewRelease(intermediate_views[0]); wgpuTextureViewRelease(intermediate_views[1]); wgpuTextureRelease(intermediate_textures[0]); wgpuTextureRelease(intermediate_textures[1]); wgpuTextureViewRelease(input_view); wgpuTextureRelease(input_texture); wgpuBufferRelease(layer_params_buffer); wgpuBufferRelease(common_uniform_buffer); wgpuBindGroupLayoutRelease(bgl); wgpuRenderPipelineRelease(pipeline_final); wgpuRenderPipelineRelease(pipeline_intermediate); SamplerCache::Get().clear(); fixture.shutdown(); return 1; } printf("Done! Output saved to '%s'\n", args.output_path); break; // Exit loop after final layer } else { // Intermediate layers: render to ping-pong textures WGPUTextureView output_view = intermediate_views[dst_idx]; WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(device, nullptr); WGPURenderPassEncoder pass = begin_render_pass(encoder, output_view); wgpuRenderPassEncoderSetPipeline(pass, pipeline_intermediate); wgpuRenderPassEncoderSetBindGroup(pass, 0, bind_group, 0, nullptr); wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0); wgpuRenderPassEncoderEnd(pass); WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr); wgpuQueueSubmit(queue, 1, &commands); wgpuDevicePoll(device, true, nullptr); wgpuCommandBufferRelease(commands); wgpuRenderPassEncoderRelease(pass); wgpuCommandEncoderRelease(encoder); wgpuBindGroupRelease(bind_group); // Save intermediate layer if requested if (args.save_intermediates) { char layer_path[512]; snprintf(layer_path, sizeof(layer_path), "%s/layer_%d.png", args.save_intermediates, layer); printf("Saving intermediate layer %d to '%s'...\n", layer, layer_path); // Readback RGBA16Float texture std::vector pixels = texture_readback_fp16_to_u8( device, queue, intermediate_textures[dst_idx], width, height); // Debug: print first 8 pixels as hex if (args.debug_hex && !pixels.empty()) { printf("Layer %d first 8 pixels (BGRA hex):\n", layer); for (int i = 0; i < 8 && i < width * height; ++i) { const uint8_t b = pixels[i * 4 + 0]; const uint8_t g = pixels[i * 4 + 1]; const uint8_t r = pixels[i * 4 + 2]; const uint8_t a = pixels[i * 4 + 3]; printf(" [%d] 0x%02X%02X%02X%02X (RGBA)\n", i, r, g, b, a); } } if (!pixels.empty()) { save_png(layer_path, pixels, width, height); } else { fprintf(stderr, "Warning: failed to read intermediate layer %d\n", layer); } } } // Update for next layer: output becomes input if (layer < NUM_LAYERS - 1) { // Use this layer's output as next layer's input current_input = intermediate_views[dst_idx]; dst_idx = 1 - dst_idx; // Flip ping-pong for next render } } // Wait for all GPU work to complete before cleanup wgpuDevicePoll(device, true, nullptr); // Cleanup wgpuTextureViewRelease(intermediate_views[0]); wgpuTextureViewRelease(intermediate_views[1]); wgpuTextureRelease(intermediate_textures[0]); wgpuTextureRelease(intermediate_textures[1]); wgpuBufferRelease(layer_params_buffer); wgpuBufferRelease(common_uniform_buffer); wgpuBindGroupLayoutRelease(bgl); wgpuRenderPipelineRelease(pipeline_intermediate); wgpuRenderPipelineRelease(pipeline_final); wgpuTextureViewRelease(input_view); wgpuTextureRelease(input_texture); SamplerCache::Get().clear(); fixture.shutdown(); return 0; }