diff options
Diffstat (limited to 'tools/cnn_test.cc')
| -rw-r--r-- | tools/cnn_test.cc | 1936 |
1 files changed, 509 insertions, 1427 deletions
diff --git a/tools/cnn_test.cc b/tools/cnn_test.cc index e5e2d26..beeef8f 100644 --- a/tools/cnn_test.cc +++ b/tools/cnn_test.cc @@ -1,21 +1,16 @@ -// CNN shader testing tool for offline validation -// Tests trained CNN shaders on input PNG with GPU readback +// CNN v3 shader testing tool — offline WGSL inference for Python parity checks. +// Loads an input PNG (or sample directory), packs 20-channel features, runs the +// CNNv3Effect (5 compute passes), and saves the RGBA16Float output as PNG. #if defined(STRIP_ALL) #error "cnn_test requires STRIP_ALL=OFF (tool builds only)" #endif -#include "effects/shaders.h" +#include "cnn_v3_effect.h" #include "generated/assets.h" -#include "gpu/bind_group_builder.h" #include "gpu/gpu.h" -#include "gpu/pipeline_builder.h" -#include "gpu/post_process_helper.h" -#include "gpu/sampler_cache.h" +#include "gpu/sequence.h" #include "gpu/shader_composer.h" -#include "gpu/texture_readback.h" -#include "platform/platform.h" -#include "tests/common/offscreen_render_target.h" #include "tests/common/webgpu_test_fixture.h" #include "util/asset_manager.h" #include "util/mini_math.h" @@ -27,1551 +22,638 @@ #include <cstdio> #include <cstdlib> #include <cstring> +#include <string> #include <vector> -// CNN v1 structures -struct CNNv1LayerParams { - int layer_index; - float blend_amount; - float _pad[2]; -}; -static_assert(sizeof(CNNv1LayerParams) == 16); +// --------------------------------------------------------------------------- +// F16 / pack helpers (match WGSL pack2x16float / pack4x8unorm) +// --------------------------------------------------------------------------- -// Helper to get asset string or empty string -static const char* SafeGetAsset(AssetId id) { - const uint8_t* data = GetAsset(id); - return data ? (const char*)data : ""; +static uint16_t f32_to_f16(float f) { + uint32_t b; + memcpy(&b, &f, 4); + uint32_t sign = (b >> 16) & 0x8000u; + int32_t exp = (int32_t)((b >> 23) & 0xFFu) - 127 + 15; + uint32_t mant = b & 0x7FFFFFu; + if (exp <= 0) return (uint16_t)sign; + if (exp >= 31) return (uint16_t)(sign | 0x7C00u); + return (uint16_t)(sign | ((uint32_t)exp << 10) | (mant >> 13)); } -// Command-line arguments -struct Args { - const char* input_path = nullptr; - const char* output_path = nullptr; - float blend = 1.0f; - bool output_png = true; // Default to PNG - const char* save_intermediates = nullptr; - int num_layers = 3; // Default to 3 layers - bool debug_hex = false; // Print first 8 pixels as hex - int cnn_version = 1; // 1=CNNEffect, 2=CNNv2Effect - const char* weights_path = nullptr; // Optional .bin weights file - bool cnn_version_explicit = - false; // Track if --cnn-version was explicitly set -}; - -// Parse command-line arguments -static bool parse_args(int argc, char** argv, Args* args) { - if (argc < 3) { - return false; - } - - args->input_path = argv[1]; - args->output_path = argv[2]; - - for (int i = 3; i < argc; ++i) { - if (strcmp(argv[i], "--blend") == 0 && i + 1 < argc) { - args->blend = atof(argv[++i]); - if (args->blend < 0.0f || args->blend > 1.0f) { - fprintf(stderr, "Error: blend must be in range [0.0, 1.0]\n"); - return false; - } - } else if (strcmp(argv[i], "--format") == 0 && i + 1 < argc) { - ++i; - if (strcmp(argv[i], "ppm") == 0) { - args->output_png = false; - } else if (strcmp(argv[i], "png") == 0) { - args->output_png = true; - } else { - fprintf(stderr, "Error: unknown format '%s' (use 'png' or 'ppm')\n", - argv[i]); - return false; - } - } else if (strcmp(argv[i], "--save-intermediates") == 0 && i + 1 < argc) { - args->save_intermediates = argv[++i]; - } else if (strcmp(argv[i], "--layers") == 0 && i + 1 < argc) { - args->num_layers = atoi(argv[++i]); - if (args->num_layers < 1 || args->num_layers > 10) { - fprintf(stderr, "Error: layers must be in range [1, 10]\n"); - return false; - } - } else if (strcmp(argv[i], "--debug-hex") == 0) { - args->debug_hex = true; - } else if (strcmp(argv[i], "--cnn-version") == 0 && i + 1 < argc) { - args->cnn_version = atoi(argv[++i]); - args->cnn_version_explicit = true; - if (args->cnn_version < 1 || args->cnn_version > 2) { - fprintf(stderr, "Error: cnn-version must be 1 or 2\n"); - return false; - } - } else if (strcmp(argv[i], "--weights") == 0 && i + 1 < argc) { - args->weights_path = argv[++i]; - } else if (strcmp(argv[i], "--help") == 0) { - return false; - } else { - fprintf(stderr, "Error: unknown option '%s'\n", argv[i]); - return false; - } - } - - // Force CNN v2 when --weights is specified - if (args->weights_path) { - if (args->cnn_version_explicit && args->cnn_version != 2) { - fprintf(stderr, - "WARNING: --cnn-version %d ignored (--weights forces CNN v2)\n", - args->cnn_version); - } - args->cnn_version = 2; - - // Warn if --layers was specified (binary file config takes precedence) - if (args->num_layers != 3) { // 3 is the default - fprintf(stderr, - "WARNING: --layers %d ignored (--weights loads layer config from " - ".bin)\n", - args->num_layers); - } - } - - return true; +// Low 16 bits = a, high 16 bits = b (matches WGSL pack2x16float(vec2f(a,b))) +static uint32_t pack2x16f(float a, float b) { + return (uint32_t)f32_to_f16(a) | ((uint32_t)f32_to_f16(b) << 16); } -// Print usage -static void print_usage(const char* prog) { - fprintf(stderr, "Usage: %s input.png output.png [OPTIONS]\n", prog); - fprintf(stderr, "\nOPTIONS:\n"); - fprintf(stderr, - " --blend F Final blend amount (0.0-1.0, default: " - "1.0)\n"); - fprintf(stderr, " --format ppm|png Output format (default: png)\n"); - fprintf(stderr, - " --layers N Number of CNN layers (1-10, default: 3, " - "ignored with --weights)\n"); - fprintf(stderr, - " --save-intermediates DIR Save intermediate layers to directory\n"); - fprintf(stderr, - " --debug-hex Print first 8 pixels as hex (debug)\n"); - fprintf(stderr, - " --cnn-version N CNN version: 1 (default) or 2 (ignored " - "with --weights)\n"); - fprintf(stderr, - " --weights PATH Load weights from .bin (forces CNN v2, " - "overrides layer config)\n"); - fprintf(stderr, " --help Show this help\n"); +// RGBA as u8 packed into u32 (matches WGSL pack4x8unorm) +static uint32_t pack4x8u(float a, float b, float c, float d) { + auto u8 = [](float v) -> uint32_t { + int i = (int)(v * 255.0f + 0.5f); + if (i < 0) i = 0; + if (i > 255) i = 255; + return (uint32_t)i; + }; + return u8(a) | (u8(b) << 8) | (u8(c) << 16) | (u8(d) << 24); } -// Load PNG and upload to GPU texture -static WGPUTexture load_texture(WGPUDevice device, WGPUQueue queue, - const char* path, int* out_width, - int* out_height) { - int width, height, channels; - uint8_t* data = stbi_load(path, &width, &height, &channels, 4); - if (!data) { - fprintf(stderr, "Error: failed to load image '%s'\n", path); - return nullptr; - } - - *out_width = width; - *out_height = height; +// --------------------------------------------------------------------------- +// Oct-decode [0,1] → unit normal (matches Python cnn_v3_utils.oct_decode) +// --------------------------------------------------------------------------- - // Create texture - const WGPUTextureDescriptor texture_desc = { - .usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst | - WGPUTextureUsage_RenderAttachment, - .dimension = WGPUTextureDimension_2D, - .size = {(uint32_t)(width), (uint32_t)(height), 1}, - .format = WGPUTextureFormat_BGRA8Unorm, - .mipLevelCount = 1, - .sampleCount = 1, - }; - WGPUTexture texture = wgpuDeviceCreateTexture(device, &texture_desc); - if (!texture) { - fprintf(stderr, "Error: failed to create texture\n"); - stbi_image_free(data); - return nullptr; +static void oct_decode_01(float nx01, float ny01, + float* out_x, float* out_y, float* out_z) { + float fx = nx01 * 2.0f - 1.0f; + float fy = ny01 * 2.0f - 1.0f; + float fz = 1.0f - fabsf(fx) - fabsf(fy); + if (fz < 0.0f) { + float sx = fx >= 0.0f ? 1.0f : -1.0f; + float sy = fy >= 0.0f ? 1.0f : -1.0f; + fx = (1.0f - fabsf(fy)) * sx; + fy = (1.0f - fabsf(fx)) * sy; } + float len = sqrtf(fx*fx + fy*fy + fz*fz); + if (len < 1e-8f) len = 1e-8f; + *out_x = fx / len; + *out_y = fy / len; + *out_z = fz / len; +} - // Convert RGBA → BGRA - std::vector<uint8_t> bgra_data(width * height * 4); - for (int i = 0; i < width * height; ++i) { - bgra_data[i * 4 + 0] = data[i * 4 + 2]; // B - bgra_data[i * 4 + 1] = data[i * 4 + 1]; // G - bgra_data[i * 4 + 2] = data[i * 4 + 0]; // R - bgra_data[i * 4 + 3] = data[i * 4 + 3]; // A - } +// --------------------------------------------------------------------------- +// Mip helpers — matching Python pyrdown + nearest-upsample +// --------------------------------------------------------------------------- - // Upload to GPU - const WGPUTexelCopyTextureInfo dst = {.texture = texture, .mipLevel = 0}; - const WGPUTexelCopyBufferLayout layout = { - .bytesPerRow = (uint32_t)(width * 4), .rowsPerImage = (uint32_t)(height)}; - const WGPUExtent3D size = {(uint32_t)(width), (uint32_t)(height), 1}; - wgpuQueueWriteTexture(queue, &dst, bgra_data.data(), bgra_data.size(), - &layout, &size); +// Compute mip1 and mip2 for each pixel using the Python convention: +// mip1_small[y2][x2] = avg(rgb[2y2..2y2+1][2x2..2x2+1]) (half-res) +// mip2_small[y4][x4] = avg(mip1[2y4..2y4+1][2x4..2x4+1]) (quarter-res) +// Nearest upsample: mip1[y][x] = mip1_small[y/2][x/2], etc. +// Output: mip1_out and mip2_out are (H*W*3) float arrays in row-major order. - stbi_image_free(data); - return texture; -} +static void compute_mips(const float* rgb, int w, int h, + std::vector<float>& mip1_out, + std::vector<float>& mip2_out) { + const int w2 = w / 2, h2 = h / 2; + const int w4 = w / 4, h4 = h / 4; -// Load PNG alpha channel as depth texture (or 1.0 if no alpha) -static WGPUTexture load_depth_from_alpha(WGPUDevice device, WGPUQueue queue, - const char* path, int width, - int height) { - int w, h, channels; - uint8_t* data = stbi_load(path, &w, &h, &channels, 4); - if (!data || w != width || h != height) { - fprintf(stderr, "Error: failed to load depth from '%s'\n", path); - if (data) - stbi_image_free(data); - return nullptr; + std::vector<float> m1(w2 * h2 * 3); + for (int y2 = 0; y2 < h2; ++y2) { + for (int x2 = 0; x2 < w2; ++x2) { + for (int c = 0; c < 3; ++c) { + int y0 = y2 * 2, x0 = x2 * 2; + float v = rgb[(y0 * w + x0 ) * 3 + c] + + rgb[(y0 * w + x0+1) * 3 + c] + + rgb[((y0+1) * w + x0 ) * 3 + c] + + rgb[((y0+1) * w + x0+1) * 3 + c]; + m1[(y2 * w2 + x2) * 3 + c] = v * 0.25f; + } + } } - // Extract alpha channel (or use 1.0 if original was RGB) - std::vector<float> depth_data(width * height); - bool has_alpha = (channels == 4); - for (int i = 0; i < width * height; ++i) { - // Alpha is in data[i*4+3] (0-255), convert to float [0, 1] - // If no alpha channel, default to 1.0 (far plane) - depth_data[i] = has_alpha ? (data[i * 4 + 3] / 255.0f) : 1.0f; + std::vector<float> m2(w4 * h4 * 3); + for (int y4 = 0; y4 < h4; ++y4) { + for (int x4 = 0; x4 < w4; ++x4) { + for (int c = 0; c < 3; ++c) { + int y0 = y4 * 2, x0 = x4 * 2; + float v = m1[(y0 * w2 + x0 ) * 3 + c] + + m1[(y0 * w2 + x0+1) * 3 + c] + + m1[((y0+1) * w2 + x0 ) * 3 + c] + + m1[((y0+1) * w2 + x0+1) * 3 + c]; + m2[(y4 * w4 + x4) * 3 + c] = v * 0.25f; + } + } } - stbi_image_free(data); - // Create R32Float depth texture - const WGPUTextureDescriptor depth_desc = { - .usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst, - .dimension = WGPUTextureDimension_2D, - .size = {(uint32_t)(width), (uint32_t)(height), 1}, - .format = WGPUTextureFormat_R32Float, - .mipLevelCount = 1, - .sampleCount = 1, - }; - WGPUTexture depth_texture = wgpuDeviceCreateTexture(device, &depth_desc); - if (!depth_texture) { - fprintf(stderr, "Error: failed to create depth texture\n"); - return nullptr; + // Nearest upsample to full-res + mip1_out.resize(w * h * 3); + mip2_out.resize(w * h * 3); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int i = (y * w + x) * 3; + int i1 = ((y/2) * w2 + (x/2)) * 3; + int i2 = ((y/4) * w4 + (x/4)) * 3; + mip1_out[i ] = (y/2 < h2 && x/2 < w2) ? m1[i1 ] : 0.0f; + mip1_out[i+1] = (y/2 < h2 && x/2 < w2) ? m1[i1+1] : 0.0f; + mip1_out[i+2] = (y/2 < h2 && x/2 < w2) ? m1[i1+2] : 0.0f; + mip2_out[i ] = (y/4 < h4 && x/4 < w4) ? m2[i2 ] : 0.0f; + mip2_out[i+1] = (y/4 < h4 && x/4 < w4) ? m2[i2+1] : 0.0f; + mip2_out[i+2] = (y/4 < h4 && x/4 < w4) ? m2[i2+2] : 0.0f; + } } - - // Write depth data - const WGPUTexelCopyTextureInfo dst = {.texture = depth_texture, - .mipLevel = 0}; - const WGPUTexelCopyBufferLayout layout = { - .bytesPerRow = (uint32_t)(width * sizeof(float)), - .rowsPerImage = (uint32_t)(height)}; - const WGPUExtent3D size = {(uint32_t)(width), (uint32_t)(height), 1}; - wgpuQueueWriteTexture(queue, &dst, depth_data.data(), - depth_data.size() * sizeof(float), &layout, &size); - - printf("Loaded depth from alpha: %dx%d (%s alpha)\n", width, height, - has_alpha ? "has" : "no"); - - return depth_texture; } -// Create CNN render pipeline (5 bindings) -// Takes both intermediate format (RGBA16Float) and final format (BGRA8Unorm) -static WGPURenderPipeline create_cnn_pipeline(WGPUDevice device, - WGPUTextureFormat format, - bool is_final_layer) { - const char* shader_code = SafeGetAsset(AssetId::ASSET_SHADER_CNN_LAYER); - - // Debug: check if shader loaded - if (!shader_code || shader_code[0] == '\0') { - fprintf(stderr, "ERROR: CNN shader asset not loaded!\n"); - return nullptr; - } - printf("Loaded CNN shader: %zu bytes\n", strlen(shader_code)); - - WGPUBindGroupLayout bgl = - BindGroupLayoutBuilder() - .sampler(0, WGPUShaderStage_Fragment) - .texture(1, WGPUShaderStage_Fragment) - .uniform(2, WGPUShaderStage_Vertex | WGPUShaderStage_Fragment) - .uniform(3, WGPUShaderStage_Fragment) - .texture(4, WGPUShaderStage_Fragment) // Original input - .build(device); +// --------------------------------------------------------------------------- +// Feature packing: RGB float arrays → feat_tex0 / feat_tex1 (rgba32uint) +// +// feat_tex0 (4 u32, f16 pairs — matches load_feat in cnn_v3_enc0.wgsl): +// [0] albedo.r | albedo.g +// [1] albedo.b | normal.x (oct, [0,1] — training format) +// [2] normal.y | depth +// [3] dzdx | dzdy +// +// feat_tex1 (4 u32, u8norm — channel order from cnn_v3_enc0.wgsl load_feat): +// [0] mat_id | prev.r | prev.g | prev.b +// [1] mip1.r | mip1.g | mip1.b | mip2.r +// [2] mip2.g | mip2.b | dif | transp +// [3] 0 +// +// Note: normal.xy stored in [0,1] (training format), NOT remapped to [-1,1] +// like gbuf_pack.wgsl does at runtime. This matches infer_cnn_v3.py. +// --------------------------------------------------------------------------- - // Use appropriate format: RGBA16Float for intermediate, BGRA8Unorm for final - WGPUTextureFormat output_format = is_final_layer - ? WGPUTextureFormat_BGRA8Unorm - : WGPUTextureFormat_RGBA16Float; +struct FeatureImages { + int w, h; + std::vector<float> albedo; // w*h*3 [0,1] + std::vector<float> normal; // w*h*2 [0,1] oct-encoded + std::vector<float> depth; // w*h [0,1] + std::vector<float> matid; // w*h [0,1] + std::vector<float> shadow; // w*h [0,1] + std::vector<float> transp; // w*h [0,1] +}; - WGPURenderPipeline pipeline = - RenderPipelineBuilder(device) - .shader(shader_code) // compose=true by default - .bind_group_layout(bgl) - .format(output_format) - .build(); +static void pack_features(const FeatureImages& img, + std::vector<uint32_t>& feat0, // w*h*4 u32 + std::vector<uint32_t>& feat1) // w*h*4 u32 +{ + const int W = img.w, H = img.h; + feat0.resize(W * H * 4); + feat1.resize(W * H * 4); - wgpuBindGroupLayoutRelease(bgl); - return pipeline; -} + std::vector<float> mip1, mip2; + compute_mips(img.albedo.data(), W, H, mip1, mip2); -// Begin render pass with clear -static WGPURenderPassEncoder begin_render_pass(WGPUCommandEncoder encoder, - WGPUTextureView view) { - const WGPURenderPassColorAttachment color_attachment = { - .view = view, - .depthSlice = WGPU_DEPTH_SLICE_UNDEFINED, - .loadOp = WGPULoadOp_Clear, - .storeOp = WGPUStoreOp_Store, - .clearValue = {0.0f, 0.0f, 0.0f, 1.0f}, - }; - - const WGPURenderPassDescriptor pass_desc = { - .colorAttachmentCount = 1, - .colorAttachments = &color_attachment, - }; + static const float KEY_X = 0.408f, KEY_Y = 0.816f, KEY_Z = 0.408f; - return wgpuCommandEncoderBeginRenderPass(encoder, &pass_desc); -} + for (int y = 0; y < H; ++y) { + for (int x = 0; x < W; ++x) { + const int pi = y * W + x; + const int i3 = pi * 3; + const int i4 = pi * 4; -// Save PNG output -static bool save_png(const char* path, const std::vector<uint8_t>& pixels, - int width, int height) { - // Convert BGRA → RGBA - std::vector<uint8_t> rgba(width * height * 4); - for (int i = 0; i < width * height; ++i) { - rgba[i * 4 + 0] = pixels[i * 4 + 2]; // R - rgba[i * 4 + 1] = pixels[i * 4 + 1]; // G - rgba[i * 4 + 2] = pixels[i * 4 + 0]; // B - rgba[i * 4 + 3] = pixels[i * 4 + 3]; // A - } + float ar = img.albedo[i3 ]; + float ag = img.albedo[i3+1]; + float ab = img.albedo[i3+2]; - if (!stbi_write_png(path, width, height, 4, rgba.data(), width * 4)) { - fprintf(stderr, "Error: failed to write PNG '%s'\n", path); - return false; - } + float nx = img.normal[pi * 2 ]; // [0,1] + float ny = img.normal[pi * 2 + 1]; // [0,1] - return true; -} + float d = img.depth[pi]; -// Create horizontal grayscale composite of layer outputs -// Each layer is already 4x wide (showing 4 channels), stack them vertically -static bool save_layer_composite(const char* dir, int width, int height, - int num_layers) { - // Each layer PNG is already 4x wide with 4 channels side-by-side - int layer_width = width * 4; + // Central finite difference depth gradient + int xm = (x > 0) ? x-1 : 0; + int xp = (x < W-1) ? x+1 : W-1; + int ym = (y > 0) ? y-1 : 0; + int yp = (y < H-1) ? y+1 : H-1; + float dzdx = (img.depth[y * W + xp] - img.depth[y * W + xm]) * 0.5f; + float dzdy = (img.depth[yp * W + x ] - img.depth[ym * W + x ]) * 0.5f; - // Load all layer images (they're already grayscale) - std::vector<std::vector<uint8_t>> layers(num_layers); - for (int i = 0; i < num_layers; ++i) { - char path[512]; - snprintf(path, sizeof(path), "%s/layer_%d.png", dir, i); + float mat = img.matid[pi]; + float shad = img.shadow[pi]; + float trp = img.transp[pi]; - int w, h, channels; - uint8_t* data = stbi_load(path, &w, &h, &channels, 1); // Load as grayscale - if (!data || w != layer_width || h != height) { - if (data) - stbi_image_free(data); - fprintf(stderr, - "Warning: failed to load layer %d for composite (expected %dx%d, " - "got %dx%d)\n", - i, layer_width, height, w, h); - return false; - } + // Diffuse = max(0, dot(oct_decode(normal), KEY_LIGHT)) * shadow + float n3x, n3y, n3z; + oct_decode_01(nx, ny, &n3x, &n3y, &n3z); + float dif = fmaxf(0.0f, n3x*KEY_X + n3y*KEY_Y + n3z*KEY_Z) * shad; - layers[i].assign(data, data + (layer_width * height)); - stbi_image_free(data); - } + float m1r = mip1[i3 ], m1g = mip1[i3+1], m1b = mip1[i3+2]; + float m2r = mip2[i3 ], m2g = mip2[i3+1], m2b = mip2[i3+2]; - // Stack layers vertically - int composite_height = height * num_layers; - std::vector<uint8_t> composite(layer_width * composite_height); + // prev.rgb = 0 (no temporal history) + feat0[i4 ] = pack2x16f(ar, ag); + feat0[i4+1] = pack2x16f(ab, nx); + feat0[i4+2] = pack2x16f(ny, d ); + feat0[i4+3] = pack2x16f(dzdx, dzdy); - for (int layer = 0; layer < num_layers; ++layer) { - for (int y = 0; y < height; ++y) { - int src_row_offset = y * layer_width; - int dst_row_offset = (layer * height + y) * layer_width; - memcpy(&composite[dst_row_offset], &layers[layer][src_row_offset], - layer_width); + feat1[i4 ] = pack4x8u(mat, 0.0f, 0.0f, 0.0f); // mat_id, prev.rgb=0 + feat1[i4+1] = pack4x8u(m1r, m1g, m1b, m2r); + feat1[i4+2] = pack4x8u(m2g, m2b, dif, trp); + feat1[i4+3] = 0u; } } - - // Save as grayscale PNG (stacked vertically) - char composite_path[512]; - snprintf(composite_path, sizeof(composite_path), "%s/layers_composite.png", - dir); - if (!stbi_write_png(composite_path, layer_width, composite_height, 1, - composite.data(), layer_width)) { - fprintf(stderr, "Error: failed to write composite PNG\n"); - return false; - } - - printf("Saved layer composite to '%s' (%dx%d, 4 layers stacked vertically)\n", - composite_path, layer_width, composite_height); - return true; } -// Save PPM output (fallback) -static bool save_ppm(const char* path, const std::vector<uint8_t>& pixels, - int width, int height) { - FILE* f = fopen(path, "wb"); - if (!f) { - fprintf(stderr, "Error: failed to open '%s' for writing\n", path); - return false; - } - - fprintf(f, "P6\n%d %d\n255\n", width, height); - for (int i = 0; i < width * height; ++i) { - const uint8_t rgb[3] = {pixels[i * 4 + 2], // R - pixels[i * 4 + 1], // G - pixels[i * 4 + 0]}; // B - fwrite(rgb, 1, 3, f); - } +// --------------------------------------------------------------------------- +// GPU texture helpers +// --------------------------------------------------------------------------- - fclose(f); - return true; +static WGPUTexture make_feat_tex(WGPUDevice dev, int W, int H) { + WGPUTextureDescriptor d = {}; + d.format = WGPUTextureFormat_RGBA32Uint; + d.usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst; + d.dimension = WGPUTextureDimension_2D; + d.size = {(uint32_t)W, (uint32_t)H, 1}; + d.mipLevelCount = 1; + d.sampleCount = 1; + return wgpuDeviceCreateTexture(dev, &d); } -// CNN v2 structures (matching CNNv2Effect) -struct CNNv2LayerInfo { - uint32_t kernel_size; - uint32_t in_channels; - uint32_t out_channels; - uint32_t weight_offset; - uint32_t weight_count; -}; - -struct CNNv2LayerParams { - uint32_t kernel_size; - uint32_t in_channels; - uint32_t out_channels; - uint32_t weight_offset; - uint32_t is_output_layer; - float blend_amount; - uint32_t is_layer_0; -}; - -struct CNNv2StaticFeatureParams { - uint32_t mip_level; - uint32_t padding[3]; -}; - -// Convert RGBA32Uint (packed f16) texture to BGRA8Unorm -static std::vector<uint8_t> -readback_rgba32uint_to_bgra8(WGPUDevice device, WGPUQueue queue, - WGPUTexture texture, int width, int height) { - // Create staging buffer - const uint32_t bytes_per_row = width * 16; // 4×u32 per pixel - const uint32_t padded_bytes_per_row = (bytes_per_row + 255) & ~255; - const size_t buffer_size = padded_bytes_per_row * height; - - WGPUBufferDescriptor buffer_desc = {}; - buffer_desc.size = buffer_size; - buffer_desc.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead; - buffer_desc.mappedAtCreation = false; +static WGPUTexture make_output_tex(WGPUDevice dev, int W, int H) { + WGPUTextureDescriptor d = {}; + d.format = WGPUTextureFormat_RGBA16Float; + d.usage = WGPUTextureUsage_StorageBinding | WGPUTextureUsage_CopySrc; + d.dimension = WGPUTextureDimension_2D; + d.size = {(uint32_t)W, (uint32_t)H, 1}; + d.mipLevelCount = 1; + d.sampleCount = 1; + return wgpuDeviceCreateTexture(dev, &d); +} - WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &buffer_desc); +static WGPUTextureView make_view(WGPUTexture tex, WGPUTextureFormat fmt) { + WGPUTextureViewDescriptor d = {}; + d.format = fmt; + d.dimension = WGPUTextureViewDimension_2D; + d.mipLevelCount = 1; + d.arrayLayerCount = 1; + return wgpuTextureCreateView(tex, &d); +} - // Copy texture to buffer - WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(device, nullptr); +static void upload_tex(WGPUQueue queue, WGPUTexture tex, + const uint32_t* data, int W, int H) { + WGPUTexelCopyTextureInfo dst = {}; + dst.texture = tex; + WGPUTexelCopyBufferLayout layout = {}; + layout.bytesPerRow = (uint32_t)(W * 16); + layout.rowsPerImage = (uint32_t)H; + WGPUExtent3D ext = {(uint32_t)W, (uint32_t)H, 1}; + wgpuQueueWriteTexture(queue, &dst, data, (size_t)(W * H * 16), &layout, &ext); +} - WGPUTexelCopyTextureInfo src = {}; - src.texture = texture; - src.mipLevel = 0; +// --------------------------------------------------------------------------- +// RGBA16Float readback +// --------------------------------------------------------------------------- - WGPUTexelCopyBufferInfo dst = {}; - dst.buffer = staging; - dst.layout.bytesPerRow = padded_bytes_per_row; - dst.layout.rowsPerImage = height; +static uint16_t fp16_bits_to_f16(float f) { return f32_to_f16(f); } +static float fp16_bits_to_f32(uint16_t h) { + uint32_t sign = (uint32_t)(h & 0x8000u) << 16; + uint32_t exp = (h & 0x7C00u) >> 10; + uint32_t mant = h & 0x03FFu; + if (exp == 0 && mant == 0) { float r; memcpy(&r, &sign, 4); return r; } + if (exp == 31) { uint32_t b = sign | 0x7F800000u | (mant << 13); + float r; memcpy(&r, &b, 4); return r; } + uint32_t b = sign | ((exp + 112u) << 23) | (mant << 13); + float r; memcpy(&r, &b, 4); return r; +} - WGPUExtent3D copy_size = {(uint32_t)(width), (uint32_t)(height), 1}; +struct MapState { bool done = false; WGPUMapAsyncStatus status = {}; }; - wgpuCommandEncoderCopyTextureToBuffer(encoder, &src, &dst, ©_size); +static std::vector<float> readback_rgba16f(WGPUDevice device, WGPUQueue queue, + WGPUTexture tex, int W, int H) { + const uint32_t bytes_per_px = 8; + const uint32_t raw_bpr = (uint32_t)(W * bytes_per_px); + const uint32_t aligned_bpr = ((raw_bpr + 255u) / 256u) * 256u; + const size_t buf_size = (size_t)aligned_bpr * (size_t)H; - WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr); - wgpuQueueSubmit(queue, 1, &commands); - wgpuCommandBufferRelease(commands); - wgpuCommandEncoderRelease(encoder); + WGPUBufferDescriptor bd = {}; + bd.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead; + bd.size = buf_size; + WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &bd); - // Wait for copy to complete + WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(device, nullptr); + WGPUTexelCopyTextureInfo src = {}; src.texture = tex; + WGPUTexelCopyBufferInfo dst = {}; + dst.buffer = staging; + dst.layout.bytesPerRow = aligned_bpr; + dst.layout.rowsPerImage = (uint32_t)H; + WGPUExtent3D ext = {(uint32_t)W, (uint32_t)H, 1}; + wgpuCommandEncoderCopyTextureToBuffer(enc, &src, &dst, &ext); + WGPUCommandBuffer cmds = wgpuCommandEncoderFinish(enc, nullptr); + wgpuQueueSubmit(queue, 1, &cmds); + wgpuCommandBufferRelease(cmds); + wgpuCommandEncoderRelease(enc); wgpuDevicePoll(device, true, nullptr); - // Map and read buffer - struct MapState { - bool done = false; + MapState ms = {}; + WGPUBufferMapCallbackInfo mi = {}; + mi.mode = WGPUCallbackMode_AllowProcessEvents; + mi.callback = [](WGPUMapAsyncStatus s, WGPUStringView, void* u, void*) { + auto* st = (MapState*)u; st->status = s; st->done = true; }; - MapState map_state; - - auto map_cb = [](WGPUMapAsyncStatus status, WGPUStringView message, - void* userdata1, void* userdata2) { - (void)message; - (void)userdata2; - MapState* state = (MapState*)userdata1; - state->done = (status == WGPUMapAsyncStatus_Success); - }; - - WGPUBufferMapCallbackInfo map_info = {}; - map_info.mode = WGPUCallbackMode_AllowProcessEvents; - map_info.callback = map_cb; - map_info.userdata1 = &map_state; - - wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buffer_size, map_info); - - // Wait for mapping to complete - for (int i = 0; i < 100 && !map_state.done; ++i) { + mi.userdata1 = &ms; + wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buf_size, mi); + for (int i = 0; i < 200 && !ms.done; ++i) wgpuDevicePoll(device, true, nullptr); - } - - if (!map_state.done) { - fprintf(stderr, "Error: Buffer mapping timed out\n"); - wgpuBufferRelease(staging); - return std::vector<uint8_t>(); - } - - const uint32_t* mapped = - (const uint32_t*)wgpuBufferGetConstMappedRange(staging, 0, buffer_size); - - std::vector<uint8_t> result(width * height * 4); - // Unpack f16 to u8 (BGRA) - for (int y = 0; y < height; ++y) { - const uint32_t* row = - (const uint32_t*)((const uint8_t*)mapped + y * padded_bytes_per_row); - for (int x = 0; x < width; ++x) { - // Read 4×u32 (8×f16) - uint32_t data[4]; - data[0] = row[x * 4 + 0]; - data[1] = row[x * 4 + 1]; - data[2] = row[x * 4 + 2]; - data[3] = row[x * 4 + 3]; - - // Extract RGBA channels (first 4 f16 values) - uint16_t r16 = data[0] & 0xFFFF; - uint16_t g16 = (data[0] >> 16) & 0xFFFF; - uint16_t b16 = data[1] & 0xFFFF; - uint16_t a16 = (data[1] >> 16) & 0xFFFF; - - // Convert f16 to f32 (simple decode) - auto f16_to_f32 = [](uint16_t h) -> float { - uint32_t sign = (h >> 15) & 1; - uint32_t exp = (h >> 10) & 0x1F; - uint32_t frac = h & 0x3FF; - - if (exp == 0) { - if (frac == 0) - return sign ? -0.0f : 0.0f; - // Denormal - float val = frac / 1024.0f / 16384.0f; - return sign ? -val : val; + std::vector<float> pixels(W * H * 4, 0.0f); + if (ms.done && ms.status == WGPUMapAsyncStatus_Success) { + const uint8_t* mapped = (const uint8_t*) + wgpuBufferGetConstMappedRange(staging, 0, buf_size); + if (mapped) { + for (int y = 0; y < H; ++y) { + const uint16_t* row = (const uint16_t*)(mapped + (size_t)y * aligned_bpr); + for (int x = 0; x < W; ++x) { + for (int c = 0; c < 4; ++c) + pixels[(y * W + x) * 4 + c] = fp16_bits_to_f32(row[x * 4 + c]); } - if (exp == 31) { - return frac ? NAN : (sign ? -INFINITY : INFINITY); - } - - int32_t e = exp - 15; - float val = (1.0f + frac / 1024.0f) * powf(2.0f, e); - return sign ? -val : val; - }; - - float r = f16_to_f32(r16); - float g = f16_to_f32(g16); - float b = f16_to_f32(b16); - float a = f16_to_f32(a16); - - // Clamp to [0,1] and convert to u8 - auto clamp_u8 = [](float v) -> uint8_t { - if (v <= 0.0f) - return 0; - if (v >= 1.0f) - return 255; - return (uint8_t)(v * 255.0f + 0.5f); - }; - - result[(y * width + x) * 4 + 0] = clamp_u8(b); - result[(y * width + x) * 4 + 1] = clamp_u8(g); - result[(y * width + x) * 4 + 2] = clamp_u8(r); - result[(y * width + x) * 4 + 3] = clamp_u8(a); + } } } - wgpuBufferUnmap(staging); wgpuBufferRelease(staging); - - return result; + return pixels; } -// Read RGBA32Uint and create 4x wide grayscale composite (each channel -// side-by-side) -static std::vector<uint8_t> -readback_rgba32uint_to_composite(WGPUDevice device, WGPUQueue queue, - WGPUTexture texture, int width, int height) { - // First get BGRA8 data - std::vector<uint8_t> bgra = - readback_rgba32uint_to_bgra8(device, queue, texture, width, height); - if (bgra.empty()) - return {}; - - // Create 4x wide grayscale image (one channel per horizontal strip) - int composite_width = width * 4; - std::vector<uint8_t> composite(composite_width * height); - - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - int src_idx = (y * width + x) * 4; - uint8_t b = bgra[src_idx + 0]; - uint8_t g = bgra[src_idx + 1]; - uint8_t r = bgra[src_idx + 2]; - uint8_t a = bgra[src_idx + 3]; +// --------------------------------------------------------------------------- +// Image I/O helpers +// --------------------------------------------------------------------------- - // Convert each channel to grayscale luminance - auto to_gray = [](uint8_t val) -> uint8_t { return val; }; - - // Place each channel in its horizontal strip - composite[y * composite_width + (0 * width + x)] = - to_gray(r); // Channel 0 - composite[y * composite_width + (1 * width + x)] = - to_gray(g); // Channel 1 - composite[y * composite_width + (2 * width + x)] = - to_gray(b); // Channel 2 - composite[y * composite_width + (3 * width + x)] = - to_gray(a); // Channel 3 - } +static std::vector<float> load_png_rgb(const char* path, int* out_w, int* out_h) { + int w, h, ch; + uint8_t* data = stbi_load(path, &w, &h, &ch, 3); + if (!data) { + fprintf(stderr, "Error: cannot load '%s'\n", path); + return {}; } - - return composite; + *out_w = w; *out_h = h; + std::vector<float> out(w * h * 3); + for (int i = 0; i < w * h * 3; ++i) + out[i] = data[i] / 255.0f; + stbi_image_free(data); + return out; } -// Process image with CNN v2 -static bool process_cnn_v2(WGPUDevice device, WGPUQueue queue, - WGPUInstance instance, WGPUTexture input_texture, - int width, int height, const Args& args) { - printf("Using CNN v2 (storage buffer architecture)\n"); - - // Load weights (from file or asset system) - size_t weights_size = 0; - const uint8_t* weights_data = nullptr; - std::vector<uint8_t> file_weights; // For file-based loading - - if (args.weights_path) { - // Load from file - printf("Loading weights from '%s'...\n", args.weights_path); - FILE* f = fopen(args.weights_path, "rb"); - if (!f) { - fprintf(stderr, "Error: failed to open weights file '%s'\n", - args.weights_path); - return false; - } - - fseek(f, 0, SEEK_END); - weights_size = ftell(f); - fseek(f, 0, SEEK_SET); - - file_weights.resize(weights_size); - size_t read = fread(file_weights.data(), 1, weights_size, f); - fclose(f); - - if (read != weights_size) { - fprintf(stderr, "Error: failed to read weights file\n"); - return false; - } - - weights_data = file_weights.data(); - } else { - // Load from asset system - weights_data = - (const uint8_t*)GetAsset(AssetId::ASSET_WEIGHTS_CNN_V2, &weights_size); +// Load 2-channel (RG) from RGB PNG — takes first 2 channels +static std::vector<float> load_png_rg(const char* path, int ew, int eh) { + int w, h, ch; + uint8_t* data = stbi_load(path, &w, &h, &ch, 3); + if (!data || w != ew || h != eh) { + if (data) stbi_image_free(data); + fprintf(stderr, "Warning: cannot load normal '%s' — using (0.5,0.5)\n", path); + std::vector<float> def(ew * eh * 2, 0.5f); + return def; } - - if (!weights_data || weights_size < 20) { - fprintf(stderr, "Error: CNN v2 weights not available\n"); - return false; + std::vector<float> out(w * h * 2); + for (int i = 0; i < w * h; ++i) { + out[i * 2 ] = data[i * 3 ] / 255.0f; + out[i * 2 + 1] = data[i * 3 + 1] / 255.0f; } + stbi_image_free(data); + return out; +} - // Parse header - const uint32_t* header = (const uint32_t*)weights_data; - uint32_t magic = header[0]; - uint32_t version = header[1]; - uint32_t num_layers = header[2]; - uint32_t total_weights = header[3]; - - if (magic != 0x324e4e43) { // 'CNN2' - fprintf(stderr, "Error: Invalid CNN v2 weights magic\n"); - return false; +// Load 16-bit greyscale PNG → [0,1] +static std::vector<float> load_png_depth16(const char* path, int ew, int eh) { + int w, h, ch; + uint16_t* data = stbi_load_16(path, &w, &h, &ch, 1); + if (!data || w != ew || h != eh) { + if (data) stbi_image_free(data); + fprintf(stderr, "Warning: cannot load depth '%s' — using 0\n", path); + return std::vector<float>(ew * eh, 0.0f); } + std::vector<float> out(w * h); + for (int i = 0; i < w * h; ++i) + out[i] = data[i] / 65535.0f; + stbi_image_free(data); + return out; +} - uint32_t mip_level = 0; - if (version == 2) { - mip_level = header[4]; +// Load 8-bit greyscale PNG → [0,1] +static std::vector<float> load_png_gray(const char* path, int ew, int eh, + float default_val = 0.0f) { + int w, h, ch; + uint8_t* data = stbi_load(path, &w, &h, &ch, 1); + if (!data || w != ew || h != eh) { + if (data) stbi_image_free(data); + return std::vector<float>(ew * eh, default_val); } + std::vector<float> out(w * h); + for (int i = 0; i < w * h; ++i) + out[i] = data[i] / 255.0f; + stbi_image_free(data); + return out; +} - printf("Loaded CNN v2 weights: %u layers, %u weights, version %u\n", - num_layers, total_weights, version); - - // Parse layer info - const uint32_t header_u32_count = (version == 1) ? 4 : 5; - const uint32_t* layer_data = header + header_u32_count; - std::vector<CNNv2LayerInfo> layer_info; - - for (uint32_t i = 0; i < num_layers; ++i) { - CNNv2LayerInfo info; - info.kernel_size = layer_data[i * 5 + 0]; - info.in_channels = layer_data[i * 5 + 1]; - info.out_channels = layer_data[i * 5 + 2]; - info.weight_offset = layer_data[i * 5 + 3]; - info.weight_count = layer_data[i * 5 + 4]; - layer_info.push_back(info); - - printf(" Layer %u: %ux%u conv, %u→%u channels, %u weights\n", i, - info.kernel_size, info.kernel_size, info.in_channels, - info.out_channels, info.weight_count); +static bool save_png(const char* path, const std::vector<float>& rgba_f32, + int w, int h) { + std::vector<uint8_t> rgba8(w * h * 4); + for (int i = 0; i < w * h * 4; ++i) { + int v = (int)(rgba_f32[i] * 255.0f + 0.5f); + rgba8[i] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); } - - // Create weights storage buffer (skip header + layer info, upload only - // weights) - size_t header_size = 20; // 5 u32 - size_t layer_info_size = 20 * layer_info.size(); // 5 u32 per layer - size_t weights_offset = header_size + layer_info_size; - size_t weights_only_size = weights_size - weights_offset; - - WGPUBufferDescriptor weights_buffer_desc = {}; - weights_buffer_desc.size = weights_only_size; - weights_buffer_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst; - weights_buffer_desc.mappedAtCreation = false; - - WGPUBuffer weights_buffer = - wgpuDeviceCreateBuffer(device, &weights_buffer_desc); - wgpuQueueWriteBuffer(queue, weights_buffer, 0, weights_data + weights_offset, - weights_only_size); - - // Create input view - WGPUTextureView input_view = - gpu_create_texture_view_2d(input_texture, WGPUTextureFormat_BGRA8Unorm); - - // Create static features texture (RGBA32Uint) - const WGPUTextureDescriptor static_desc = { - .usage = WGPUTextureUsage_StorageBinding | - WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopySrc, - .dimension = WGPUTextureDimension_2D, - .size = {(uint32_t)(width), (uint32_t)(height), 1}, - .format = WGPUTextureFormat_RGBA32Uint, - .mipLevelCount = 1, - .sampleCount = 1, - }; - WGPUTexture static_features_tex = - wgpuDeviceCreateTexture(device, &static_desc); - WGPUTextureView static_features_view = - wgpuTextureCreateView(static_features_tex, nullptr); - - // Load depth from input alpha channel (or 1.0 if no alpha) - WGPUTexture depth_texture = - load_depth_from_alpha(device, queue, args.input_path, width, height); - if (!depth_texture) { - wgpuTextureViewRelease(static_features_view); - wgpuTextureRelease(static_features_tex); - wgpuBufferRelease(weights_buffer); - wgpuTextureViewRelease(input_view); + if (!stbi_write_png(path, w, h, 4, rgba8.data(), w * 4)) { + fprintf(stderr, "Error: failed to write '%s'\n", path); return false; } - WGPUTextureView depth_view = wgpuTextureCreateView(depth_texture, nullptr); - - // Create layer textures (ping-pong) - WGPUTexture layer_textures[2] = { - wgpuDeviceCreateTexture(device, &static_desc), - wgpuDeviceCreateTexture(device, &static_desc), - }; - WGPUTextureView layer_views[2] = { - wgpuTextureCreateView(layer_textures[0], nullptr), - wgpuTextureCreateView(layer_textures[1], nullptr), - }; + return true; +} - // Load shaders - const char* static_shader = SafeGetAsset(AssetId::ASSET_SHADER_CNN_V2_STATIC); - const char* layer_shader = SafeGetAsset(AssetId::ASSET_SHADER_CNN_V2_COMPUTE); +// --------------------------------------------------------------------------- +// Weight loading +// --------------------------------------------------------------------------- - if (!static_shader[0] || !layer_shader[0]) { - fprintf(stderr, "Error: CNN v2 shaders not available\n"); - wgpuTextureViewRelease(static_features_view); - wgpuTextureRelease(static_features_tex); - wgpuTextureViewRelease(depth_view); - wgpuTextureRelease(depth_texture); - wgpuTextureViewRelease(layer_views[0]); - wgpuTextureViewRelease(layer_views[1]); - wgpuTextureRelease(layer_textures[0]); - wgpuTextureRelease(layer_textures[1]); - wgpuBufferRelease(weights_buffer); - wgpuTextureViewRelease(input_view); +static bool load_weights_bin(const char* path, std::vector<uint32_t>& out) { + FILE* f = fopen(path, "rb"); + if (!f) { + fprintf(stderr, "Error: cannot open weights '%s'\n", path); return false; } - - // Create static feature params buffer - WGPUBufferDescriptor static_params_desc = {}; - static_params_desc.size = sizeof(CNNv2StaticFeatureParams); - static_params_desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst; - static_params_desc.mappedAtCreation = false; - - WGPUBuffer static_params_buffer = - wgpuDeviceCreateBuffer(device, &static_params_desc); - - CNNv2StaticFeatureParams static_params; - static_params.mip_level = mip_level; - static_params.padding[0] = 0; - static_params.padding[1] = 0; - static_params.padding[2] = 0; - wgpuQueueWriteBuffer(queue, static_params_buffer, 0, &static_params, - sizeof(static_params)); - - // Create linear sampler for bilinear interpolation - WGPUSamplerDescriptor linear_sampler_desc = {}; - linear_sampler_desc.addressModeU = WGPUAddressMode_ClampToEdge; - linear_sampler_desc.addressModeV = WGPUAddressMode_ClampToEdge; - linear_sampler_desc.addressModeW = WGPUAddressMode_ClampToEdge; - linear_sampler_desc.magFilter = WGPUFilterMode_Linear; - linear_sampler_desc.minFilter = WGPUFilterMode_Linear; - linear_sampler_desc.mipmapFilter = WGPUMipmapFilterMode_Linear; - linear_sampler_desc.lodMinClamp = 0.0f; - linear_sampler_desc.lodMaxClamp = 32.0f; - linear_sampler_desc.maxAnisotropy = 1; - - WGPUSampler linear_sampler = - wgpuDeviceCreateSampler(device, &linear_sampler_desc); - - // Create static features compute pipeline - WGPUShaderSourceWGSL static_wgsl = {}; - static_wgsl.chain.sType = WGPUSType_ShaderSourceWGSL; - static_wgsl.code = str_view(static_shader); - - WGPUShaderModuleDescriptor static_module_desc = {}; - static_module_desc.nextInChain = &static_wgsl.chain; - - WGPUShaderModule static_module = - wgpuDeviceCreateShaderModule(device, &static_module_desc); - - // Bind group layout: 0=input, 1=input_mip1, 2=input_mip2, 3=depth, 4=output, - // 5=params, 6=linear_sampler - WGPUBindGroupLayoutEntry static_bgl_entries[7] = {}; - static_bgl_entries[0].binding = 0; - static_bgl_entries[0].visibility = WGPUShaderStage_Compute; - static_bgl_entries[0].texture.sampleType = WGPUTextureSampleType_Float; - static_bgl_entries[0].texture.viewDimension = WGPUTextureViewDimension_2D; - - static_bgl_entries[1].binding = 1; - static_bgl_entries[1].visibility = WGPUShaderStage_Compute; - static_bgl_entries[1].texture.sampleType = WGPUTextureSampleType_Float; - static_bgl_entries[1].texture.viewDimension = WGPUTextureViewDimension_2D; - - static_bgl_entries[2].binding = 2; - static_bgl_entries[2].visibility = WGPUShaderStage_Compute; - static_bgl_entries[2].texture.sampleType = WGPUTextureSampleType_Float; - static_bgl_entries[2].texture.viewDimension = WGPUTextureViewDimension_2D; - - static_bgl_entries[3].binding = 3; - static_bgl_entries[3].visibility = WGPUShaderStage_Compute; - static_bgl_entries[3].texture.sampleType = - WGPUTextureSampleType_UnfilterableFloat; - static_bgl_entries[3].texture.viewDimension = WGPUTextureViewDimension_2D; - - static_bgl_entries[4].binding = 4; - static_bgl_entries[4].visibility = WGPUShaderStage_Compute; - static_bgl_entries[4].storageTexture.access = - WGPUStorageTextureAccess_WriteOnly; - static_bgl_entries[4].storageTexture.format = WGPUTextureFormat_RGBA32Uint; - static_bgl_entries[4].storageTexture.viewDimension = - WGPUTextureViewDimension_2D; - - static_bgl_entries[5].binding = 5; - static_bgl_entries[5].visibility = WGPUShaderStage_Compute; - static_bgl_entries[5].buffer.type = WGPUBufferBindingType_Uniform; - static_bgl_entries[5].buffer.minBindingSize = - sizeof(CNNv2StaticFeatureParams); - - static_bgl_entries[6].binding = 6; - static_bgl_entries[6].visibility = WGPUShaderStage_Compute; - static_bgl_entries[6].sampler.type = WGPUSamplerBindingType_Filtering; - - WGPUBindGroupLayoutDescriptor static_bgl_desc = {}; - static_bgl_desc.entryCount = 7; - static_bgl_desc.entries = static_bgl_entries; - - WGPUBindGroupLayout static_bgl = - wgpuDeviceCreateBindGroupLayout(device, &static_bgl_desc); - - WGPUPipelineLayoutDescriptor static_pl_desc = {}; - static_pl_desc.bindGroupLayoutCount = 1; - static_pl_desc.bindGroupLayouts = &static_bgl; - - WGPUPipelineLayout static_pl = - wgpuDeviceCreatePipelineLayout(device, &static_pl_desc); - - WGPUComputePipelineDescriptor static_pipeline_desc = {}; - static_pipeline_desc.compute.module = static_module; - static_pipeline_desc.compute.entryPoint = str_view("main"); - static_pipeline_desc.layout = static_pl; - - WGPUComputePipeline static_pipeline = - wgpuDeviceCreateComputePipeline(device, &static_pipeline_desc); - - wgpuShaderModuleRelease(static_module); - wgpuPipelineLayoutRelease(static_pl); - - // Create static bind group (use input as all mips for simplicity) - WGPUBindGroupEntry static_bg_entries[7] = {}; - static_bg_entries[0].binding = 0; - static_bg_entries[0].textureView = input_view; - static_bg_entries[1].binding = 1; - static_bg_entries[1].textureView = input_view; - static_bg_entries[2].binding = 2; - static_bg_entries[2].textureView = input_view; - static_bg_entries[3].binding = 3; - static_bg_entries[3].textureView = - depth_view; // Depth from alpha channel (matches training) - static_bg_entries[4].binding = 4; - static_bg_entries[4].textureView = static_features_view; - static_bg_entries[5].binding = 5; - static_bg_entries[5].buffer = static_params_buffer; - static_bg_entries[5].size = sizeof(CNNv2StaticFeatureParams); - static_bg_entries[6].binding = 6; - static_bg_entries[6].sampler = linear_sampler; - - WGPUBindGroupDescriptor static_bg_desc = {}; - static_bg_desc.layout = static_bgl; - static_bg_desc.entryCount = 7; - static_bg_desc.entries = static_bg_entries; - - WGPUBindGroup static_bg = wgpuDeviceCreateBindGroup(device, &static_bg_desc); - - wgpuBindGroupLayoutRelease(static_bgl); - - // Create layer compute pipeline - WGPUShaderSourceWGSL layer_wgsl = {}; - layer_wgsl.chain.sType = WGPUSType_ShaderSourceWGSL; - layer_wgsl.code = str_view(layer_shader); - - WGPUShaderModuleDescriptor layer_module_desc = {}; - layer_module_desc.nextInChain = &layer_wgsl.chain; - - WGPUShaderModule layer_module = - wgpuDeviceCreateShaderModule(device, &layer_module_desc); - - // Layer bind group layout: - // 0=static_features, 1=layer_input, 2=output, 3=weights, 4=params, - // 5=original - WGPUBindGroupLayoutEntry layer_bgl_entries[6] = {}; - layer_bgl_entries[0].binding = 0; - layer_bgl_entries[0].visibility = WGPUShaderStage_Compute; - layer_bgl_entries[0].texture.sampleType = WGPUTextureSampleType_Uint; - layer_bgl_entries[0].texture.viewDimension = WGPUTextureViewDimension_2D; - - layer_bgl_entries[1].binding = 1; - layer_bgl_entries[1].visibility = WGPUShaderStage_Compute; - layer_bgl_entries[1].texture.sampleType = WGPUTextureSampleType_Uint; - layer_bgl_entries[1].texture.viewDimension = WGPUTextureViewDimension_2D; - - layer_bgl_entries[2].binding = 2; - layer_bgl_entries[2].visibility = WGPUShaderStage_Compute; - layer_bgl_entries[2].storageTexture.access = - WGPUStorageTextureAccess_WriteOnly; - layer_bgl_entries[2].storageTexture.format = WGPUTextureFormat_RGBA32Uint; - layer_bgl_entries[2].storageTexture.viewDimension = - WGPUTextureViewDimension_2D; - - layer_bgl_entries[3].binding = 3; - layer_bgl_entries[3].visibility = WGPUShaderStage_Compute; - layer_bgl_entries[3].buffer.type = WGPUBufferBindingType_ReadOnlyStorage; - - layer_bgl_entries[4].binding = 4; - layer_bgl_entries[4].visibility = WGPUShaderStage_Compute; - layer_bgl_entries[4].buffer.type = WGPUBufferBindingType_Uniform; - layer_bgl_entries[4].buffer.minBindingSize = sizeof(CNNv2LayerParams); - - layer_bgl_entries[5].binding = 5; - layer_bgl_entries[5].visibility = WGPUShaderStage_Compute; - layer_bgl_entries[5].texture.sampleType = WGPUTextureSampleType_Float; - layer_bgl_entries[5].texture.viewDimension = WGPUTextureViewDimension_2D; - - WGPUBindGroupLayoutDescriptor layer_bgl_desc = {}; - layer_bgl_desc.entryCount = 6; - layer_bgl_desc.entries = layer_bgl_entries; - - WGPUBindGroupLayout layer_bgl = - wgpuDeviceCreateBindGroupLayout(device, &layer_bgl_desc); - - WGPUPipelineLayoutDescriptor layer_pl_desc = {}; - layer_pl_desc.bindGroupLayoutCount = 1; - layer_pl_desc.bindGroupLayouts = &layer_bgl; - - WGPUPipelineLayout layer_pl = - wgpuDeviceCreatePipelineLayout(device, &layer_pl_desc); - - WGPUComputePipelineDescriptor layer_pipeline_desc = {}; - layer_pipeline_desc.compute.module = layer_module; - layer_pipeline_desc.compute.entryPoint = str_view("main"); - layer_pipeline_desc.layout = layer_pl; - - WGPUComputePipeline layer_pipeline = - wgpuDeviceCreateComputePipeline(device, &layer_pipeline_desc); - - wgpuShaderModuleRelease(layer_module); - wgpuPipelineLayoutRelease(layer_pl); - - // Create layer params buffers - std::vector<WGPUBuffer> layer_params_buffers; - for (size_t i = 0; i < layer_info.size(); ++i) { - WGPUBufferDescriptor params_desc = {}; - params_desc.size = sizeof(CNNv2LayerParams); - params_desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst; - params_desc.mappedAtCreation = false; - - WGPUBuffer buf = wgpuDeviceCreateBuffer(device, ¶ms_desc); - layer_params_buffers.push_back(buf); - } - - // Execute compute passes - WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(device, nullptr); - - // Pass 1: Static features - printf("Computing static features...\n"); - WGPUComputePassEncoder static_pass = - wgpuCommandEncoderBeginComputePass(encoder, nullptr); - wgpuComputePassEncoderSetPipeline(static_pass, static_pipeline); - wgpuComputePassEncoderSetBindGroup(static_pass, 0, static_bg, 0, nullptr); - - uint32_t workgroups_x = (width + 7) / 8; - uint32_t workgroups_y = (height + 7) / 8; - wgpuComputePassEncoderDispatchWorkgroups(static_pass, workgroups_x, - workgroups_y, 1); - - wgpuComputePassEncoderEnd(static_pass); - wgpuComputePassEncoderRelease(static_pass); - - // Save static features if requested - if (args.save_intermediates) { - // Submit and wait for static features to complete - WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, nullptr); - wgpuQueueSubmit(queue, 1, &cmd); - wgpuCommandBufferRelease(cmd); - wgpuDevicePoll(device, true, nullptr); - - // Create new encoder for layers - encoder = wgpuDeviceCreateCommandEncoder(device, nullptr); - - char layer_path[512]; - snprintf(layer_path, sizeof(layer_path), "%s/static_features.png", - args.save_intermediates); - printf("Saving static features to '%s'...\n", layer_path); - - // Read back RGBA32Uint and create 8-channel grayscale composite - // Static features has 8 channels (packed as 4×u32), create 8x wide - // composite - std::vector<uint8_t> bgra = readback_rgba32uint_to_bgra8( - device, queue, static_features_tex, width, height); - - if (!bgra.empty()) { - // Static features: 8 f16 values packed in 4×u32 - // For now, just show first 4 channels (like layers) - // TODO: Show all 8 channels in 8x wide composite - std::vector<uint8_t> composite = readback_rgba32uint_to_composite( - device, queue, static_features_tex, width, height); - if (!composite.empty()) { - int composite_width = width * 4; - if (!stbi_write_png(layer_path, composite_width, height, 1, - composite.data(), composite_width)) { - fprintf(stderr, "Error: failed to write static features PNG\n"); - } - } - } + fseek(f, 0, SEEK_END); + long sz = ftell(f); + rewind(f); + if (sz <= 0 || sz % 4 != 0) { + fprintf(stderr, "Error: bad weights file size %ld\n", sz); + fclose(f); + return false; } - - // Pass 2-N: CNN layers - for (size_t i = 0; i < layer_info.size(); ++i) { - const CNNv2LayerInfo& info = layer_info[i]; - - printf("Processing layer %zu/%zu (%ux%u, %u→%u channels)...\n", i + 1, - layer_info.size(), info.kernel_size, info.kernel_size, - info.in_channels, info.out_channels); - - // Update layer params - CNNv2LayerParams params; - params.kernel_size = info.kernel_size; - params.in_channels = info.in_channels; - params.out_channels = info.out_channels; - params.weight_offset = info.weight_offset; - params.is_output_layer = (i == layer_info.size() - 1) ? 1 : 0; - params.blend_amount = args.blend; - params.is_layer_0 = (i == 0) ? 1 : 0; - - wgpuQueueWriteBuffer(queue, layer_params_buffers[i], 0, ¶ms, - sizeof(params)); - - // Create bind group for this layer - WGPUBindGroupEntry layer_bg_entries[6] = {}; - layer_bg_entries[0].binding = 0; - layer_bg_entries[0].textureView = static_features_view; - - layer_bg_entries[1].binding = 1; - layer_bg_entries[1].textureView = - (i == 0) ? static_features_view : layer_views[i % 2]; - - layer_bg_entries[2].binding = 2; - layer_bg_entries[2].textureView = layer_views[(i + 1) % 2]; - - layer_bg_entries[3].binding = 3; - layer_bg_entries[3].buffer = weights_buffer; - layer_bg_entries[3].size = weights_only_size; - - layer_bg_entries[4].binding = 4; - layer_bg_entries[4].buffer = layer_params_buffers[i]; - layer_bg_entries[4].size = sizeof(CNNv2LayerParams); - - layer_bg_entries[5].binding = 5; - layer_bg_entries[5].textureView = input_view; - - WGPUBindGroupDescriptor layer_bg_desc = {}; - layer_bg_desc.layout = layer_bgl; - layer_bg_desc.entryCount = 6; - layer_bg_desc.entries = layer_bg_entries; - - WGPUBindGroup layer_bg = wgpuDeviceCreateBindGroup(device, &layer_bg_desc); - - WGPUComputePassEncoder layer_pass = - wgpuCommandEncoderBeginComputePass(encoder, nullptr); - wgpuComputePassEncoderSetPipeline(layer_pass, layer_pipeline); - wgpuComputePassEncoderSetBindGroup(layer_pass, 0, layer_bg, 0, nullptr); - - wgpuComputePassEncoderDispatchWorkgroups(layer_pass, workgroups_x, - workgroups_y, 1); - - wgpuComputePassEncoderEnd(layer_pass); - wgpuComputePassEncoderRelease(layer_pass); - wgpuBindGroupRelease(layer_bg); - - // Save intermediate layer if requested - if (args.save_intermediates) { - // Submit and wait for layer to complete - WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, nullptr); - wgpuQueueSubmit(queue, 1, &cmd); - wgpuCommandBufferRelease(cmd); - wgpuDevicePoll(device, true, nullptr); - - // Create new encoder for next layer - encoder = wgpuDeviceCreateCommandEncoder(device, nullptr); - - char layer_path[512]; - snprintf(layer_path, sizeof(layer_path), "%s/layer_%zu.png", - args.save_intermediates, i); - printf("Saving intermediate layer %zu to '%s'...\n", i, layer_path); - - // Read back RGBA32Uint and create 4-channel grayscale composite - WGPUTexture output_tex = layer_textures[(i + 1) % 2]; - std::vector<uint8_t> composite = readback_rgba32uint_to_composite( - device, queue, output_tex, width, height); - - if (!composite.empty()) { - int composite_width = width * 4; - if (!stbi_write_png(layer_path, composite_width, height, 1, - composite.data(), composite_width)) { - fprintf(stderr, "Error: failed to write layer PNG\n"); - } - } - } + out.resize((size_t)sz / 4); + if ((long)fread(out.data(), 4, out.size(), f) != sz / 4) { + fprintf(stderr, "Error: read failed for '%s'\n", path); + fclose(f); + return false; } + fclose(f); + return true; +} - WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr); - wgpuQueueSubmit(queue, 1, &commands); - wgpuCommandBufferRelease(commands); - wgpuCommandEncoderRelease(encoder); - - wgpuDevicePoll(device, true, nullptr); - - // Create layer composite if intermediates were saved - if (args.save_intermediates) { - save_layer_composite(args.save_intermediates, width, height, - layer_info.size()); - } +// --------------------------------------------------------------------------- +// Args +// --------------------------------------------------------------------------- - // Readback final result (from last layer's output texture) - printf("Reading pixels from GPU...\n"); - size_t final_layer_idx = (layer_info.size()) % 2; - std::vector<uint8_t> pixels = readback_rgba32uint_to_bgra8( - device, queue, layer_textures[final_layer_idx], width, height); +struct Args { + const char* input_path = nullptr; + const char* output_path = nullptr; + const char* sample_dir = nullptr; + const char* weights_path = nullptr; + bool debug_hex = false; +}; - if (pixels.empty()) { - fprintf(stderr, "Error: GPU readback failed\n"); - for (auto buf : layer_params_buffers) - wgpuBufferRelease(buf); - wgpuComputePipelineRelease(layer_pipeline); - wgpuBindGroupLayoutRelease(layer_bgl); - wgpuBindGroupRelease(static_bg); - wgpuComputePipelineRelease(static_pipeline); - wgpuBufferRelease(static_params_buffer); - wgpuTextureViewRelease(static_features_view); - wgpuTextureRelease(static_features_tex); - wgpuTextureViewRelease(depth_view); - wgpuTextureRelease(depth_texture); - wgpuTextureViewRelease(layer_views[0]); - wgpuTextureViewRelease(layer_views[1]); - wgpuTextureRelease(layer_textures[0]); - wgpuTextureRelease(layer_textures[1]); - wgpuBufferRelease(weights_buffer); - wgpuTextureViewRelease(input_view); - return false; - } +static void print_usage(const char* prog) { + fprintf(stderr, "Usage: %s input.png output.png [OPTIONS]\n", prog); + fprintf(stderr, "\nOPTIONS:\n"); + fprintf(stderr, " --sample-dir DIR Full sample dir with albedo/normal/depth/matid/shadow/transp\n"); + fprintf(stderr, " --weights FILE Load weights from cnn_v3_weights.bin\n"); + fprintf(stderr, " --debug-hex Print first 8 output pixels as hex\n"); + fprintf(stderr, " --help Show this help\n"); + fprintf(stderr, "\nSimple mode (single PNG): geometry channels zeroed, normal=(0.5,0.5).\n"); + fprintf(stderr, "FiLM is always identity (gamma=1, beta=0).\n"); + fprintf(stderr, "\nNote: feature packing uses [0,1] oct-normals (training format) to match\n"); + fprintf(stderr, " infer_cnn_v3.py for direct Python/WGSL comparison.\n"); +} - // Debug hex dump - if (args.debug_hex) { - printf("First 8 pixels (BGRA hex):\n"); - for (int i = 0; i < 8 && i < width * height; ++i) { - const uint8_t b = pixels[i * 4 + 0]; - const uint8_t g = pixels[i * 4 + 1]; - const uint8_t r = pixels[i * 4 + 2]; - const uint8_t a = pixels[i * 4 + 3]; - printf(" [%d] 0x%02X%02X%02X%02X (RGBA)\n", i, r, g, b, a); +static bool parse_args(int argc, char** argv, Args* args) { + if (argc < 3) return false; + args->input_path = argv[1]; + args->output_path = argv[2]; + for (int i = 3; i < argc; ++i) { + if (strcmp(argv[i], "--sample-dir") == 0 && i + 1 < argc) { + args->sample_dir = argv[++i]; + } else if (strcmp(argv[i], "--weights") == 0 && i + 1 < argc) { + args->weights_path = argv[++i]; + } else if (strcmp(argv[i], "--debug-hex") == 0) { + args->debug_hex = true; + } else if (strcmp(argv[i], "--help") == 0) { + return false; + } else { + fprintf(stderr, "Error: unknown option '%s'\n", argv[i]); + return false; } } + return true; +} - // Save output - bool success; - if (args.output_png) { - printf("Saving PNG to '%s'...\n", args.output_path); - success = save_png(args.output_path, pixels, width, height); - } else { - printf("Saving PPM to '%s'...\n", args.output_path); - success = save_ppm(args.output_path, pixels, width, height); - } - - if (success) { - printf("Done! Output saved to '%s'\n", args.output_path); - } +// --------------------------------------------------------------------------- +// Main +// --------------------------------------------------------------------------- - // Cleanup - for (auto buf : layer_params_buffers) - wgpuBufferRelease(buf); - wgpuComputePipelineRelease(layer_pipeline); - wgpuBindGroupLayoutRelease(layer_bgl); - wgpuBindGroupRelease(static_bg); - wgpuComputePipelineRelease(static_pipeline); - wgpuBufferRelease(static_params_buffer); - wgpuTextureViewRelease(static_features_view); - wgpuTextureRelease(static_features_tex); - wgpuTextureViewRelease(layer_views[0]); - wgpuTextureViewRelease(layer_views[1]); - wgpuTextureRelease(layer_textures[0]); - wgpuTextureRelease(layer_textures[1]); - wgpuBufferRelease(weights_buffer); - wgpuTextureViewRelease(input_view); - - return success; -} +extern void InitShaderComposer(); int main(int argc, char** argv) { - // Parse arguments Args args; if (!parse_args(argc, argv, &args)) { print_usage(argv[0]); return 1; } - // Initialize shader composer (required for #include resolution) - InitShaderComposer(); - - // Initialize WebGPU + // Init GPU WebGPUTestFixture fixture; if (!fixture.init()) { - fprintf(stderr, "Error: GPU unavailable\n"); + fprintf(stderr, "Error: WebGPU device unavailable\n"); return 1; } + InitShaderComposer(); GpuContext ctx = fixture.ctx(); - WGPUDevice device = ctx.device; - WGPUQueue queue = ctx.queue; - WGPUInstance instance = fixture.instance(); - // Load input texture - int width, height; - WGPUTexture input_texture = - load_texture(device, queue, args.input_path, &width, &height); - if (!input_texture) { - SamplerCache::Get().clear(); - fixture.shutdown(); - return 1; - } - - printf("Loaded %dx%d image from '%s'\n", width, height, args.input_path); + // --- Load input image --- + int W, H; + std::vector<float> albedo = load_png_rgb(args.input_path, &W, &H); + if (albedo.empty()) return 1; - // Branch based on CNN version - if (args.cnn_version == 2) { - bool success = process_cnn_v2(device, queue, instance, input_texture, width, - height, args); - wgpuTextureRelease(input_texture); - SamplerCache::Get().clear(); - fixture.shutdown(); - return success ? 0 : 1; + // Pad to multiples of 4 (U-Net requires 2 pooling levels) + const int W4 = (W + 3) & ~3; + const int H4 = (H + 3) & ~3; + if (W4 != W || H4 != H) { + printf("Padding %dx%d → %dx%d\n", W, H, W4, H4); + std::vector<float> padded(W4 * H4 * 3, 0.0f); + for (int y = 0; y < H; ++y) + for (int x = 0; x < W; ++x) + for (int c = 0; c < 3; ++c) + padded[(y * W4 + x) * 3 + c] = albedo[(y * W + x) * 3 + c]; + albedo = std::move(padded); + W = W4; H = H4; } - // CNN v1 processing below - printf("Using CNN v1 (render pipeline architecture)\n"); + printf("Input: %s (%dx%d)\n", args.input_path, W, H); - // Create input texture view - WGPUTextureView input_view = - gpu_create_texture_view_2d(input_texture, WGPUTextureFormat_BGRA8Unorm); - WGPUTextureView original_view = input_view; // Keep reference to original + // --- Build FeatureImages --- + FeatureImages img; + img.w = W; img.h = H; + img.albedo = albedo; - // Create CNN pipelines (different formats for intermediate vs final) - WGPURenderPipeline pipeline_intermediate = - create_cnn_pipeline(device, WGPUTextureFormat_RGBA16Float, false); - WGPURenderPipeline pipeline_final = - create_cnn_pipeline(device, WGPUTextureFormat_BGRA8Unorm, true); - - if (!pipeline_intermediate || !pipeline_final) { - fprintf(stderr, "Error: failed to create CNN pipelines\n"); - if (pipeline_intermediate) - wgpuRenderPipelineRelease(pipeline_intermediate); - if (pipeline_final) - wgpuRenderPipelineRelease(pipeline_final); - wgpuTextureViewRelease(input_view); - wgpuTextureRelease(input_texture); - SamplerCache::Get().clear(); - fixture.shutdown(); - return 1; - } - - // Get bind group layout from intermediate pipeline (same for both) - WGPUBindGroupLayout bgl = - wgpuRenderPipelineGetBindGroupLayout(pipeline_intermediate, 0); - - // Create uniform buffers - const WGPUBufferDescriptor common_uniform_desc = { - .usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst, - .size = sizeof(UniformsSequenceParams), - }; - WGPUBuffer common_uniform_buffer = - wgpuDeviceCreateBuffer(device, &common_uniform_desc); - - const WGPUBufferDescriptor layer_params_desc = { - .usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst, - .size = sizeof(CNNv1LayerParams), - }; - WGPUBuffer layer_params_buffer = - wgpuDeviceCreateBuffer(device, &layer_params_desc); - - // Create intermediate textures for ping-pong (2 textures) - // Use RGBA16Float to preserve [-1,1] range from tanh activation - const WGPUTextureDescriptor intermediate_desc = { - .usage = WGPUTextureUsage_TextureBinding | - WGPUTextureUsage_RenderAttachment | WGPUTextureUsage_CopySrc, - .dimension = WGPUTextureDimension_2D, - .size = {(uint32_t)(width), (uint32_t)(height), 1}, - .format = WGPUTextureFormat_RGBA16Float, - .mipLevelCount = 1, - .sampleCount = 1, - }; - - WGPUTexture intermediate_textures[2] = { - wgpuDeviceCreateTexture(device, &intermediate_desc), - wgpuDeviceCreateTexture(device, &intermediate_desc), - }; - - // Create views for intermediate textures (RGBA16Float) - WGPUTextureView intermediate_views[2] = { - gpu_create_texture_view_2d(intermediate_textures[0], - WGPUTextureFormat_RGBA16Float), - gpu_create_texture_view_2d(intermediate_textures[1], - WGPUTextureFormat_RGBA16Float), - }; - - // Get sampler - WGPUSampler sampler = - SamplerCache::Get().get_or_create(device, SamplerCache::clamp()); - - // Multi-layer processing - const int NUM_LAYERS = args.num_layers; - int dst_idx = 0; // Index of texture to render to - - // First layer reads from input, subsequent layers read from previous output - WGPUTextureView current_input = input_view; - - for (int layer = 0; layer < NUM_LAYERS; ++layer) { - printf("Processing layer %d/%d...\n", layer + 1, NUM_LAYERS); - - // Update uniforms - UniformsSequenceParams common_u = { - .resolution = {(float)(width), (float)(height)}, - .aspect_ratio = (float)(width) / (float)(height), - .time = 0.0f, - .beat_time = 0.0f, - .beat_phase = 0.0f, - .audio_intensity = 0.0f, - .noise = 0.0f, - }; - wgpuQueueWriteBuffer(queue, common_uniform_buffer, 0, &common_u, - sizeof(common_u)); - - CNNv1LayerParams layer_params = { - .layer_index = layer, - .blend_amount = - (layer == NUM_LAYERS - 1) ? args.blend : 1.0f, // Only final layer - ._pad = {0.0f, 0.0f}, + if (args.sample_dir) { + printf("Mode: full (%s)\n", args.sample_dir); + auto path = [&](const char* name) -> std::string { + return std::string(args.sample_dir) + "/" + name; }; - wgpuQueueWriteBuffer(queue, layer_params_buffer, 0, &layer_params, - sizeof(layer_params)); - - // Build bind group - WGPUBindGroup bind_group = - BindGroupBuilder() - .sampler(0, sampler) - .texture(1, current_input) - .buffer(2, common_uniform_buffer, sizeof(UniformsSequenceParams)) - .buffer(3, layer_params_buffer, sizeof(CNNv1LayerParams)) - .texture(4, original_view) - .build(device, bgl); - - // Render to appropriate output texture with correct pipeline - bool is_final = (layer == NUM_LAYERS - 1); - - if (is_final) { - // Final layer: use OffscreenRenderTarget (known working readback) - OffscreenRenderTarget rt(instance, device, width, height); - WGPUCommandEncoder encoder = - wgpuDeviceCreateCommandEncoder(device, nullptr); - WGPURenderPassEncoder pass = begin_render_pass(encoder, rt.view()); - wgpuRenderPassEncoderSetPipeline(pass, pipeline_final); - wgpuRenderPassEncoderSetBindGroup(pass, 0, bind_group, 0, nullptr); - wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0); - wgpuRenderPassEncoderEnd(pass); - WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr); - wgpuQueueSubmit(queue, 1, &commands); - wgpuDevicePoll(device, true, nullptr); - - wgpuCommandBufferRelease(commands); - wgpuRenderPassEncoderRelease(pass); - wgpuCommandEncoderRelease(encoder); - wgpuBindGroupRelease(bind_group); + img.normal = load_png_rg(path("normal.png").c_str(), W, H); + img.depth = load_png_depth16(path("depth.png").c_str(), W, H); + img.matid = load_png_gray(path("matid.png").c_str(), W, H, 0.0f); + img.shadow = load_png_gray(path("shadow.png").c_str(), W, H, 1.0f); + img.transp = load_png_gray(path("transp.png").c_str(), W, H, 0.0f); + } else { + printf("Mode: simple (geometry zeroed, normal=(0.5,0.5))\n"); + img.normal.assign(W * H * 2, 0.5f); + img.depth.assign(W * H, 0.0f); + img.matid.assign(W * H, 0.0f); + img.shadow.assign(W * H, 1.0f); + img.transp.assign(W * H, 0.0f); + } - // Read pixels immediately - printf("Reading pixels from GPU...\n"); - std::vector<uint8_t> pixels = rt.read_pixels(); + // --- Pack features --- + std::vector<uint32_t> feat0, feat1; + pack_features(img, feat0, feat1); - // Debug: print first 8 pixels as hex - if (args.debug_hex && !pixels.empty()) { - printf("First 8 pixels (BGRA hex):\n"); - for (int i = 0; i < 8 && i < width * height; ++i) { - const uint8_t b = pixels[i * 4 + 0]; - const uint8_t g = pixels[i * 4 + 1]; - const uint8_t r = pixels[i * 4 + 2]; - const uint8_t a = pixels[i * 4 + 3]; - printf(" [%d] 0x%02X%02X%02X%02X (RGBA)\n", i, r, g, b, a); - } - } + // --- Create GPU textures --- + WGPUTexture feat0_tex = make_feat_tex(ctx.device, W, H); + WGPUTexture feat1_tex = make_feat_tex(ctx.device, W, H); + WGPUTexture out_tex = make_output_tex(ctx.device, W, H); - if (pixels.empty()) { - fprintf(stderr, "Error: GPU readback failed\n"); - wgpuTextureViewRelease(intermediate_views[0]); - wgpuTextureViewRelease(intermediate_views[1]); - wgpuTextureRelease(intermediate_textures[0]); - wgpuTextureRelease(intermediate_textures[1]); - wgpuTextureViewRelease(input_view); - wgpuTextureRelease(input_texture); - wgpuBufferRelease(layer_params_buffer); - wgpuBufferRelease(common_uniform_buffer); - wgpuBindGroupLayoutRelease(bgl); - wgpuRenderPipelineRelease(pipeline_final); - wgpuRenderPipelineRelease(pipeline_intermediate); - SamplerCache::Get().clear(); - fixture.shutdown(); - return 1; - } + WGPUTextureView feat0_view = make_view(feat0_tex, WGPUTextureFormat_RGBA32Uint); + WGPUTextureView feat1_view = make_view(feat1_tex, WGPUTextureFormat_RGBA32Uint); + WGPUTextureView out_view = make_view(out_tex, WGPUTextureFormat_RGBA16Float); - // Save output - bool success; - if (args.output_png) { - printf("Saving PNG to '%s'...\n", args.output_path); - success = save_png(args.output_path, pixels, width, height); - } else { - printf("Saving PPM to '%s'...\n", args.output_path); - success = save_ppm(args.output_path, pixels, width, height); - } + upload_tex(ctx.queue, feat0_tex, feat0.data(), W, H); + upload_tex(ctx.queue, feat1_tex, feat1.data(), W, H); - if (!success) { - wgpuTextureViewRelease(intermediate_views[0]); - wgpuTextureViewRelease(intermediate_views[1]); - wgpuTextureRelease(intermediate_textures[0]); - wgpuTextureRelease(intermediate_textures[1]); - wgpuTextureViewRelease(input_view); - wgpuTextureRelease(input_texture); - wgpuBufferRelease(layer_params_buffer); - wgpuBufferRelease(common_uniform_buffer); - wgpuBindGroupLayoutRelease(bgl); - wgpuRenderPipelineRelease(pipeline_final); - wgpuRenderPipelineRelease(pipeline_intermediate); - SamplerCache::Get().clear(); - fixture.shutdown(); - return 1; - } + // --- Wire CNNv3Effect --- + NodeRegistry registry(ctx.device, W, H); + registry.set_external_view("feat0", feat0_view); + registry.set_external_view("feat1", feat1_view); + registry.set_external_view("cnn_out", out_view); - printf("Done! Output saved to '%s'\n", args.output_path); - break; // Exit loop after final layer - } else { - // Intermediate layers: render to ping-pong textures - WGPUTextureView output_view = intermediate_views[dst_idx]; - WGPUCommandEncoder encoder = - wgpuDeviceCreateCommandEncoder(device, nullptr); - WGPURenderPassEncoder pass = begin_render_pass(encoder, output_view); - wgpuRenderPassEncoderSetPipeline(pass, pipeline_intermediate); - wgpuRenderPassEncoderSetBindGroup(pass, 0, bind_group, 0, nullptr); - wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0); - wgpuRenderPassEncoderEnd(pass); - WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr); - wgpuQueueSubmit(queue, 1, &commands); - wgpuDevicePoll(device, true, nullptr); + CNNv3Effect effect(ctx, {"feat0", "feat1"}, {"cnn_out"}, 0.0f, 1000.0f); + effect.declare_nodes(registry); - wgpuCommandBufferRelease(commands); - wgpuRenderPassEncoderRelease(pass); - wgpuCommandEncoderRelease(encoder); - wgpuBindGroupRelease(bind_group); + // --- Load weights --- + if (args.weights_path) { + std::vector<uint32_t> wdata; + if (!load_weights_bin(args.weights_path, wdata)) return 1; + effect.upload_weights(ctx.queue, wdata.data(), + (uint32_t)(wdata.size() * 4)); + printf("Weights: %s (%zu bytes)\n", args.weights_path, wdata.size() * 4); + } else { + printf("Weights: default (from assets, zero if absent)\n"); + } - // Save intermediate layer if requested - if (args.save_intermediates) { - char layer_path[512]; - snprintf(layer_path, sizeof(layer_path), "%s/layer_%d.png", - args.save_intermediates, layer); - printf("Saving intermediate layer %d to '%s'...\n", layer, layer_path); + // --- Run 5 compute passes --- + WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); + UniformsSequenceParams params = {}; + params.resolution = {(float)W, (float)H}; + params.aspect_ratio = (float)W / (float)H; + effect.render(enc, params, registry); - // Readback RGBA16Float texture - std::vector<uint8_t> pixels = texture_readback_fp16_to_u8( - device, queue, intermediate_textures[dst_idx], width, height); + WGPUCommandBuffer cmds = wgpuCommandEncoderFinish(enc, nullptr); + wgpuQueueSubmit(ctx.queue, 1, &cmds); + wgpuCommandBufferRelease(cmds); + wgpuCommandEncoderRelease(enc); + wgpuDevicePoll(ctx.device, true, nullptr); - // Debug: print first 8 pixels as hex - if (args.debug_hex && !pixels.empty()) { - printf("Layer %d first 8 pixels (BGRA hex):\n", layer); - for (int i = 0; i < 8 && i < width * height; ++i) { - const uint8_t b = pixels[i * 4 + 0]; - const uint8_t g = pixels[i * 4 + 1]; - const uint8_t r = pixels[i * 4 + 2]; - const uint8_t a = pixels[i * 4 + 3]; - printf(" [%d] 0x%02X%02X%02X%02X (RGBA)\n", i, r, g, b, a); - } - } + // --- Readback --- + std::vector<float> pixels = readback_rgba16f(ctx.device, ctx.queue, out_tex, W, H); - if (!pixels.empty()) { - save_png(layer_path, pixels, width, height); - } else { - fprintf(stderr, "Warning: failed to read intermediate layer %d\n", - layer); - } - } - } + // --- Save output (crop to original size, already same if no padding) --- + if (!save_png(args.output_path, pixels, W, H)) return 1; + printf("Saved: %s\n", args.output_path); - // Update for next layer: output becomes input - if (layer < NUM_LAYERS - 1) { - // Use this layer's output as next layer's input - current_input = intermediate_views[dst_idx]; - dst_idx = 1 - dst_idx; // Flip ping-pong for next render + if (args.debug_hex) { + printf("First 8 output pixels (RGBA f32 → hex):\n"); + for (int i = 0; i < 8 && i < W * H; ++i) { + float r = pixels[i*4 ], g = pixels[i*4+1]; + float b = pixels[i*4+2], a = pixels[i*4+3]; + int ri = (int)(r*255+.5f), gi = (int)(g*255+.5f); + int bi = (int)(b*255+.5f), ai = (int)(a*255+.5f); + ri = ri<0?0:ri>255?255:ri; gi = gi<0?0:gi>255?255:gi; + bi = bi<0?0:bi>255?255:bi; ai = ai<0?0:ai>255?255:ai; + printf(" [%d] 0x%02X%02X%02X%02X (%.4f %.4f %.4f %.4f)\n", + i, ri, gi, bi, ai, r, g, b, a); } } - // Wait for all GPU work to complete before cleanup - wgpuDevicePoll(device, true, nullptr); - // Cleanup - wgpuTextureViewRelease(intermediate_views[0]); - wgpuTextureViewRelease(intermediate_views[1]); - wgpuTextureRelease(intermediate_textures[0]); - wgpuTextureRelease(intermediate_textures[1]); - wgpuBufferRelease(layer_params_buffer); - wgpuBufferRelease(common_uniform_buffer); - wgpuBindGroupLayoutRelease(bgl); - wgpuRenderPipelineRelease(pipeline_intermediate); - wgpuRenderPipelineRelease(pipeline_final); - wgpuTextureViewRelease(input_view); - wgpuTextureRelease(input_texture); - SamplerCache::Get().clear(); - fixture.shutdown(); + wgpuTextureViewRelease(feat0_view); + wgpuTextureViewRelease(feat1_view); + wgpuTextureViewRelease(out_view); + wgpuTextureRelease(feat0_tex); + wgpuTextureRelease(feat1_tex); + wgpuTextureRelease(out_tex); return 0; } |
