summary | refs | log | tree | commit | diff
path: root/tools/cnn_test.cc
diff options
context:
space:
mode:
Diffstat (limited to 'tools/cnn_test.cc')
-rw-r--r-- tools/cnn_test.cc 1936
1 files changed, 509 insertions, 1427 deletions
diff --git a/tools/cnn_test.cc b/tools/cnn_test.cc
index e5e2d26..beeef8f 100644
--- a/tools/cnn_test.cc
+++ b/tools/cnn_test.cc
@@ -1,21 +1,16 @@
-// CNN shader testing tool for offline validation
-// Tests trained CNN shaders on input PNG with GPU readback
+// CNN v3 shader testing tool — offline WGSL inference for Python parity checks.
+// Loads an input PNG (or sample directory), packs 20-channel features, runs the
+// CNNv3Effect (5 compute passes), and saves the RGBA16Float output as PNG.
#if defined(STRIP_ALL)
#error "cnn_test requires STRIP_ALL=OFF (tool builds only)"
#endif
-#include "effects/shaders.h"
+#include "cnn_v3_effect.h"
#include "generated/assets.h"
-#include "gpu/bind_group_builder.h"
#include "gpu/gpu.h"
-#include "gpu/pipeline_builder.h"
-#include "gpu/post_process_helper.h"
-#include "gpu/sampler_cache.h"
+#include "gpu/sequence.h"
#include "gpu/shader_composer.h"
-#include "gpu/texture_readback.h"
-#include "platform/platform.h"
-#include "tests/common/offscreen_render_target.h"
#include "tests/common/webgpu_test_fixture.h"
#include "util/asset_manager.h"
#include "util/mini_math.h"
@@ -27,1551 +22,638 @@
#include <cstdio>
#include <cstdlib>
#include <cstring>
+#include <string>
#include <vector>
-// CNN v1 structures
-struct CNNv1LayerParams {
- int layer_index;
- float blend_amount;
- float _pad[2];
-};
-static_assert(sizeof(CNNv1LayerParams) == 16);
+// ---------------------------------------------------------------------------
+// F16 / pack helpers (match WGSL pack2x16float / pack4x8unorm)
+// ---------------------------------------------------------------------------
-// Helper to get asset string or empty string
-static const char* SafeGetAsset(AssetId id) {
- const uint8_t* data = GetAsset(id);
- return data ? (const char*)data : "";
+// Converts an IEEE-754 binary32 value to its binary16 bit pattern using
+// round-to-nearest-even.
+//   - NaN stays NaN (quiet bit set), +/-Inf stays +/-Inf.
+//   - Magnitudes that round to 2^16 or more become +/-Inf.
+//   - Magnitudes below the smallest normal half (2^-14) become subnormal halves.
+//   - Magnitudes of 2^-25 or less become signed zero.
+// The sign bit is always preserved.
+static uint16_t f32_to_f16(float f) {
+  uint32_t bits;
+  memcpy(&bits, &f, sizeof bits);
+  const uint32_t sign = (bits >> 16) & 0x8000u;
+  const uint32_t mag = bits & 0x7FFFFFFFu;
+
+  if (mag >= 0x7F800000u) {
+    // Inf keeps a zero mantissa; NaN gets a non-zero (quiet) mantissa.
+    return (uint16_t)(sign | 0x7C00u | (mag > 0x7F800000u ? 0x0200u : 0u));
+  }
+  if (mag >= 0x477FF000u) {
+    // >= 65520: rounds past the largest finite half (65504).
+    return (uint16_t)(sign | 0x7C00u);
+  }
+  if (mag <= 0x33000000u) {
+    // <= 2^-25: at most half of the smallest subnormal, ties go to even (0).
+    return (uint16_t)sign;
+  }
+  if (mag < 0x38800000u) {
+    // Subnormal half: result is round(|f| * 2^24), no exponent bits.
+    const uint32_t exp32 = mag >> 23;                       // 102..112
+    const uint32_t sig = (mag & 0x007FFFFFu) | 0x00800000u;  // implicit 1
+    const uint32_t shift = 126u - exp32;                    // 14..24
+    uint32_t half = sig >> shift;
+    const uint32_t rem = sig & ((1u << shift) - 1u);
+    const uint32_t tie = 1u << (shift - 1u);
+    if (rem > tie || (rem == tie && (half & 1u))) ++half;
+    return (uint16_t)(sign | half);
+  }
+  // Normal half: rebias the exponent (127 -> 15) and drop 13 mantissa bits.
+  // A carry out of the mantissa correctly bumps the exponent.
+  uint32_t half = (mag >> 13) - (112u << 10);
+  const uint32_t rem = mag & 0x1FFFu;
+  if (rem > 0x1000u || (rem == 0x1000u && (half & 1u))) ++half;
+  return (uint16_t)(sign | half);
}
-// Command-line arguments
-struct Args {
- const char* input_path = nullptr;
- const char* output_path = nullptr;
- float blend = 1.0f;
- bool output_png = true; // Default to PNG
- const char* save_intermediates = nullptr;
- int num_layers = 3; // Default to 3 layers
- bool debug_hex = false; // Print first 8 pixels as hex
- int cnn_version = 1; // 1=CNNEffect, 2=CNNv2Effect
- const char* weights_path = nullptr; // Optional .bin weights file
- bool cnn_version_explicit =
- false; // Track if --cnn-version was explicitly set
-};
-
-// Parse command-line arguments
-static bool parse_args(int argc, char** argv, Args* args) {
- if (argc < 3) {
- return false;
- }
-
- args->input_path = argv[1];
- args->output_path = argv[2];
-
- for (int i = 3; i < argc; ++i) {
- if (strcmp(argv[i], "--blend") == 0 && i + 1 < argc) {
- args->blend = atof(argv[++i]);
- if (args->blend < 0.0f || args->blend > 1.0f) {
- fprintf(stderr, "Error: blend must be in range [0.0, 1.0]\n");
- return false;
- }
- } else if (strcmp(argv[i], "--format") == 0 && i + 1 < argc) {
- ++i;
- if (strcmp(argv[i], "ppm") == 0) {
- args->output_png = false;
- } else if (strcmp(argv[i], "png") == 0) {
- args->output_png = true;
- } else {
- fprintf(stderr, "Error: unknown format '%s' (use 'png' or 'ppm')\n",
- argv[i]);
- return false;
- }
- } else if (strcmp(argv[i], "--save-intermediates") == 0 && i + 1 < argc) {
- args->save_intermediates = argv[++i];
- } else if (strcmp(argv[i], "--layers") == 0 && i + 1 < argc) {
- args->num_layers = atoi(argv[++i]);
- if (args->num_layers < 1 || args->num_layers > 10) {
- fprintf(stderr, "Error: layers must be in range [1, 10]\n");
- return false;
- }
- } else if (strcmp(argv[i], "--debug-hex") == 0) {
- args->debug_hex = true;
- } else if (strcmp(argv[i], "--cnn-version") == 0 && i + 1 < argc) {
- args->cnn_version = atoi(argv[++i]);
- args->cnn_version_explicit = true;
- if (args->cnn_version < 1 || args->cnn_version > 2) {
- fprintf(stderr, "Error: cnn-version must be 1 or 2\n");
- return false;
- }
- } else if (strcmp(argv[i], "--weights") == 0 && i + 1 < argc) {
- args->weights_path = argv[++i];
- } else if (strcmp(argv[i], "--help") == 0) {
- return false;
- } else {
- fprintf(stderr, "Error: unknown option '%s'\n", argv[i]);
- return false;
- }
- }
-
- // Force CNN v2 when --weights is specified
- if (args->weights_path) {
- if (args->cnn_version_explicit && args->cnn_version != 2) {
- fprintf(stderr,
- "WARNING: --cnn-version %d ignored (--weights forces CNN v2)\n",
- args->cnn_version);
- }
- args->cnn_version = 2;
-
- // Warn if --layers was specified (binary file config takes precedence)
- if (args->num_layers != 3) { // 3 is the default
- fprintf(stderr,
- "WARNING: --layers %d ignored (--weights loads layer config from "
- ".bin)\n",
- args->num_layers);
- }
- }
-
- return true;
+// Packs two floats as half-precision bit patterns into one 32-bit word:
+// `a` occupies bits 0..15 and `b` occupies bits 16..31, the same layout as
+// WGSL pack2x16float(vec2f(a, b)).
+static uint32_t pack2x16f(float a, float b) {
+  const uint32_t lo = f32_to_f16(a);
+  const uint32_t hi = f32_to_f16(b);
+  return (hi << 16) | lo;
}
-// Print usage
-static void print_usage(const char* prog) {
- fprintf(stderr, "Usage: %s input.png output.png [OPTIONS]\n", prog);
- fprintf(stderr, "\nOPTIONS:\n");
- fprintf(stderr,
- " --blend F Final blend amount (0.0-1.0, default: "
- "1.0)\n");
- fprintf(stderr, " --format ppm|png Output format (default: png)\n");
- fprintf(stderr,
- " --layers N Number of CNN layers (1-10, default: 3, "
- "ignored with --weights)\n");
- fprintf(stderr,
- " --save-intermediates DIR Save intermediate layers to directory\n");
- fprintf(stderr,
- " --debug-hex Print first 8 pixels as hex (debug)\n");
- fprintf(stderr,
- " --cnn-version N CNN version: 1 (default) or 2 (ignored "
- "with --weights)\n");
- fprintf(stderr,
- " --weights PATH Load weights from .bin (forces CNN v2, "
- "overrides layer config)\n");
- fprintf(stderr, " --help Show this help\n");
+// Packs four floats as unsigned-normalized bytes into one 32-bit word
+// (a in bits 0..7, b in 8..15, c in 16..23, d in 24..31), matching the layout
+// of WGSL pack4x8unorm. Each component is clamped to [0,1] and rounded with
+// floor(v * 255 + 0.5).
+static uint32_t pack4x8u(float a, float b, float c, float d) {
+  auto u8 = [](float v) -> uint32_t {
+    // Clamp while still in float: converting NaN or an out-of-range float to
+    // an integer is undefined behaviour. NaN fails `v > 0` and maps to 0.
+    if (!(v > 0.0f)) return 0u;
+    if (v >= 1.0f) return 255u;
+    return (uint32_t)(v * 255.0f + 0.5f);
+  };
+  return u8(a) | (u8(b) << 8) | (u8(c) << 16) | (u8(d) << 24);
}
-// Load PNG and upload to GPU texture
-static WGPUTexture load_texture(WGPUDevice device, WGPUQueue queue,
- const char* path, int* out_width,
- int* out_height) {
- int width, height, channels;
- uint8_t* data = stbi_load(path, &width, &height, &channels, 4);
- if (!data) {
- fprintf(stderr, "Error: failed to load image '%s'\n", path);
- return nullptr;
- }
-
- *out_width = width;
- *out_height = height;
+// ---------------------------------------------------------------------------
+// Oct-decode [0,1] → unit normal (matches Python cnn_v3_utils.oct_decode)
+// ---------------------------------------------------------------------------
- // Create texture
- const WGPUTextureDescriptor texture_desc = {
- .usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst |
- WGPUTextureUsage_RenderAttachment,
- .dimension = WGPUTextureDimension_2D,
- .size = {(uint32_t)(width), (uint32_t)(height), 1},
- .format = WGPUTextureFormat_BGRA8Unorm,
- .mipLevelCount = 1,
- .sampleCount = 1,
- };
- WGPUTexture texture = wgpuDeviceCreateTexture(device, &texture_desc);
- if (!texture) {
- fprintf(stderr, "Error: failed to create texture\n");
- stbi_image_free(data);
- return nullptr;
+// Decodes an octahedral-encoded normal whose two components are stored in
+// [0,1] and writes the normalized 3D vector to *out_x, *out_y, *out_z.
+//   nx01, ny01 : encoded components, remapped here to [-1,1]
+//   out_*      : receive the unit-length result (must be non-null)
+static void oct_decode_01(float nx01, float ny01,
+                          float* out_x, float* out_y, float* out_z) {
+  float fx = nx01 * 2.0f - 1.0f;
+  float fy = ny01 * 2.0f - 1.0f;
+  const float fz = 1.0f - fabsf(fx) - fabsf(fy);
+  if (fz < 0.0f) {
+    // Lower hemisphere: fold the outer triangles back over the diagonals.
+    // Both outputs must be derived from the *unfolded* coordinates; computing
+    // fy from the already-folded fx would leave fy unchanged.
+    const float ux = fx;
+    const float uy = fy;
+    fx = (1.0f - fabsf(uy)) * (ux >= 0.0f ? 1.0f : -1.0f);
+    fy = (1.0f - fabsf(ux)) * (uy >= 0.0f ? 1.0f : -1.0f);
}
+  // Guard the division against a zero-length vector.
+  float len = sqrtf(fx * fx + fy * fy + fz * fz);
+  if (len < 1e-8f) len = 1e-8f;
+  *out_x = fx / len;
+  *out_y = fy / len;
+  *out_z = fz / len;
+}
- // Convert RGBA → BGRA
- std::vector<uint8_t> bgra_data(width * height * 4);
- for (int i = 0; i < width * height; ++i) {
- bgra_data[i * 4 + 0] = data[i * 4 + 2]; // B
- bgra_data[i * 4 + 1] = data[i * 4 + 1]; // G
- bgra_data[i * 4 + 2] = data[i * 4 + 0]; // R
- bgra_data[i * 4 + 3] = data[i * 4 + 3]; // A
- }
+// ---------------------------------------------------------------------------
+// Mip helpers — matching Python pyrdown + nearest-upsample
+// ---------------------------------------------------------------------------
- // Upload to GPU
- const WGPUTexelCopyTextureInfo dst = {.texture = texture, .mipLevel = 0};
- const WGPUTexelCopyBufferLayout layout = {
- .bytesPerRow = (uint32_t)(width * 4), .rowsPerImage = (uint32_t)(height)};
- const WGPUExtent3D size = {(uint32_t)(width), (uint32_t)(height), 1};
- wgpuQueueWriteTexture(queue, &dst, bgra_data.data(), bgra_data.size(),
- &layout, &size);
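+// Compute mip1 and mip2 for each pixel using the Python convention:
+//   mip1_small[y2][x2] = avg(rgb[2*y2 .. 2*y2+1][2*x2 .. 2*x2+1])          (half-res)
+//   mip2_small[y4][x4] = avg(mip1_small[2*y4 .. 2*y4+1][2*x4 .. 2*x4+1])   (quarter-res)
+// Nearest upsample: mip1[y][x] = mip1_small[y/2][x/2],
+//                   mip2[y][x] = mip2_small[y/4][x/4].
+// The small images use floor(w/2), floor(h/2) and floor(w/4), floor(h/4);
+// full-res pixels whose source texel falls outside them (trailing rows or
+// columns when w or h is not a multiple of 2 / 4) are written as 0.
+// Output: mip1_out and mip2_out are (H*W*3) float arrays in row-major order.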
- stbi_image_free(data);
- return texture;
-}
+// Builds two box-filtered, nearest-upsampled copies of an interleaved RGB image.
+//   rgb      : w*h*3 floats, row-major, 3 channels per pixel
+//   mip1_out : resized to w*h*3; 2x2 average of rgb, replicated per pixel
+//   mip2_out : resized to w*h*3; 2x2 average of the half-res image, replicated
+static void compute_mips(const float* rgb, int w, int h,
+ std::vector<float>& mip1_out,
+ std::vector<float>& mip2_out) {
+ // Integer division: odd trailing rows/columns are not covered by the mips.
+ const int w2 = w / 2, h2 = h / 2;
+ const int w4 = w / 4, h4 = h / 4;
-// Load PNG alpha channel as depth texture (or 1.0 if no alpha)
-static WGPUTexture load_depth_from_alpha(WGPUDevice device, WGPUQueue queue,
- const char* path, int width,
- int height) {
- int w, h, channels;
- uint8_t* data = stbi_load(path, &w, &h, &channels, 4);
- if (!data || w != width || h != height) {
- fprintf(stderr, "Error: failed to load depth from '%s'\n", path);
- if (data)
- stbi_image_free(data);
- return nullptr;
+ // Half-res image: average of each 2x2 block of the source.
+ std::vector<float> m1(w2 * h2 * 3);
+ for (int y2 = 0; y2 < h2; ++y2) {
+ for (int x2 = 0; x2 < w2; ++x2) {
+ for (int c = 0; c < 3; ++c) {
+ int y0 = y2 * 2, x0 = x2 * 2;
+ float v = rgb[(y0 * w + x0 ) * 3 + c]
+ + rgb[(y0 * w + x0+1) * 3 + c]
+ + rgb[((y0+1) * w + x0 ) * 3 + c]
+ + rgb[((y0+1) * w + x0+1) * 3 + c];
+ m1[(y2 * w2 + x2) * 3 + c] = v * 0.25f;
+ }
+ }
}
- // Extract alpha channel (or use 1.0 if original was RGB)
- std::vector<float> depth_data(width * height);
- bool has_alpha = (channels == 4);
- for (int i = 0; i < width * height; ++i) {
- // Alpha is in data[i*4+3] (0-255), convert to float [0, 1]
- // If no alpha channel, default to 1.0 (far plane)
- depth_data[i] = has_alpha ? (data[i * 4 + 3] / 255.0f) : 1.0f;
+ // Quarter-res image: average of each 2x2 block of the half-res image.
+ std::vector<float> m2(w4 * h4 * 3);
+ for (int y4 = 0; y4 < h4; ++y4) {
+ for (int x4 = 0; x4 < w4; ++x4) {
+ for (int c = 0; c < 3; ++c) {
+ int y0 = y4 * 2, x0 = x4 * 2;
+ float v = m1[(y0 * w2 + x0 ) * 3 + c]
+ + m1[(y0 * w2 + x0+1) * 3 + c]
+ + m1[((y0+1) * w2 + x0 ) * 3 + c]
+ + m1[((y0+1) * w2 + x0+1) * 3 + c];
+ m2[(y4 * w4 + x4) * 3 + c] = v * 0.25f;
+ }
+ }
}
- stbi_image_free(data);
- // Create R32Float depth texture
- const WGPUTextureDescriptor depth_desc = {
- .usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst,
- .dimension = WGPUTextureDimension_2D,
- .size = {(uint32_t)(width), (uint32_t)(height), 1},
- .format = WGPUTextureFormat_R32Float,
- .mipLevelCount = 1,
- .sampleCount = 1,
- };
- WGPUTexture depth_texture = wgpuDeviceCreateTexture(device, &depth_desc);
- if (!depth_texture) {
- fprintf(stderr, "Error: failed to create depth texture\n");
- return nullptr;
+ // Nearest upsample to full-res
+ // Pixels whose y/2, x/2 (or y/4, x/4) index lies outside the small image
+ // are written as 0 instead of reading out of bounds.
+ mip1_out.resize(w * h * 3);
+ mip2_out.resize(w * h * 3);
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int i = (y * w + x) * 3;
+ int i1 = ((y/2) * w2 + (x/2)) * 3;
+ int i2 = ((y/4) * w4 + (x/4)) * 3;
+ mip1_out[i ] = (y/2 < h2 && x/2 < w2) ? m1[i1 ] : 0.0f;
+ mip1_out[i+1] = (y/2 < h2 && x/2 < w2) ? m1[i1+1] : 0.0f;
+ mip1_out[i+2] = (y/2 < h2 && x/2 < w2) ? m1[i1+2] : 0.0f;
+ mip2_out[i ] = (y/4 < h4 && x/4 < w4) ? m2[i2 ] : 0.0f;
+ mip2_out[i+1] = (y/4 < h4 && x/4 < w4) ? m2[i2+1] : 0.0f;
+ mip2_out[i+2] = (y/4 < h4 && x/4 < w4) ? m2[i2+2] : 0.0f;
+ }
}
-
- // Write depth data
- const WGPUTexelCopyTextureInfo dst = {.texture = depth_texture,
- .mipLevel = 0};
- const WGPUTexelCopyBufferLayout layout = {
- .bytesPerRow = (uint32_t)(width * sizeof(float)),
- .rowsPerImage = (uint32_t)(height)};
- const WGPUExtent3D size = {(uint32_t)(width), (uint32_t)(height), 1};
- wgpuQueueWriteTexture(queue, &dst, depth_data.data(),
- depth_data.size() * sizeof(float), &layout, &size);
-
- printf("Loaded depth from alpha: %dx%d (%s alpha)\n", width, height,
- has_alpha ? "has" : "no");
-
- return depth_texture;
}
-// Create CNN render pipeline (5 bindings)
-// Takes both intermediate format (RGBA16Float) and final format (BGRA8Unorm)
-static WGPURenderPipeline create_cnn_pipeline(WGPUDevice device,
- WGPUTextureFormat format,
- bool is_final_layer) {
- const char* shader_code = SafeGetAsset(AssetId::ASSET_SHADER_CNN_LAYER);
-
- // Debug: check if shader loaded
- if (!shader_code || shader_code[0] == '\0') {
- fprintf(stderr, "ERROR: CNN shader asset not loaded!\n");
- return nullptr;
- }
- printf("Loaded CNN shader: %zu bytes\n", strlen(shader_code));
-
- WGPUBindGroupLayout bgl =
- BindGroupLayoutBuilder()
- .sampler(0, WGPUShaderStage_Fragment)
- .texture(1, WGPUShaderStage_Fragment)
- .uniform(2, WGPUShaderStage_Vertex | WGPUShaderStage_Fragment)
- .uniform(3, WGPUShaderStage_Fragment)
- .texture(4, WGPUShaderStage_Fragment) // Original input
- .build(device);
+// ---------------------------------------------------------------------------
+// Feature packing: RGB float arrays → feat_tex0 / feat_tex1 (rgba32uint)
+//
+// feat_tex0 (4 u32, f16 pairs — matches load_feat in cnn_v3_enc0.wgsl):
+// [0] albedo.r | albedo.g
+// [1] albedo.b | normal.x (oct, [0,1] — training format)
+// [2] normal.y | depth
+// [3] dzdx | dzdy
+//
+// feat_tex1 (4 u32, u8norm — channel order from cnn_v3_enc0.wgsl load_feat):
+// [0] mat_id | prev.r | prev.g | prev.b
+// [1] mip1.r | mip1.g | mip1.b | mip2.r
+// [2] mip2.g | mip2.b | dif | transp
+// [3] 0
+//
+// Note: normal.xy stored in [0,1] (training format), NOT remapped to [-1,1]
+// like gbuf_pack.wgsl does at runtime. This matches infer_cnn_v3.py.
+// ---------------------------------------------------------------------------
- // Use appropriate format: RGBA16Float for intermediate, BGRA8Unorm for final
- WGPUTextureFormat output_format = is_final_layer
- ? WGPUTextureFormat_BGRA8Unorm
- : WGPUTextureFormat_RGBA16Float;
+// CPU-side per-pixel input planes consumed by pack_features().
+// Every plane is row-major with w*h pixels; values are expected in [0,1].
+struct FeatureImages {
+  int w;                       // image width in pixels
+  int h;                       // image height in pixels
+  std::vector<float> albedo;   // w*h*3, interleaved RGB
+  std::vector<float> normal;   // w*h*2, oct-encoded XY
+  std::vector<float> depth;    // w*h
+  std::vector<float> matid;    // w*h
+  std::vector<float> shadow;   // w*h
+  std::vector<float> transp;   // w*h
+};
- WGPURenderPipeline pipeline =
- RenderPipelineBuilder(device)
- .shader(shader_code) // compose=true by default
- .bind_group_layout(bgl)
- .format(output_format)
- .build();
+// Packs the planes of `img` into two per-pixel arrays of four u32 words each.
+//   feat0 : resized to w*h*4; four pack2x16f words per pixel
+//   feat1 : resized to w*h*4; three pack4x8u words per pixel plus a zero word
+// NOTE(review): the planes in `img` are indexed without size checks; callers
+// presumably guarantee each holds w*h pixels — confirm.
+static void pack_features(const FeatureImages& img,
+ std::vector<uint32_t>& feat0, // w*h*4 u32
+ std::vector<uint32_t>& feat1) // w*h*4 u32
+{
+ const int W = img.w, H = img.h;
+ feat0.resize(W * H * 4);
+ feat1.resize(W * H * 4);
- wgpuBindGroupLayoutRelease(bgl);
- return pipeline;
-}
+ // mip1/mip2 are full-resolution (w*h*3) after compute_mips.
+ std::vector<float> mip1, mip2;
+ compute_mips(img.albedo.data(), W, H, mip1, mip2);
-// Begin render pass with clear
-static WGPURenderPassEncoder begin_render_pass(WGPUCommandEncoder encoder,
- WGPUTextureView view) {
- const WGPURenderPassColorAttachment color_attachment = {
- .view = view,
- .depthSlice = WGPU_DEPTH_SLICE_UNDEFINED,
- .loadOp = WGPULoadOp_Clear,
- .storeOp = WGPUStoreOp_Store,
- .clearValue = {0.0f, 0.0f, 0.0f, 1.0f},
- };
-
- const WGPURenderPassDescriptor pass_desc = {
- .colorAttachmentCount = 1,
- .colorAttachments = &color_attachment,
- };
+ // Light direction used for the diffuse term below; numerically close to
+ // normalize(1, 2, 1). NOTE(review): presumably must match the training-side
+ // constant — confirm.
+ static const float KEY_X = 0.408f, KEY_Y = 0.816f, KEY_Z = 0.408f;
- return wgpuCommandEncoderBeginRenderPass(encoder, &pass_desc);
-}
+ for (int y = 0; y < H; ++y) {
+ for (int x = 0; x < W; ++x) {
+ const int pi = y * W + x;
+ const int i3 = pi * 3;
+ const int i4 = pi * 4;
-// Save PNG output
-static bool save_png(const char* path, const std::vector<uint8_t>& pixels,
- int width, int height) {
- // Convert BGRA → RGBA
- std::vector<uint8_t> rgba(width * height * 4);
- for (int i = 0; i < width * height; ++i) {
- rgba[i * 4 + 0] = pixels[i * 4 + 2]; // R
- rgba[i * 4 + 1] = pixels[i * 4 + 1]; // G
- rgba[i * 4 + 2] = pixels[i * 4 + 0]; // B
- rgba[i * 4 + 3] = pixels[i * 4 + 3]; // A
- }
+ float ar = img.albedo[i3 ];
+ float ag = img.albedo[i3+1];
+ float ab = img.albedo[i3+2];
- if (!stbi_write_png(path, width, height, 4, rgba.data(), width * 4)) {
- fprintf(stderr, "Error: failed to write PNG '%s'\n", path);
- return false;
- }
+ float nx = img.normal[pi * 2 ]; // [0,1]
+ float ny = img.normal[pi * 2 + 1]; // [0,1]
- return true;
-}
+ float d = img.depth[pi];
-// Create horizontal grayscale composite of layer outputs
-// Each layer is already 4x wide (showing 4 channels), stack them vertically
-static bool save_layer_composite(const char* dir, int width, int height,
- int num_layers) {
- // Each layer PNG is already 4x wide with 4 channels side-by-side
- int layer_width = width * 4;
+ // Central finite difference depth gradient
+ // Neighbour indices are clamped to the image, so at a border the
+ // difference spans a single pixel but is still multiplied by 0.5.
+ int xm = (x > 0) ? x-1 : 0;
+ int xp = (x < W-1) ? x+1 : W-1;
+ int ym = (y > 0) ? y-1 : 0;
+ int yp = (y < H-1) ? y+1 : H-1;
+ float dzdx = (img.depth[y * W + xp] - img.depth[y * W + xm]) * 0.5f;
+ float dzdy = (img.depth[yp * W + x ] - img.depth[ym * W + x ]) * 0.5f;
- // Load all layer images (they're already grayscale)
- std::vector<std::vector<uint8_t>> layers(num_layers);
- for (int i = 0; i < num_layers; ++i) {
- char path[512];
- snprintf(path, sizeof(path), "%s/layer_%d.png", dir, i);
+ float mat = img.matid[pi];
+ float shad = img.shadow[pi];
+ float trp = img.transp[pi];
- int w, h, channels;
- uint8_t* data = stbi_load(path, &w, &h, &channels, 1); // Load as grayscale
- if (!data || w != layer_width || h != height) {
- if (data)
- stbi_image_free(data);
- fprintf(stderr,
- "Warning: failed to load layer %d for composite (expected %dx%d, "
- "got %dx%d)\n",
- i, layer_width, height, w, h);
- return false;
- }
+ // Diffuse = max(0, dot(oct_decode(normal), KEY_LIGHT)) * shadow
+ float n3x, n3y, n3z;
+ oct_decode_01(nx, ny, &n3x, &n3y, &n3z);
+ float dif = fmaxf(0.0f, n3x*KEY_X + n3y*KEY_Y + n3z*KEY_Z) * shad;
- layers[i].assign(data, data + (layer_width * height));
- stbi_image_free(data);
- }
+ float m1r = mip1[i3 ], m1g = mip1[i3+1], m1b = mip1[i3+2];
+ float m2r = mip2[i3 ], m2g = mip2[i3+1], m2b = mip2[i3+2];
- // Stack layers vertically
- int composite_height = height * num_layers;
- std::vector<uint8_t> composite(layer_width * composite_height);
+ // prev.rgb = 0 (no temporal history)
+ // feat0: eight f16 values in four words (the encoded normal is stored
+ // as-is in [0,1]; the decoded normal is only used for `dif`).
+ feat0[i4 ] = pack2x16f(ar, ag);
+ feat0[i4+1] = pack2x16f(ab, nx);
+ feat0[i4+2] = pack2x16f(ny, d );
+ feat0[i4+3] = pack2x16f(dzdx, dzdy);
- for (int layer = 0; layer < num_layers; ++layer) {
- for (int y = 0; y < height; ++y) {
- int src_row_offset = y * layer_width;
- int dst_row_offset = (layer * height + y) * layer_width;
- memcpy(&composite[dst_row_offset], &layers[layer][src_row_offset],
- layer_width);
+ // feat1: twelve 8-bit unorm values in three words; the fourth is unused.
+ feat1[i4 ] = pack4x8u(mat, 0.0f, 0.0f, 0.0f); // mat_id, prev.rgb=0
+ feat1[i4+1] = pack4x8u(m1r, m1g, m1b, m2r);
+ feat1[i4+2] = pack4x8u(m2g, m2b, dif, trp);
+ feat1[i4+3] = 0u;
}
}
-
- // Save as grayscale PNG (stacked vertically)
- char composite_path[512];
- snprintf(composite_path, sizeof(composite_path), "%s/layers_composite.png",
- dir);
- if (!stbi_write_png(composite_path, layer_width, composite_height, 1,
- composite.data(), layer_width)) {
- fprintf(stderr, "Error: failed to write composite PNG\n");
- return false;
- }
-
- printf("Saved layer composite to '%s' (%dx%d, 4 layers stacked vertically)\n",
- composite_path, layer_width, composite_height);
- return true;
}
-// Save PPM output (fallback)
-static bool save_ppm(const char* path, const std::vector<uint8_t>& pixels,
- int width, int height) {
- FILE* f = fopen(path, "wb");
- if (!f) {
- fprintf(stderr, "Error: failed to open '%s' for writing\n", path);
- return false;
- }
-
- fprintf(f, "P6\n%d %d\n255\n", width, height);
- for (int i = 0; i < width * height; ++i) {
- const uint8_t rgb[3] = {pixels[i * 4 + 2], // R
- pixels[i * 4 + 1], // G
- pixels[i * 4 + 0]}; // B
- fwrite(rgb, 1, 3, f);
- }
+// ---------------------------------------------------------------------------
+// GPU texture helpers
+// ---------------------------------------------------------------------------
- fclose(f);
- return true;
+// Creates a W x H, single-mip, single-sample 2D RGBA32Uint texture usable as
+// a texture binding and as a copy destination.
+static WGPUTexture make_feat_tex(WGPUDevice dev, int W, int H) {
+  WGPUTextureDescriptor desc = {};
+  desc.dimension = WGPUTextureDimension_2D;
+  desc.format = WGPUTextureFormat_RGBA32Uint;
+  desc.size.width = (uint32_t)W;
+  desc.size.height = (uint32_t)H;
+  desc.size.depthOrArrayLayers = 1;
+  desc.mipLevelCount = 1;
+  desc.sampleCount = 1;
+  desc.usage = WGPUTextureUsage_CopyDst | WGPUTextureUsage_TextureBinding;
+  return wgpuDeviceCreateTexture(dev, &desc);
}
-// CNN v2 structures (matching CNNv2Effect)
-struct CNNv2LayerInfo {
- uint32_t kernel_size;
- uint32_t in_channels;
- uint32_t out_channels;
- uint32_t weight_offset;
- uint32_t weight_count;
-};
-
-struct CNNv2LayerParams {
- uint32_t kernel_size;
- uint32_t in_channels;
- uint32_t out_channels;
- uint32_t weight_offset;
- uint32_t is_output_layer;
- float blend_amount;
- uint32_t is_layer_0;
-};
-
-struct CNNv2StaticFeatureParams {
- uint32_t mip_level;
- uint32_t padding[3];
-};
-
-// Convert RGBA32Uint (packed f16) texture to BGRA8Unorm
-static std::vector<uint8_t>
-readback_rgba32uint_to_bgra8(WGPUDevice device, WGPUQueue queue,
- WGPUTexture texture, int width, int height) {
- // Create staging buffer
- const uint32_t bytes_per_row = width * 16; // 4×u32 per pixel
- const uint32_t padded_bytes_per_row = (bytes_per_row + 255) & ~255;
- const size_t buffer_size = padded_bytes_per_row * height;
-
- WGPUBufferDescriptor buffer_desc = {};
- buffer_desc.size = buffer_size;
- buffer_desc.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead;
- buffer_desc.mappedAtCreation = false;
+// Creates a W x H, single-mip, single-sample 2D RGBA16Float texture usable as
+// a storage binding and as a copy source.
+static WGPUTexture make_output_tex(WGPUDevice dev, int W, int H) {
+  WGPUTextureDescriptor desc = {};
+  desc.dimension = WGPUTextureDimension_2D;
+  desc.format = WGPUTextureFormat_RGBA16Float;
+  desc.size.width = (uint32_t)W;
+  desc.size.height = (uint32_t)H;
+  desc.size.depthOrArrayLayers = 1;
+  desc.mipLevelCount = 1;
+  desc.sampleCount = 1;
+  desc.usage = WGPUTextureUsage_CopySrc | WGPUTextureUsage_StorageBinding;
+  return wgpuDeviceCreateTexture(dev, &desc);
+}
- WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &buffer_desc);
+// Creates a 2D view of `tex` in format `fmt`, covering one mip level and one
+// array layer.
+static WGPUTextureView make_view(WGPUTexture tex, WGPUTextureFormat fmt) {
+  WGPUTextureViewDescriptor view_desc = {};
+  view_desc.dimension = WGPUTextureViewDimension_2D;
+  view_desc.format = fmt;
+  view_desc.arrayLayerCount = 1;
+  view_desc.mipLevelCount = 1;
+  return wgpuTextureCreateView(tex, &view_desc);
+}
- // Copy texture to buffer
- WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(device, nullptr);
+// Uploads a tightly packed W x H image of 16-byte texels (4 x u32 per pixel)
+// from `data` into `tex` through the queue.
+//   data : W*H*4 u32 words, row-major, no row padding
+static void upload_tex(WGPUQueue queue, WGPUTexture tex,
+                       const uint32_t* data, int W, int H) {
+  const uint32_t bytes_per_texel = 16u;  // 4 x u32
+  const uint32_t width = (uint32_t)W;
+  const uint32_t height = (uint32_t)H;
+
+  WGPUTexelCopyTextureInfo dst = {};
+  dst.texture = tex;
+  WGPUTexelCopyBufferLayout layout = {};
+  layout.bytesPerRow = width * bytes_per_texel;
+  layout.rowsPerImage = height;
+  WGPUExtent3D ext = {width, height, 1};
+  // Compute the byte count in size_t: W * H * 16 evaluated in int overflows
+  // for large images before it is ever widened.
+  const size_t total_bytes = (size_t)width * (size_t)height * bytes_per_texel;
+  wgpuQueueWriteTexture(queue, &dst, data, total_bytes, &layout, &ext);
+}
- WGPUTexelCopyTextureInfo src = {};
- src.texture = texture;
- src.mipLevel = 0;
+// ---------------------------------------------------------------------------
+// RGBA16Float readback
+// ---------------------------------------------------------------------------
- WGPUTexelCopyBufferInfo dst = {};
- dst.buffer = staging;
- dst.layout.bytesPerRow = padded_bytes_per_row;
- dst.layout.rowsPerImage = height;
+// Returns the half-precision bit pattern of `f`. Despite the name, the
+// argument is a float value (not half bits); this forwards to f32_to_f16().
+static uint16_t fp16_bits_to_f16(float f) {
+  const uint16_t half_bits = f32_to_f16(f);
+  return half_bits;
+}
+// Expands an IEEE-754 binary16 bit pattern to a float. The conversion is
+// exact: signed zeros, subnormals, normals, infinities and NaNs are all
+// handled.
+static float fp16_bits_to_f32(uint16_t h) {
+  const uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
+  const uint32_t exp = (h & 0x7C00u) >> 10;
+  uint32_t mant = h & 0x03FFu;
+  uint32_t bits;
+  if (exp == 0) {
+    if (mant == 0) {
+      bits = sign;  // +/- 0
+    } else {
+      // Subnormal half: value = mant * 2^-24. Shift the mantissa up until its
+      // implicit bit (bit 10) is set, lowering the exponent for each shift.
+      uint32_t exp32 = 113u;  // biased float exponent of 2^-14
+      do {
+        mant <<= 1;
+        --exp32;
+      } while ((mant & 0x0400u) == 0);
+      bits = sign | (exp32 << 23) | ((mant & 0x03FFu) << 13);
+    }
+  } else if (exp == 31) {
+    bits = sign | 0x7F800000u | (mant << 13);  // Inf / NaN
+  } else {
+    bits = sign | ((exp + 112u) << 23) | (mant << 13);  // rebias 15 -> 127
+  }
+  float r;
+  memcpy(&r, &bits, sizeof r);
+  return r;
+}
- WGPUExtent3D copy_size = {(uint32_t)(width), (uint32_t)(height), 1};
+// Completion record written by a buffer-map callback and polled by the caller.
+struct MapState {
+  bool done = false;               // set once the callback has run
+  WGPUMapAsyncStatus status = {};  // status value passed to the callback
+};
- wgpuCommandEncoderCopyTextureToBuffer(encoder, &src, &dst, &copy_size);
+// Copies an RGBA16Float texture into a mappable staging buffer, maps it, and
+// returns the texels as W*H*4 floats (row-major, 4 channels per pixel).
+// If mapping does not complete successfully the returned vector is all zeros.
+static std::vector<float> readback_rgba16f(WGPUDevice device, WGPUQueue queue,
+ WGPUTexture tex, int W, int H) {
+ // 4 channels x 2 bytes; rows in the staging buffer are padded up to a
+ // multiple of 256 bytes.
+ const uint32_t bytes_per_px = 8;
+ const uint32_t raw_bpr = (uint32_t)(W * bytes_per_px);
+ const uint32_t aligned_bpr = ((raw_bpr + 255u) / 256u) * 256u;
+ const size_t buf_size = (size_t)aligned_bpr * (size_t)H;
- WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr);
- wgpuQueueSubmit(queue, 1, &commands);
- wgpuCommandBufferRelease(commands);
- wgpuCommandEncoderRelease(encoder);
+ WGPUBufferDescriptor bd = {};
+ bd.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead;
+ bd.size = buf_size;
+ WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &bd);
- // Wait for copy to complete
+ // Record and submit the texture -> buffer copy, then poll the device.
+ WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(device, nullptr);
+ WGPUTexelCopyTextureInfo src = {}; src.texture = tex;
+ WGPUTexelCopyBufferInfo dst = {};
+ dst.buffer = staging;
+ dst.layout.bytesPerRow = aligned_bpr;
+ dst.layout.rowsPerImage = (uint32_t)H;
+ WGPUExtent3D ext = {(uint32_t)W, (uint32_t)H, 1};
+ wgpuCommandEncoderCopyTextureToBuffer(enc, &src, &dst, &ext);
+ WGPUCommandBuffer cmds = wgpuCommandEncoderFinish(enc, nullptr);
+ wgpuQueueSubmit(queue, 1, &cmds);
+ wgpuCommandBufferRelease(cmds);
+ wgpuCommandEncoderRelease(enc);
wgpuDevicePoll(device, true, nullptr);
- // Map and read buffer
- struct MapState {
- bool done = false;
+ // The callback records the map status in `ms` and flags completion.
+ MapState ms = {};
+ WGPUBufferMapCallbackInfo mi = {};
+ mi.mode = WGPUCallbackMode_AllowProcessEvents;
+ mi.callback = [](WGPUMapAsyncStatus s, WGPUStringView, void* u, void*) {
+ auto* st = (MapState*)u; st->status = s; st->done = true;
};
- MapState map_state;
-
- auto map_cb = [](WGPUMapAsyncStatus status, WGPUStringView message,
- void* userdata1, void* userdata2) {
- (void)message;
- (void)userdata2;
- MapState* state = (MapState*)userdata1;
- state->done = (status == WGPUMapAsyncStatus_Success);
- };
-
- WGPUBufferMapCallbackInfo map_info = {};
- map_info.mode = WGPUCallbackMode_AllowProcessEvents;
- map_info.callback = map_cb;
- map_info.userdata1 = &map_state;
-
- wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buffer_size, map_info);
-
- // Wait for mapping to complete
- for (int i = 0; i < 100 && !map_state.done; ++i) {
+ mi.userdata1 = &ms;
+ wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buf_size, mi);
+ // Poll at most 200 times for the callback to fire.
+ for (int i = 0; i < 200 && !ms.done; ++i)
wgpuDevicePoll(device, true, nullptr);
- }
-
- if (!map_state.done) {
- fprintf(stderr, "Error: Buffer mapping timed out\n");
- wgpuBufferRelease(staging);
- return std::vector<uint8_t>();
- }
-
- const uint32_t* mapped =
- (const uint32_t*)wgpuBufferGetConstMappedRange(staging, 0, buffer_size);
-
- std::vector<uint8_t> result(width * height * 4);
- // Unpack f16 to u8 (BGRA)
- for (int y = 0; y < height; ++y) {
- const uint32_t* row =
- (const uint32_t*)((const uint8_t*)mapped + y * padded_bytes_per_row);
- for (int x = 0; x < width; ++x) {
- // Read 4×u32 (8×f16)
- uint32_t data[4];
- data[0] = row[x * 4 + 0];
- data[1] = row[x * 4 + 1];
- data[2] = row[x * 4 + 2];
- data[3] = row[x * 4 + 3];
-
- // Extract RGBA channels (first 4 f16 values)
- uint16_t r16 = data[0] & 0xFFFF;
- uint16_t g16 = (data[0] >> 16) & 0xFFFF;
- uint16_t b16 = data[1] & 0xFFFF;
- uint16_t a16 = (data[1] >> 16) & 0xFFFF;
-
- // Convert f16 to f32 (simple decode)
- auto f16_to_f32 = [](uint16_t h) -> float {
- uint32_t sign = (h >> 15) & 1;
- uint32_t exp = (h >> 10) & 0x1F;
- uint32_t frac = h & 0x3FF;
-
- if (exp == 0) {
- if (frac == 0)
- return sign ? -0.0f : 0.0f;
- // Denormal
- float val = frac / 1024.0f / 16384.0f;
- return sign ? -val : val;
+ // Pre-filled with zeros; only overwritten when the map succeeded.
+ std::vector<float> pixels(W * H * 4, 0.0f);
+ if (ms.done && ms.status == WGPUMapAsyncStatus_Success) {
+ const uint8_t* mapped = (const uint8_t*)
+ wgpuBufferGetConstMappedRange(staging, 0, buf_size);
+ if (mapped) {
+ // Step through rows by the padded pitch; each texel is four f16 values.
+ for (int y = 0; y < H; ++y) {
+ const uint16_t* row = (const uint16_t*)(mapped + (size_t)y * aligned_bpr);
+ for (int x = 0; x < W; ++x) {
+ for (int c = 0; c < 4; ++c)
+ pixels[(y * W + x) * 4 + c] = fp16_bits_to_f32(row[x * 4 + c]);
}
- if (exp == 31) {
- return frac ? NAN : (sign ? -INFINITY : INFINITY);
- }
-
- int32_t e = exp - 15;
- float val = (1.0f + frac / 1024.0f) * powf(2.0f, e);
- return sign ? -val : val;
- };
-
- float r = f16_to_f32(r16);
- float g = f16_to_f32(g16);
- float b = f16_to_f32(b16);
- float a = f16_to_f32(a16);
-
- // Clamp to [0,1] and convert to u8
- auto clamp_u8 = [](float v) -> uint8_t {
- if (v <= 0.0f)
- return 0;
- if (v >= 1.0f)
- return 255;
- return (uint8_t)(v * 255.0f + 0.5f);
- };
-
- result[(y * width + x) * 4 + 0] = clamp_u8(b);
- result[(y * width + x) * 4 + 1] = clamp_u8(g);
- result[(y * width + x) * 4 + 2] = clamp_u8(r);
- result[(y * width + x) * 4 + 3] = clamp_u8(a);
+ }
}
}
-
+ // NOTE(review): Unmap runs even when the map failed or never completed, and
+ // no error is reported in that case — confirm this is intended.
wgpuBufferUnmap(staging);
wgpuBufferRelease(staging);
-
- return result;
+ return pixels;
}
-// Read RGBA32Uint and create 4x wide grayscale composite (each channel
-// side-by-side)
-static std::vector<uint8_t>
-readback_rgba32uint_to_composite(WGPUDevice device, WGPUQueue queue,
- WGPUTexture texture, int width, int height) {
- // First get BGRA8 data
- std::vector<uint8_t> bgra =
- readback_rgba32uint_to_bgra8(device, queue, texture, width, height);
- if (bgra.empty())
- return {};
-
- // Create 4x wide grayscale image (one channel per horizontal strip)
- int composite_width = width * 4;
- std::vector<uint8_t> composite(composite_width * height);
-
- for (int y = 0; y < height; ++y) {
- for (int x = 0; x < width; ++x) {
- int src_idx = (y * width + x) * 4;
- uint8_t b = bgra[src_idx + 0];
- uint8_t g = bgra[src_idx + 1];
- uint8_t r = bgra[src_idx + 2];
- uint8_t a = bgra[src_idx + 3];
+// ---------------------------------------------------------------------------
+// Image I/O helpers
+// ---------------------------------------------------------------------------
- // Convert each channel to grayscale luminance
- auto to_gray = [](uint8_t val) -> uint8_t { return val; };
-
- // Place each channel in its horizontal strip
- composite[y * composite_width + (0 * width + x)] =
- to_gray(r); // Channel 0
- composite[y * composite_width + (1 * width + x)] =
- to_gray(g); // Channel 1
- composite[y * composite_width + (2 * width + x)] =
- to_gray(b); // Channel 2
- composite[y * composite_width + (3 * width + x)] =
- to_gray(a); // Channel 3
- }
+// Loads `path` as 3-channel 8-bit RGB and returns w*h*3 floats scaled by
+// 1/255. On success the image size is stored in *out_w / *out_h; on failure
+// an error is printed, an empty vector is returned and the outputs are left
+// untouched.
+static std::vector<float> load_png_rgb(const char* path, int* out_w, int* out_h) {
+  int width, height, channels;
+  uint8_t* bytes = stbi_load(path, &width, &height, &channels, 3);
+  if (!bytes) {
+    fprintf(stderr, "Error: cannot load '%s'\n", path);
+    return {};
}
-
- return composite;
+  *out_w = width;
+  *out_h = height;
+  const int count = width * height * 3;
+  std::vector<float> out(count);
+  for (int k = 0; k < count; ++k) {
+    out[k] = bytes[k] / 255.0f;
+  }
+  stbi_image_free(bytes);
+  return out;
}
-// Process image with CNN v2
-static bool process_cnn_v2(WGPUDevice device, WGPUQueue queue,
- WGPUInstance instance, WGPUTexture input_texture,
- int width, int height, const Args& args) {
- printf("Using CNN v2 (storage buffer architecture)\n");
-
- // Load weights (from file or asset system)
- size_t weights_size = 0;
- const uint8_t* weights_data = nullptr;
- std::vector<uint8_t> file_weights; // For file-based loading
-
- if (args.weights_path) {
- // Load from file
- printf("Loading weights from '%s'...\n", args.weights_path);
- FILE* f = fopen(args.weights_path, "rb");
- if (!f) {
- fprintf(stderr, "Error: failed to open weights file '%s'\n",
- args.weights_path);
- return false;
- }
-
- fseek(f, 0, SEEK_END);
- weights_size = ftell(f);
- fseek(f, 0, SEEK_SET);
-
- file_weights.resize(weights_size);
- size_t read = fread(file_weights.data(), 1, weights_size, f);
- fclose(f);
-
- if (read != weights_size) {
- fprintf(stderr, "Error: failed to read weights file\n");
- return false;
- }
-
- weights_data = file_weights.data();
- } else {
- // Load from asset system
- weights_data =
- (const uint8_t*)GetAsset(AssetId::ASSET_WEIGHTS_CNN_V2, &weights_size);
+// Load the first two channels (R, G) of a PNG as interleaved floats.
+//   path:   file to read (stbi_load is asked for 3 channels; B is dropped).
+//   ew, eh: size the image must have.
+// Returns ew*eh*2 values, each byte divided by 255. If the file cannot be read
+// or its size differs from ew x eh, prints a warning and returns a buffer of
+// the expected size filled with 0.5.
+static std::vector<float> load_png_rg(const char* path, int ew, int eh) {
+  int width, height, channels;
+  uint8_t* pixels = stbi_load(path, &width, &height, &channels, 3);
+  const bool usable = pixels != nullptr && width == ew && height == eh;
+  if (!usable) {
+    if (pixels != nullptr) stbi_image_free(pixels);
+    fprintf(stderr, "Warning: cannot load normal '%s' — using (0.5,0.5)\n", path);
+    return std::vector<float>(ew * eh * 2, 0.5f);
 }
-
- if (!weights_data || weights_size < 20) {
- fprintf(stderr, "Error: CNN v2 weights not available\n");
- return false;
+  const int pixel_count = width * height;
+  std::vector<float> rg(pixel_count * 2);
+  for (int p = 0; p < pixel_count; ++p) {
+    const uint8_t* src = pixels + p * 3;
+    rg[p * 2] = src[0] / 255.0f;
+    rg[p * 2 + 1] = src[1] / 255.0f;
 }
+  stbi_image_free(pixels);
+  return rg;
+}
- // Parse header
- const uint32_t* header = (const uint32_t*)weights_data;
- uint32_t magic = header[0];
- uint32_t version = header[1];
- uint32_t num_layers = header[2];
- uint32_t total_weights = header[3];
-
- if (magic != 0x324e4e43) { // 'CNN2'
- fprintf(stderr, "Error: Invalid CNN v2 weights magic\n");
- return false;
+// Load a PNG through stbi_load_16 as one 16-bit channel, scaled to [0,1].
+//   path:   file to read.
+//   ew, eh: size the image must have.
+// Returns ew*eh values, each sample divided by 65535. If the file cannot be
+// read or its size differs from ew x eh, prints a warning and returns a buffer
+// of the expected size filled with 0.
+static std::vector<float> load_png_depth16(const char* path, int ew, int eh) {
+  int width, height, channels;
+  uint16_t* samples = stbi_load_16(path, &width, &height, &channels, 1);
+  const bool usable = samples != nullptr && width == ew && height == eh;
+  if (!usable) {
+    if (samples != nullptr) stbi_image_free(samples);
+    fprintf(stderr, "Warning: cannot load depth '%s' — using 0\n", path);
+    return std::vector<float>(ew * eh, 0.0f);
 }
+  const int sample_count = width * height;
+  std::vector<float> depth(sample_count);
+  for (int k = 0; k < sample_count; ++k) {
+    depth[k] = samples[k] / 65535.0f;
+  }
+  stbi_image_free(samples);
+  return depth;
+}
- uint32_t mip_level = 0;
- if (version == 2) {
- mip_level = header[4];
+// Load a PNG as one 8-bit channel, scaled to [0,1].
+//   path:        file to read (stbi_load is asked for 1 channel).
+//   ew, eh:      size the image must have.
+//   default_val: fill value used when the image is unusable.
+// Returns ew*eh values, each byte divided by 255. If the file cannot be read
+// or its size differs from ew x eh, returns a buffer of the expected size
+// filled with default_val; no message is printed in that case.
+static std::vector<float> load_png_gray(const char* path, int ew, int eh,
+                                        float default_val = 0.0f) {
+  int width, height, channels;
+  uint8_t* gray = stbi_load(path, &width, &height, &channels, 1);
+  if (gray != nullptr && width == ew && height == eh) {
+    const int sample_count = width * height;
+    std::vector<float> values(sample_count);
+    for (int k = 0; k < sample_count; ++k) {
+      values[k] = gray[k] / 255.0f;
+    }
+    stbi_image_free(gray);
+    return values;
 }
+  // Unusable image: release it if it was loaded, then fall back to the default.
+  if (gray != nullptr) stbi_image_free(gray);
+  return std::vector<float>(ew * eh, default_val);
+}
- printf("Loaded CNN v2 weights: %u layers, %u weights, version %u\n",
- num_layers, total_weights, version);
-
- // Parse layer info
- const uint32_t header_u32_count = (version == 1) ? 4 : 5;
- const uint32_t* layer_data = header + header_u32_count;
- std::vector<CNNv2LayerInfo> layer_info;
-
- for (uint32_t i = 0; i < num_layers; ++i) {
- CNNv2LayerInfo info;
- info.kernel_size = layer_data[i * 5 + 0];
- info.in_channels = layer_data[i * 5 + 1];
- info.out_channels = layer_data[i * 5 + 2];
- info.weight_offset = layer_data[i * 5 + 3];
- info.weight_count = layer_data[i * 5 + 4];
- layer_info.push_back(info);
-
- printf(" Layer %u: %ux%u conv, %u→%u channels, %u weights\n", i,
- info.kernel_size, info.kernel_size, info.in_channels,
- info.out_channels, info.weight_count);
+// Convert float RGBA pixels to 8 bits per channel and write them to `path`
+// as a PNG.
+//   rgba_f32: interleaved RGBA, at least w*h*4 values. Each value is clamped
+//             to [0,1] and rounded to the nearest 8-bit level; NaN becomes 0.
+//   w, h:     image size in pixels; both must be positive.
+// Returns true on success. Prints an error and returns false when the
+// arguments are invalid or the PNG cannot be written.
+static bool save_png(const char* path, const std::vector<float>& rgba_f32,
+                     int w, int h) {
+  if (w <= 0 || h <= 0) {
+    fprintf(stderr, "Error: invalid image size %dx%d for '%s'\n", w, h, path);
+    return false;
+  }
+  // Count in size_t so large images cannot overflow int arithmetic.
+  const size_t value_count = (size_t)w * (size_t)h * 4;
+  if (rgba_f32.size() < value_count) {
+    fprintf(stderr, "Error: pixel buffer too small for '%s'\n", path);
+    return false;
+  }
+  std::vector<uint8_t> rgba8(value_count);
+  for (size_t i = 0; i < value_count; ++i) {
+    // Clamp while still in float: converting NaN, Inf or a huge value to an
+    // integer is undefined behaviour. NaN fails the first comparison, so it
+    // maps to 0.
+    const float f = rgba_f32[i];
+    const float unit = (f > 0.0f) ? (f < 1.0f ? f : 1.0f) : 0.0f;
+    rgba8[i] = (uint8_t)(unit * 255.0f + 0.5f);
 }
-
- // Create weights storage buffer (skip header + layer info, upload only
- // weights)
- size_t header_size = 20; // 5 u32
- size_t layer_info_size = 20 * layer_info.size(); // 5 u32 per layer
- size_t weights_offset = header_size + layer_info_size;
- size_t weights_only_size = weights_size - weights_offset;
-
- WGPUBufferDescriptor weights_buffer_desc = {};
- weights_buffer_desc.size = weights_only_size;
- weights_buffer_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst;
- weights_buffer_desc.mappedAtCreation = false;
-
- WGPUBuffer weights_buffer =
- wgpuDeviceCreateBuffer(device, &weights_buffer_desc);
- wgpuQueueWriteBuffer(queue, weights_buffer, 0, weights_data + weights_offset,
- weights_only_size);
-
- // Create input view
- WGPUTextureView input_view =
- gpu_create_texture_view_2d(input_texture, WGPUTextureFormat_BGRA8Unorm);
-
- // Create static features texture (RGBA32Uint)
- const WGPUTextureDescriptor static_desc = {
- .usage = WGPUTextureUsage_StorageBinding |
- WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopySrc,
- .dimension = WGPUTextureDimension_2D,
- .size = {(uint32_t)(width), (uint32_t)(height), 1},
- .format = WGPUTextureFormat_RGBA32Uint,
- .mipLevelCount = 1,
- .sampleCount = 1,
- };
- WGPUTexture static_features_tex =
- wgpuDeviceCreateTexture(device, &static_desc);
- WGPUTextureView static_features_view =
- wgpuTextureCreateView(static_features_tex, nullptr);
-
- // Load depth from input alpha channel (or 1.0 if no alpha)
- WGPUTexture depth_texture =
- load_depth_from_alpha(device, queue, args.input_path, width, height);
- if (!depth_texture) {
- wgpuTextureViewRelease(static_features_view);
- wgpuTextureRelease(static_features_tex);
- wgpuBufferRelease(weights_buffer);
- wgpuTextureViewRelease(input_view);
+ if (!stbi_write_png(path, w, h, 4, rgba8.data(), w * 4)) {
+ fprintf(stderr, "Error: failed to write '%s'\n", path);
return false;
}
- WGPUTextureView depth_view = wgpuTextureCreateView(depth_texture, nullptr);
-
- // Create layer textures (ping-pong)
- WGPUTexture layer_textures[2] = {
- wgpuDeviceCreateTexture(device, &static_desc),
- wgpuDeviceCreateTexture(device, &static_desc),
- };
- WGPUTextureView layer_views[2] = {
- wgpuTextureCreateView(layer_textures[0], nullptr),
- wgpuTextureCreateView(layer_textures[1], nullptr),
- };
+ return true;
+}
- // Load shaders
- const char* static_shader = SafeGetAsset(AssetId::ASSET_SHADER_CNN_V2_STATIC);
- const char* layer_shader = SafeGetAsset(AssetId::ASSET_SHADER_CNN_V2_COMPUTE);
+// ---------------------------------------------------------------------------
+// Weight loading
+// ---------------------------------------------------------------------------
- if (!static_shader[0] || !layer_shader[0]) {
- fprintf(stderr, "Error: CNN v2 shaders not available\n");
- wgpuTextureViewRelease(static_features_view);
- wgpuTextureRelease(static_features_tex);
- wgpuTextureViewRelease(depth_view);
- wgpuTextureRelease(depth_texture);
- wgpuTextureViewRelease(layer_views[0]);
- wgpuTextureViewRelease(layer_views[1]);
- wgpuTextureRelease(layer_textures[0]);
- wgpuTextureRelease(layer_textures[1]);
- wgpuBufferRelease(weights_buffer);
- wgpuTextureViewRelease(input_view);
+// Read a binary weights file into `out` as 32-bit words.
+//   path: file to read; it is opened in binary mode and read whole.
+//   out:  resized to (file size / 4) elements and filled from the file.
+// The file size must be positive and a multiple of 4. Prints an error to
+// stderr and returns false if the file cannot be opened, has a bad size, or
+// the read comes up short; returns true on success.
+static bool load_weights_bin(const char* path, std::vector<uint32_t>& out) {
+ FILE* f = fopen(path, "rb");
+ if (!f) {
+ fprintf(stderr, "Error: cannot open weights '%s'\n", path);
 return false;
 }
-
- // Create static feature params buffer
- WGPUBufferDescriptor static_params_desc = {};
- static_params_desc.size = sizeof(CNNv2StaticFeatureParams);
- static_params_desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst;
- static_params_desc.mappedAtCreation = false;
-
- WGPUBuffer static_params_buffer =
- wgpuDeviceCreateBuffer(device, &static_params_desc);
-
- CNNv2StaticFeatureParams static_params;
- static_params.mip_level = mip_level;
- static_params.padding[0] = 0;
- static_params.padding[1] = 0;
- static_params.padding[2] = 0;
- wgpuQueueWriteBuffer(queue, static_params_buffer, 0, &static_params,
- sizeof(static_params));
-
- // Create linear sampler for bilinear interpolation
- WGPUSamplerDescriptor linear_sampler_desc = {};
- linear_sampler_desc.addressModeU = WGPUAddressMode_ClampToEdge;
- linear_sampler_desc.addressModeV = WGPUAddressMode_ClampToEdge;
- linear_sampler_desc.addressModeW = WGPUAddressMode_ClampToEdge;
- linear_sampler_desc.magFilter = WGPUFilterMode_Linear;
- linear_sampler_desc.minFilter = WGPUFilterMode_Linear;
- linear_sampler_desc.mipmapFilter = WGPUMipmapFilterMode_Linear;
- linear_sampler_desc.lodMinClamp = 0.0f;
- linear_sampler_desc.lodMaxClamp = 32.0f;
- linear_sampler_desc.maxAnisotropy = 1;
-
- WGPUSampler linear_sampler =
- wgpuDeviceCreateSampler(device, &linear_sampler_desc);
-
- // Create static features compute pipeline
- WGPUShaderSourceWGSL static_wgsl = {};
- static_wgsl.chain.sType = WGPUSType_ShaderSourceWGSL;
- static_wgsl.code = str_view(static_shader);
-
- WGPUShaderModuleDescriptor static_module_desc = {};
- static_module_desc.nextInChain = &static_wgsl.chain;
-
- WGPUShaderModule static_module =
- wgpuDeviceCreateShaderModule(device, &static_module_desc);
-
- // Bind group layout: 0=input, 1=input_mip1, 2=input_mip2, 3=depth, 4=output,
- // 5=params, 6=linear_sampler
- WGPUBindGroupLayoutEntry static_bgl_entries[7] = {};
- static_bgl_entries[0].binding = 0;
- static_bgl_entries[0].visibility = WGPUShaderStage_Compute;
- static_bgl_entries[0].texture.sampleType = WGPUTextureSampleType_Float;
- static_bgl_entries[0].texture.viewDimension = WGPUTextureViewDimension_2D;
-
- static_bgl_entries[1].binding = 1;
- static_bgl_entries[1].visibility = WGPUShaderStage_Compute;
- static_bgl_entries[1].texture.sampleType = WGPUTextureSampleType_Float;
- static_bgl_entries[1].texture.viewDimension = WGPUTextureViewDimension_2D;
-
- static_bgl_entries[2].binding = 2;
- static_bgl_entries[2].visibility = WGPUShaderStage_Compute;
- static_bgl_entries[2].texture.sampleType = WGPUTextureSampleType_Float;
- static_bgl_entries[2].texture.viewDimension = WGPUTextureViewDimension_2D;
-
- static_bgl_entries[3].binding = 3;
- static_bgl_entries[3].visibility = WGPUShaderStage_Compute;
- static_bgl_entries[3].texture.sampleType =
- WGPUTextureSampleType_UnfilterableFloat;
- static_bgl_entries[3].texture.viewDimension = WGPUTextureViewDimension_2D;
-
- static_bgl_entries[4].binding = 4;
- static_bgl_entries[4].visibility = WGPUShaderStage_Compute;
- static_bgl_entries[4].storageTexture.access =
- WGPUStorageTextureAccess_WriteOnly;
- static_bgl_entries[4].storageTexture.format = WGPUTextureFormat_RGBA32Uint;
- static_bgl_entries[4].storageTexture.viewDimension =
- WGPUTextureViewDimension_2D;
-
- static_bgl_entries[5].binding = 5;
- static_bgl_entries[5].visibility = WGPUShaderStage_Compute;
- static_bgl_entries[5].buffer.type = WGPUBufferBindingType_Uniform;
- static_bgl_entries[5].buffer.minBindingSize =
- sizeof(CNNv2StaticFeatureParams);
-
- static_bgl_entries[6].binding = 6;
- static_bgl_entries[6].visibility = WGPUShaderStage_Compute;
- static_bgl_entries[6].sampler.type = WGPUSamplerBindingType_Filtering;
-
- WGPUBindGroupLayoutDescriptor static_bgl_desc = {};
- static_bgl_desc.entryCount = 7;
- static_bgl_desc.entries = static_bgl_entries;
-
- WGPUBindGroupLayout static_bgl =
- wgpuDeviceCreateBindGroupLayout(device, &static_bgl_desc);
-
- WGPUPipelineLayoutDescriptor static_pl_desc = {};
- static_pl_desc.bindGroupLayoutCount = 1;
- static_pl_desc.bindGroupLayouts = &static_bgl;
-
- WGPUPipelineLayout static_pl =
- wgpuDeviceCreatePipelineLayout(device, &static_pl_desc);
-
- WGPUComputePipelineDescriptor static_pipeline_desc = {};
- static_pipeline_desc.compute.module = static_module;
- static_pipeline_desc.compute.entryPoint = str_view("main");
- static_pipeline_desc.layout = static_pl;
-
- WGPUComputePipeline static_pipeline =
- wgpuDeviceCreateComputePipeline(device, &static_pipeline_desc);
-
- wgpuShaderModuleRelease(static_module);
- wgpuPipelineLayoutRelease(static_pl);
-
- // Create static bind group (use input as all mips for simplicity)
- WGPUBindGroupEntry static_bg_entries[7] = {};
- static_bg_entries[0].binding = 0;
- static_bg_entries[0].textureView = input_view;
- static_bg_entries[1].binding = 1;
- static_bg_entries[1].textureView = input_view;
- static_bg_entries[2].binding = 2;
- static_bg_entries[2].textureView = input_view;
- static_bg_entries[3].binding = 3;
- static_bg_entries[3].textureView =
- depth_view; // Depth from alpha channel (matches training)
- static_bg_entries[4].binding = 4;
- static_bg_entries[4].textureView = static_features_view;
- static_bg_entries[5].binding = 5;
- static_bg_entries[5].buffer = static_params_buffer;
- static_bg_entries[5].size = sizeof(CNNv2StaticFeatureParams);
- static_bg_entries[6].binding = 6;
- static_bg_entries[6].sampler = linear_sampler;
-
- WGPUBindGroupDescriptor static_bg_desc = {};
- static_bg_desc.layout = static_bgl;
- static_bg_desc.entryCount = 7;
- static_bg_desc.entries = static_bg_entries;
-
- WGPUBindGroup static_bg = wgpuDeviceCreateBindGroup(device, &static_bg_desc);
-
- wgpuBindGroupLayoutRelease(static_bgl);
-
- // Create layer compute pipeline
- WGPUShaderSourceWGSL layer_wgsl = {};
- layer_wgsl.chain.sType = WGPUSType_ShaderSourceWGSL;
- layer_wgsl.code = str_view(layer_shader);
-
- WGPUShaderModuleDescriptor layer_module_desc = {};
- layer_module_desc.nextInChain = &layer_wgsl.chain;
-
- WGPUShaderModule layer_module =
- wgpuDeviceCreateShaderModule(device, &layer_module_desc);
-
- // Layer bind group layout:
- // 0=static_features, 1=layer_input, 2=output, 3=weights, 4=params,
- // 5=original
- WGPUBindGroupLayoutEntry layer_bgl_entries[6] = {};
- layer_bgl_entries[0].binding = 0;
- layer_bgl_entries[0].visibility = WGPUShaderStage_Compute;
- layer_bgl_entries[0].texture.sampleType = WGPUTextureSampleType_Uint;
- layer_bgl_entries[0].texture.viewDimension = WGPUTextureViewDimension_2D;
-
- layer_bgl_entries[1].binding = 1;
- layer_bgl_entries[1].visibility = WGPUShaderStage_Compute;
- layer_bgl_entries[1].texture.sampleType = WGPUTextureSampleType_Uint;
- layer_bgl_entries[1].texture.viewDimension = WGPUTextureViewDimension_2D;
-
- layer_bgl_entries[2].binding = 2;
- layer_bgl_entries[2].visibility = WGPUShaderStage_Compute;
- layer_bgl_entries[2].storageTexture.access =
- WGPUStorageTextureAccess_WriteOnly;
- layer_bgl_entries[2].storageTexture.format = WGPUTextureFormat_RGBA32Uint;
- layer_bgl_entries[2].storageTexture.viewDimension =
- WGPUTextureViewDimension_2D;
-
- layer_bgl_entries[3].binding = 3;
- layer_bgl_entries[3].visibility = WGPUShaderStage_Compute;
- layer_bgl_entries[3].buffer.type = WGPUBufferBindingType_ReadOnlyStorage;
-
- layer_bgl_entries[4].binding = 4;
- layer_bgl_entries[4].visibility = WGPUShaderStage_Compute;
- layer_bgl_entries[4].buffer.type = WGPUBufferBindingType_Uniform;
- layer_bgl_entries[4].buffer.minBindingSize = sizeof(CNNv2LayerParams);
-
- layer_bgl_entries[5].binding = 5;
- layer_bgl_entries[5].visibility = WGPUShaderStage_Compute;
- layer_bgl_entries[5].texture.sampleType = WGPUTextureSampleType_Float;
- layer_bgl_entries[5].texture.viewDimension = WGPUTextureViewDimension_2D;
-
- WGPUBindGroupLayoutDescriptor layer_bgl_desc = {};
- layer_bgl_desc.entryCount = 6;
- layer_bgl_desc.entries = layer_bgl_entries;
-
- WGPUBindGroupLayout layer_bgl =
- wgpuDeviceCreateBindGroupLayout(device, &layer_bgl_desc);
-
- WGPUPipelineLayoutDescriptor layer_pl_desc = {};
- layer_pl_desc.bindGroupLayoutCount = 1;
- layer_pl_desc.bindGroupLayouts = &layer_bgl;
-
- WGPUPipelineLayout layer_pl =
- wgpuDeviceCreatePipelineLayout(device, &layer_pl_desc);
-
- WGPUComputePipelineDescriptor layer_pipeline_desc = {};
- layer_pipeline_desc.compute.module = layer_module;
- layer_pipeline_desc.compute.entryPoint = str_view("main");
- layer_pipeline_desc.layout = layer_pl;
-
- WGPUComputePipeline layer_pipeline =
- wgpuDeviceCreateComputePipeline(device, &layer_pipeline_desc);
-
- wgpuShaderModuleRelease(layer_module);
- wgpuPipelineLayoutRelease(layer_pl);
-
- // Create layer params buffers
- std::vector<WGPUBuffer> layer_params_buffers;
- for (size_t i = 0; i < layer_info.size(); ++i) {
- WGPUBufferDescriptor params_desc = {};
- params_desc.size = sizeof(CNNv2LayerParams);
- params_desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst;
- params_desc.mappedAtCreation = false;
-
- WGPUBuffer buf = wgpuDeviceCreateBuffer(device, &params_desc);
- layer_params_buffers.push_back(buf);
- }
-
- // Execute compute passes
- WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(device, nullptr);
-
- // Pass 1: Static features
- printf("Computing static features...\n");
- WGPUComputePassEncoder static_pass =
- wgpuCommandEncoderBeginComputePass(encoder, nullptr);
- wgpuComputePassEncoderSetPipeline(static_pass, static_pipeline);
- wgpuComputePassEncoderSetBindGroup(static_pass, 0, static_bg, 0, nullptr);
-
- uint32_t workgroups_x = (width + 7) / 8;
- uint32_t workgroups_y = (height + 7) / 8;
- wgpuComputePassEncoderDispatchWorkgroups(static_pass, workgroups_x,
- workgroups_y, 1);
-
- wgpuComputePassEncoderEnd(static_pass);
- wgpuComputePassEncoderRelease(static_pass);
-
- // Save static features if requested
- if (args.save_intermediates) {
- // Submit and wait for static features to complete
- WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, nullptr);
- wgpuQueueSubmit(queue, 1, &cmd);
- wgpuCommandBufferRelease(cmd);
- wgpuDevicePoll(device, true, nullptr);
-
- // Create new encoder for layers
- encoder = wgpuDeviceCreateCommandEncoder(device, nullptr);
-
- char layer_path[512];
- snprintf(layer_path, sizeof(layer_path), "%s/static_features.png",
- args.save_intermediates);
- printf("Saving static features to '%s'...\n", layer_path);
-
- // Read back RGBA32Uint and create 8-channel grayscale composite
- // Static features has 8 channels (packed as 4×u32), create 8x wide
- // composite
- std::vector<uint8_t> bgra = readback_rgba32uint_to_bgra8(
- device, queue, static_features_tex, width, height);
-
- if (!bgra.empty()) {
- // Static features: 8 f16 values packed in 4×u32
- // For now, just show first 4 channels (like layers)
- // TODO: Show all 8 channels in 8x wide composite
- std::vector<uint8_t> composite = readback_rgba32uint_to_composite(
- device, queue, static_features_tex, width, height);
- if (!composite.empty()) {
- int composite_width = width * 4;
- if (!stbi_write_png(layer_path, composite_width, height, 1,
- composite.data(), composite_width)) {
- fprintf(stderr, "Error: failed to write static features PNG\n");
- }
- }
- }
+ fseek(f, 0, SEEK_END);
+ long sz = ftell(f);
+ rewind(f);
+ if (sz <= 0 || sz % 4 != 0) {
+ fprintf(stderr, "Error: bad weights file size %ld\n", sz);
+ fclose(f);
+ return false;
}
-
- // Pass 2-N: CNN layers
- for (size_t i = 0; i < layer_info.size(); ++i) {
- const CNNv2LayerInfo& info = layer_info[i];
-
- printf("Processing layer %zu/%zu (%ux%u, %u→%u channels)...\n", i + 1,
- layer_info.size(), info.kernel_size, info.kernel_size,
- info.in_channels, info.out_channels);
-
- // Update layer params
- CNNv2LayerParams params;
- params.kernel_size = info.kernel_size;
- params.in_channels = info.in_channels;
- params.out_channels = info.out_channels;
- params.weight_offset = info.weight_offset;
- params.is_output_layer = (i == layer_info.size() - 1) ? 1 : 0;
- params.blend_amount = args.blend;
- params.is_layer_0 = (i == 0) ? 1 : 0;
-
- wgpuQueueWriteBuffer(queue, layer_params_buffers[i], 0, &params,
- sizeof(params));
-
- // Create bind group for this layer
- WGPUBindGroupEntry layer_bg_entries[6] = {};
- layer_bg_entries[0].binding = 0;
- layer_bg_entries[0].textureView = static_features_view;
-
- layer_bg_entries[1].binding = 1;
- layer_bg_entries[1].textureView =
- (i == 0) ? static_features_view : layer_views[i % 2];
-
- layer_bg_entries[2].binding = 2;
- layer_bg_entries[2].textureView = layer_views[(i + 1) % 2];
-
- layer_bg_entries[3].binding = 3;
- layer_bg_entries[3].buffer = weights_buffer;
- layer_bg_entries[3].size = weights_only_size;
-
- layer_bg_entries[4].binding = 4;
- layer_bg_entries[4].buffer = layer_params_buffers[i];
- layer_bg_entries[4].size = sizeof(CNNv2LayerParams);
-
- layer_bg_entries[5].binding = 5;
- layer_bg_entries[5].textureView = input_view;
-
- WGPUBindGroupDescriptor layer_bg_desc = {};
- layer_bg_desc.layout = layer_bgl;
- layer_bg_desc.entryCount = 6;
- layer_bg_desc.entries = layer_bg_entries;
-
- WGPUBindGroup layer_bg = wgpuDeviceCreateBindGroup(device, &layer_bg_desc);
-
- WGPUComputePassEncoder layer_pass =
- wgpuCommandEncoderBeginComputePass(encoder, nullptr);
- wgpuComputePassEncoderSetPipeline(layer_pass, layer_pipeline);
- wgpuComputePassEncoderSetBindGroup(layer_pass, 0, layer_bg, 0, nullptr);
-
- wgpuComputePassEncoderDispatchWorkgroups(layer_pass, workgroups_x,
- workgroups_y, 1);
-
- wgpuComputePassEncoderEnd(layer_pass);
- wgpuComputePassEncoderRelease(layer_pass);
- wgpuBindGroupRelease(layer_bg);
-
- // Save intermediate layer if requested
- if (args.save_intermediates) {
- // Submit and wait for layer to complete
- WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, nullptr);
- wgpuQueueSubmit(queue, 1, &cmd);
- wgpuCommandBufferRelease(cmd);
- wgpuDevicePoll(device, true, nullptr);
-
- // Create new encoder for next layer
- encoder = wgpuDeviceCreateCommandEncoder(device, nullptr);
-
- char layer_path[512];
- snprintf(layer_path, sizeof(layer_path), "%s/layer_%zu.png",
- args.save_intermediates, i);
- printf("Saving intermediate layer %zu to '%s'...\n", i, layer_path);
-
- // Read back RGBA32Uint and create 4-channel grayscale composite
- WGPUTexture output_tex = layer_textures[(i + 1) % 2];
- std::vector<uint8_t> composite = readback_rgba32uint_to_composite(
- device, queue, output_tex, width, height);
-
- if (!composite.empty()) {
- int composite_width = width * 4;
- if (!stbi_write_png(layer_path, composite_width, height, 1,
- composite.data(), composite_width)) {
- fprintf(stderr, "Error: failed to write layer PNG\n");
- }
- }
- }
+ out.resize((size_t)sz / 4);
+ if ((long)fread(out.data(), 4, out.size(), f) != sz / 4) {
+ fprintf(stderr, "Error: read failed for '%s'\n", path);
+ fclose(f);
+ return false;
}
+ fclose(f);
+ return true;
+}
- WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr);
- wgpuQueueSubmit(queue, 1, &commands);
- wgpuCommandBufferRelease(commands);
- wgpuCommandEncoderRelease(encoder);
-
- wgpuDevicePoll(device, true, nullptr);
-
- // Create layer composite if intermediates were saved
- if (args.save_intermediates) {
- save_layer_composite(args.save_intermediates, width, height,
- layer_info.size());
- }
+// ---------------------------------------------------------------------------
+// Args
+// ---------------------------------------------------------------------------
- // Readback final result (from last layer's output texture)
- printf("Reading pixels from GPU...\n");
- size_t final_layer_idx = (layer_info.size()) % 2;
- std::vector<uint8_t> pixels = readback_rgba32uint_to_bgra8(
- device, queue, layer_textures[final_layer_idx], width, height);
+// Command-line options, filled in by parse_args().
+struct Args {
+  const char* input_path{nullptr};    // first positional argument
+  const char* output_path{nullptr};   // second positional argument
+  const char* sample_dir{nullptr};    // value of --sample-dir, if given
+  const char* weights_path{nullptr};  // value of --weights, if given
+  bool debug_hex{false};              // set by --debug-hex
+};
- if (pixels.empty()) {
- fprintf(stderr, "Error: GPU readback failed\n");
- for (auto buf : layer_params_buffers)
- wgpuBufferRelease(buf);
- wgpuComputePipelineRelease(layer_pipeline);
- wgpuBindGroupLayoutRelease(layer_bgl);
- wgpuBindGroupRelease(static_bg);
- wgpuComputePipelineRelease(static_pipeline);
- wgpuBufferRelease(static_params_buffer);
- wgpuTextureViewRelease(static_features_view);
- wgpuTextureRelease(static_features_tex);
- wgpuTextureViewRelease(depth_view);
- wgpuTextureRelease(depth_texture);
- wgpuTextureViewRelease(layer_views[0]);
- wgpuTextureViewRelease(layer_views[1]);
- wgpuTextureRelease(layer_textures[0]);
- wgpuTextureRelease(layer_textures[1]);
- wgpuBufferRelease(weights_buffer);
- wgpuTextureViewRelease(input_view);
- return false;
- }
+// Write the command-line help text to stderr.
+//   prog: program name substituted into the "Usage:" line.
+static void print_usage(const char* prog) {
+  // One call with adjacent string literals; the only conversion is the %s on
+  // the first line, so the bytes written match a line-by-line printout.
+  fprintf(stderr,
+          "Usage: %s input.png output.png [OPTIONS]\n"
+          "\nOPTIONS:\n"
+          " --sample-dir DIR Full sample dir with albedo/normal/depth/matid/shadow/transp\n"
+          " --weights FILE Load weights from cnn_v3_weights.bin\n"
+          " --debug-hex Print first 8 output pixels as hex\n"
+          " --help Show this help\n"
+          "\nSimple mode (single PNG): geometry channels zeroed, normal=(0.5,0.5).\n"
+          "FiLM is always identity (gamma=1, beta=0).\n"
+          "\nNote: feature packing uses [0,1] oct-normals (training format) to match\n"
+          " infer_cnn_v3.py for direct Python/WGSL comparison.\n",
+          prog);
+}
- // Debug hex dump
- if (args.debug_hex) {
- printf("First 8 pixels (BGRA hex):\n");
- for (int i = 0; i < 8 && i < width * height; ++i) {
- const uint8_t b = pixels[i * 4 + 0];
- const uint8_t g = pixels[i * 4 + 1];
- const uint8_t r = pixels[i * 4 + 2];
- const uint8_t a = pixels[i * 4 + 3];
- printf(" [%d] 0x%02X%02X%02X%02X (RGBA)\n", i, r, g, b, a);
+// Parse the command line into `args`.
+//   argv[1], argv[2]: input and output paths (both required).
+//   argv[3..]:        --sample-dir DIR, --weights FILE, --debug-hex, --help.
+// Returns true when parsing succeeded. Returns false when fewer than two
+// positional arguments are given, when --help is seen, when an option that
+// takes a value is the last argument, or when an option is unknown; the last
+// two cases also print an error to stderr.
+static bool parse_args(int argc, char** argv, Args* args) {
+  if (argc < 3) return false;
+  args->input_path = argv[1];
+  args->output_path = argv[2];
+  for (int i = 3; i < argc; ++i) {
+    const char* opt = argv[i];
+    const bool is_sample_dir = strcmp(opt, "--sample-dir") == 0;
+    const bool is_weights = strcmp(opt, "--weights") == 0;
+    if (is_sample_dir || is_weights) {
+      // Report a missing value explicitly instead of letting the option fall
+      // through to the "unknown option" message.
+      if (i + 1 >= argc) {
+        fprintf(stderr, "Error: option '%s' requires a value\n", opt);
+        return false;
+      }
+      const char* value = argv[++i];
+      if (is_sample_dir) {
+        args->sample_dir = value;
+      } else {
+        args->weights_path = value;
+      }
+    } else if (strcmp(opt, "--debug-hex") == 0) {
+      args->debug_hex = true;
+    } else if (strcmp(opt, "--help") == 0) {
+      return false;
+    } else {
+      fprintf(stderr, "Error: unknown option '%s'\n", opt);
+      return false;
 }
 }
+  return true;
+}
- // Save output
- bool success;
- if (args.output_png) {
- printf("Saving PNG to '%s'...\n", args.output_path);
- success = save_png(args.output_path, pixels, width, height);
- } else {
- printf("Saving PPM to '%s'...\n", args.output_path);
- success = save_ppm(args.output_path, pixels, width, height);
- }
-
- if (success) {
- printf("Done! Output saved to '%s'\n", args.output_path);
- }
+// ---------------------------------------------------------------------------
+// Main
+// ---------------------------------------------------------------------------
- // Cleanup
- for (auto buf : layer_params_buffers)
- wgpuBufferRelease(buf);
- wgpuComputePipelineRelease(layer_pipeline);
- wgpuBindGroupLayoutRelease(layer_bgl);
- wgpuBindGroupRelease(static_bg);
- wgpuComputePipelineRelease(static_pipeline);
- wgpuBufferRelease(static_params_buffer);
- wgpuTextureViewRelease(static_features_view);
- wgpuTextureRelease(static_features_tex);
- wgpuTextureViewRelease(layer_views[0]);
- wgpuTextureViewRelease(layer_views[1]);
- wgpuTextureRelease(layer_textures[0]);
- wgpuTextureRelease(layer_textures[1]);
- wgpuBufferRelease(weights_buffer);
- wgpuTextureViewRelease(input_view);
-
- return success;
-}
+extern void InitShaderComposer();
int main(int argc, char** argv) {
- // Parse arguments
Args args;
if (!parse_args(argc, argv, &args)) {
print_usage(argv[0]);
return 1;
}
- // Initialize shader composer (required for #include resolution)
- InitShaderComposer();
-
- // Initialize WebGPU
+ // Init GPU
WebGPUTestFixture fixture;
if (!fixture.init()) {
- fprintf(stderr, "Error: GPU unavailable\n");
+ fprintf(stderr, "Error: WebGPU device unavailable\n");
return 1;
}
+ InitShaderComposer();
GpuContext ctx = fixture.ctx();
- WGPUDevice device = ctx.device;
- WGPUQueue queue = ctx.queue;
- WGPUInstance instance = fixture.instance();
- // Load input texture
- int width, height;
- WGPUTexture input_texture =
- load_texture(device, queue, args.input_path, &width, &height);
- if (!input_texture) {
- SamplerCache::Get().clear();
- fixture.shutdown();
- return 1;
- }
-
- printf("Loaded %dx%d image from '%s'\n", width, height, args.input_path);
+ // --- Load input image ---
+ int W, H;
+ std::vector<float> albedo = load_png_rgb(args.input_path, &W, &H);
+ if (albedo.empty()) return 1; // an empty result is treated as a load failure
- // Branch based on CNN version
- if (args.cnn_version == 2) {
- bool success = process_cnn_v2(device, queue, instance, input_texture, width,
- height, args);
- wgpuTextureRelease(input_texture);
- SamplerCache::Get().clear();
- fixture.shutdown();
- return success ? 0 : 1;
+ // Pad to multiples of 4 (U-Net requires 2 pooling levels)
+ const int W4 = (W + 3) & ~3; // round up to the next multiple of 4
+ const int H4 = (H + 3) & ~3; // round up to the next multiple of 4
+ if (W4 != W || H4 != H) {
+ printf("Padding %dx%d → %dx%d\n", W, H, W4, H4);
+ std::vector<float> padded(W4 * H4 * 3, 0.0f); // texels outside the source image stay 0
+ for (int y = 0; y < H; ++y)
+ for (int x = 0; x < W; ++x)
+ for (int c = 0; c < 3; ++c)
+ padded[(y * W4 + x) * 3 + c] = albedo[(y * W + x) * 3 + c];
+ albedo = std::move(padded);
+ W = W4; H = H4; // from here on W/H hold the padded size; the pre-padding size is not kept
}
- // CNN v1 processing below
- printf("Using CNN v1 (render pipeline architecture)\n");
+ printf("Input: %s (%dx%d)\n", args.input_path, W, H);
- // Create input texture view
- WGPUTextureView input_view =
- gpu_create_texture_view_2d(input_texture, WGPUTextureFormat_BGRA8Unorm);
- WGPUTextureView original_view = input_view; // Keep reference to original
+ // --- Build FeatureImages ---
+ FeatureImages img;
+ img.w = W; img.h = H;
+ img.albedo = albedo;
- // Create CNN pipelines (different formats for intermediate vs final)
- WGPURenderPipeline pipeline_intermediate =
- create_cnn_pipeline(device, WGPUTextureFormat_RGBA16Float, false);
- WGPURenderPipeline pipeline_final =
- create_cnn_pipeline(device, WGPUTextureFormat_BGRA8Unorm, true);
-
- if (!pipeline_intermediate || !pipeline_final) {
- fprintf(stderr, "Error: failed to create CNN pipelines\n");
- if (pipeline_intermediate)
- wgpuRenderPipelineRelease(pipeline_intermediate);
- if (pipeline_final)
- wgpuRenderPipelineRelease(pipeline_final);
- wgpuTextureViewRelease(input_view);
- wgpuTextureRelease(input_texture);
- SamplerCache::Get().clear();
- fixture.shutdown();
- return 1;
- }
-
- // Get bind group layout from intermediate pipeline (same for both)
- WGPUBindGroupLayout bgl =
- wgpuRenderPipelineGetBindGroupLayout(pipeline_intermediate, 0);
-
- // Create uniform buffers
- const WGPUBufferDescriptor common_uniform_desc = {
- .usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst,
- .size = sizeof(UniformsSequenceParams),
- };
- WGPUBuffer common_uniform_buffer =
- wgpuDeviceCreateBuffer(device, &common_uniform_desc);
-
- const WGPUBufferDescriptor layer_params_desc = {
- .usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst,
- .size = sizeof(CNNv1LayerParams),
- };
- WGPUBuffer layer_params_buffer =
- wgpuDeviceCreateBuffer(device, &layer_params_desc);
-
- // Create intermediate textures for ping-pong (2 textures)
- // Use RGBA16Float to preserve [-1,1] range from tanh activation
- const WGPUTextureDescriptor intermediate_desc = {
- .usage = WGPUTextureUsage_TextureBinding |
- WGPUTextureUsage_RenderAttachment | WGPUTextureUsage_CopySrc,
- .dimension = WGPUTextureDimension_2D,
- .size = {(uint32_t)(width), (uint32_t)(height), 1},
- .format = WGPUTextureFormat_RGBA16Float,
- .mipLevelCount = 1,
- .sampleCount = 1,
- };
-
- WGPUTexture intermediate_textures[2] = {
- wgpuDeviceCreateTexture(device, &intermediate_desc),
- wgpuDeviceCreateTexture(device, &intermediate_desc),
- };
-
- // Create views for intermediate textures (RGBA16Float)
- WGPUTextureView intermediate_views[2] = {
- gpu_create_texture_view_2d(intermediate_textures[0],
- WGPUTextureFormat_RGBA16Float),
- gpu_create_texture_view_2d(intermediate_textures[1],
- WGPUTextureFormat_RGBA16Float),
- };
-
- // Get sampler
- WGPUSampler sampler =
- SamplerCache::Get().get_or_create(device, SamplerCache::clamp());
-
- // Multi-layer processing
- const int NUM_LAYERS = args.num_layers;
- int dst_idx = 0; // Index of texture to render to
-
- // First layer reads from input, subsequent layers read from previous output
- WGPUTextureView current_input = input_view;
-
- for (int layer = 0; layer < NUM_LAYERS; ++layer) {
- printf("Processing layer %d/%d...\n", layer + 1, NUM_LAYERS);
-
- // Update uniforms
- UniformsSequenceParams common_u = {
- .resolution = {(float)(width), (float)(height)},
- .aspect_ratio = (float)(width) / (float)(height),
- .time = 0.0f,
- .beat_time = 0.0f,
- .beat_phase = 0.0f,
- .audio_intensity = 0.0f,
- .noise = 0.0f,
- };
- wgpuQueueWriteBuffer(queue, common_uniform_buffer, 0, &common_u,
- sizeof(common_u));
-
- CNNv1LayerParams layer_params = {
- .layer_index = layer,
- .blend_amount =
- (layer == NUM_LAYERS - 1) ? args.blend : 1.0f, // Only final layer
- ._pad = {0.0f, 0.0f},
+ if (args.sample_dir) {
+ printf("Mode: full (%s)\n", args.sample_dir);
+ auto path = [&](const char* name) -> std::string {
+ return std::string(args.sample_dir) + "/" + name;
};
- wgpuQueueWriteBuffer(queue, layer_params_buffer, 0, &layer_params,
- sizeof(layer_params));
-
- // Build bind group
- WGPUBindGroup bind_group =
- BindGroupBuilder()
- .sampler(0, sampler)
- .texture(1, current_input)
- .buffer(2, common_uniform_buffer, sizeof(UniformsSequenceParams))
- .buffer(3, layer_params_buffer, sizeof(CNNv1LayerParams))
- .texture(4, original_view)
- .build(device, bgl);
-
- // Render to appropriate output texture with correct pipeline
- bool is_final = (layer == NUM_LAYERS - 1);
-
- if (is_final) {
- // Final layer: use OffscreenRenderTarget (known working readback)
- OffscreenRenderTarget rt(instance, device, width, height);
- WGPUCommandEncoder encoder =
- wgpuDeviceCreateCommandEncoder(device, nullptr);
- WGPURenderPassEncoder pass = begin_render_pass(encoder, rt.view());
- wgpuRenderPassEncoderSetPipeline(pass, pipeline_final);
- wgpuRenderPassEncoderSetBindGroup(pass, 0, bind_group, 0, nullptr);
- wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0);
- wgpuRenderPassEncoderEnd(pass);
- WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr);
- wgpuQueueSubmit(queue, 1, &commands);
- wgpuDevicePoll(device, true, nullptr);
-
- wgpuCommandBufferRelease(commands);
- wgpuRenderPassEncoderRelease(pass);
- wgpuCommandEncoderRelease(encoder);
- wgpuBindGroupRelease(bind_group);
+ img.normal = load_png_rg(path("normal.png").c_str(), W, H); // NOTE(review): W/H are the padded dimensions here — confirm the loaders cope with files stored at the unpadded size
+ img.depth = load_png_depth16(path("depth.png").c_str(), W, H);
+ img.matid = load_png_gray(path("matid.png").c_str(), W, H, 0.0f);
+ img.shadow = load_png_gray(path("shadow.png").c_str(), W, H, 1.0f);
+ img.transp = load_png_gray(path("transp.png").c_str(), W, H, 0.0f);
+ } else {
+ printf("Mode: simple (geometry zeroed, normal=(0.5,0.5))\n");
+ img.normal.assign(W * H * 2, 0.5f); // two values per pixel, both 0.5
+ img.depth.assign(W * H, 0.0f);
+ img.matid.assign(W * H, 0.0f);
+ img.shadow.assign(W * H, 1.0f);
+ img.transp.assign(W * H, 0.0f);
+ }
- // Read pixels immediately
- printf("Reading pixels from GPU...\n");
- std::vector<uint8_t> pixels = rt.read_pixels();
+ // --- Pack features ---
+ std::vector<uint32_t> feat0, feat1;
+ pack_features(img, feat0, feat1);
- // Debug: print first 8 pixels as hex
- if (args.debug_hex && !pixels.empty()) {
- printf("First 8 pixels (BGRA hex):\n");
- for (int i = 0; i < 8 && i < width * height; ++i) {
- const uint8_t b = pixels[i * 4 + 0];
- const uint8_t g = pixels[i * 4 + 1];
- const uint8_t r = pixels[i * 4 + 2];
- const uint8_t a = pixels[i * 4 + 3];
- printf(" [%d] 0x%02X%02X%02X%02X (RGBA)\n", i, r, g, b, a);
- }
- }
+ // --- Create GPU textures ---
+ WGPUTexture feat0_tex = make_feat_tex(ctx.device, W, H);
+ WGPUTexture feat1_tex = make_feat_tex(ctx.device, W, H);
+ WGPUTexture out_tex = make_output_tex(ctx.device, W, H);
- if (pixels.empty()) {
- fprintf(stderr, "Error: GPU readback failed\n");
- wgpuTextureViewRelease(intermediate_views[0]);
- wgpuTextureViewRelease(intermediate_views[1]);
- wgpuTextureRelease(intermediate_textures[0]);
- wgpuTextureRelease(intermediate_textures[1]);
- wgpuTextureViewRelease(input_view);
- wgpuTextureRelease(input_texture);
- wgpuBufferRelease(layer_params_buffer);
- wgpuBufferRelease(common_uniform_buffer);
- wgpuBindGroupLayoutRelease(bgl);
- wgpuRenderPipelineRelease(pipeline_final);
- wgpuRenderPipelineRelease(pipeline_intermediate);
- SamplerCache::Get().clear();
- fixture.shutdown();
- return 1;
- }
+ WGPUTextureView feat0_view = make_view(feat0_tex, WGPUTextureFormat_RGBA32Uint);
+ WGPUTextureView feat1_view = make_view(feat1_tex, WGPUTextureFormat_RGBA32Uint);
+ WGPUTextureView out_view = make_view(out_tex, WGPUTextureFormat_RGBA16Float);
- // Save output
- bool success;
- if (args.output_png) {
- printf("Saving PNG to '%s'...\n", args.output_path);
- success = save_png(args.output_path, pixels, width, height);
- } else {
- printf("Saving PPM to '%s'...\n", args.output_path);
- success = save_ppm(args.output_path, pixels, width, height);
- }
+ upload_tex(ctx.queue, feat0_tex, feat0.data(), W, H);
+ upload_tex(ctx.queue, feat1_tex, feat1.data(), W, H);
- if (!success) {
- wgpuTextureViewRelease(intermediate_views[0]);
- wgpuTextureViewRelease(intermediate_views[1]);
- wgpuTextureRelease(intermediate_textures[0]);
- wgpuTextureRelease(intermediate_textures[1]);
- wgpuTextureViewRelease(input_view);
- wgpuTextureRelease(input_texture);
- wgpuBufferRelease(layer_params_buffer);
- wgpuBufferRelease(common_uniform_buffer);
- wgpuBindGroupLayoutRelease(bgl);
- wgpuRenderPipelineRelease(pipeline_final);
- wgpuRenderPipelineRelease(pipeline_intermediate);
- SamplerCache::Get().clear();
- fixture.shutdown();
- return 1;
- }
+ // --- Wire CNNv3Effect ---
+ NodeRegistry registry(ctx.device, W, H);
+ registry.set_external_view("feat0", feat0_view);
+ registry.set_external_view("feat1", feat1_view);
+ registry.set_external_view("cnn_out", out_view);
- printf("Done! Output saved to '%s'\n", args.output_path);
- break; // Exit loop after final layer
- } else {
- // Intermediate layers: render to ping-pong textures
- WGPUTextureView output_view = intermediate_views[dst_idx];
- WGPUCommandEncoder encoder =
- wgpuDeviceCreateCommandEncoder(device, nullptr);
- WGPURenderPassEncoder pass = begin_render_pass(encoder, output_view);
- wgpuRenderPassEncoderSetPipeline(pass, pipeline_intermediate);
- wgpuRenderPassEncoderSetBindGroup(pass, 0, bind_group, 0, nullptr);
- wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0);
- wgpuRenderPassEncoderEnd(pass);
- WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr);
- wgpuQueueSubmit(queue, 1, &commands);
- wgpuDevicePoll(device, true, nullptr);
+ CNNv3Effect effect(ctx, {"feat0", "feat1"}, {"cnn_out"}, 0.0f, 1000.0f); // node names match the external views registered above
+ effect.declare_nodes(registry);
- wgpuCommandBufferRelease(commands);
- wgpuRenderPassEncoderRelease(pass);
- wgpuCommandEncoderRelease(encoder);
- wgpuBindGroupRelease(bind_group);
+ // --- Load weights ---
+ if (args.weights_path) {
+ std::vector<uint32_t> wdata;
+ if (!load_weights_bin(args.weights_path, wdata)) return 1; // NOTE(review): returns without releasing the textures/views created above
+ effect.upload_weights(ctx.queue, wdata.data(),
+ (uint32_t)(wdata.size() * 4)); // size in bytes: 4 per uint32_t word
+ printf("Weights: %s (%zu bytes)\n", args.weights_path, wdata.size() * 4);
+ } else {
+ printf("Weights: default (from assets, zero if absent)\n");
+ }
- // Save intermediate layer if requested
- if (args.save_intermediates) {
- char layer_path[512];
- snprintf(layer_path, sizeof(layer_path), "%s/layer_%d.png",
- args.save_intermediates, layer);
- printf("Saving intermediate layer %d to '%s'...\n", layer, layer_path);
+ // --- Run 5 compute passes ---
+ WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr);
+ UniformsSequenceParams params = {}; // only resolution and aspect_ratio are set below; every other field keeps its value-initialised state
+ params.resolution = {(float)W, (float)H};
+ params.aspect_ratio = (float)W / (float)H;
+ effect.render(enc, params, registry);
- // Readback RGBA16Float texture
- std::vector<uint8_t> pixels = texture_readback_fp16_to_u8(
- device, queue, intermediate_textures[dst_idx], width, height);
+ WGPUCommandBuffer cmds = wgpuCommandEncoderFinish(enc, nullptr);
+ wgpuQueueSubmit(ctx.queue, 1, &cmds);
+ wgpuCommandBufferRelease(cmds);
+ wgpuCommandEncoderRelease(enc);
+ wgpuDevicePoll(ctx.device, true, nullptr); // presumably blocks until the submitted work has finished (wait=true) — confirm against wgpu-native
- // Debug: print first 8 pixels as hex
- if (args.debug_hex && !pixels.empty()) {
- printf("Layer %d first 8 pixels (BGRA hex):\n", layer);
- for (int i = 0; i < 8 && i < width * height; ++i) {
- const uint8_t b = pixels[i * 4 + 0];
- const uint8_t g = pixels[i * 4 + 1];
- const uint8_t r = pixels[i * 4 + 2];
- const uint8_t a = pixels[i * 4 + 3];
- printf(" [%d] 0x%02X%02X%02X%02X (RGBA)\n", i, r, g, b, a);
- }
- }
+ // --- Readback ---
+ std::vector<float> pixels = readback_rgba16f(ctx.device, ctx.queue, out_tex, W, H); // NOTE(review): the result is used below without an emptiness/size check
- if (!pixels.empty()) {
- save_png(layer_path, pixels, width, height);
- } else {
- fprintf(stderr, "Warning: failed to read intermediate layer %d\n",
- layer);
- }
- }
- }
+ // --- Save output (at the padded size: W/H were overwritten during padding, so nothing is cropped) ---
+ if (!save_png(args.output_path, pixels, W, H)) return 1; // NOTE(review): returns without reaching the cleanup section below
+ printf("Saved: %s\n", args.output_path);
- // Update for next layer: output becomes input
- if (layer < NUM_LAYERS - 1) {
- // Use this layer's output as next layer's input
- current_input = intermediate_views[dst_idx];
- dst_idx = 1 - dst_idx; // Flip ping-pong for next render
+ if (args.debug_hex) {
+ printf("First 8 output pixels (RGBA f32 → hex):\n");
+ for (int i = 0; i < 8 && i < W * H; ++i) { // bounded by W*H, not by pixels.size()
+ float r = pixels[i*4 ], g = pixels[i*4+1];
+ float b = pixels[i*4+2], a = pixels[i*4+3];
+ int ri = (int)(r*255+.5f), gi = (int)(g*255+.5f); // scale to 0..255 for the hex column
+ int bi = (int)(b*255+.5f), ai = (int)(a*255+.5f);
+ ri = ri<0?0:ri>255?255:ri; gi = gi<0?0:gi>255?255:gi; // clamp to [0, 255]
+ bi = bi<0?0:bi>255?255:bi; ai = ai<0?0:ai>255?255:ai;
+ printf(" [%d] 0x%02X%02X%02X%02X (%.4f %.4f %.4f %.4f)\n",
+ i, ri, gi, bi, ai, r, g, b, a); // hex shows the clamped bytes, the floats are printed unclamped
}
}
- // Wait for all GPU work to complete before cleanup
- wgpuDevicePoll(device, true, nullptr);
-
// Cleanup
- wgpuTextureViewRelease(intermediate_views[0]);
- wgpuTextureViewRelease(intermediate_views[1]);
- wgpuTextureRelease(intermediate_textures[0]);
- wgpuTextureRelease(intermediate_textures[1]);
- wgpuBufferRelease(layer_params_buffer);
- wgpuBufferRelease(common_uniform_buffer);
- wgpuBindGroupLayoutRelease(bgl);
- wgpuRenderPipelineRelease(pipeline_intermediate);
- wgpuRenderPipelineRelease(pipeline_final);
- wgpuTextureViewRelease(input_view);
- wgpuTextureRelease(input_texture);
- SamplerCache::Get().clear();
- fixture.shutdown();
+ wgpuTextureViewRelease(feat0_view); // views are released before the textures they were created from
+ wgpuTextureViewRelease(feat1_view);
+ wgpuTextureViewRelease(out_view);
+ wgpuTextureRelease(feat0_tex);
+ wgpuTextureRelease(feat1_tex);
+ wgpuTextureRelease(out_tex);
return 0;
}