// CNN v3 shader testing tool — offline WGSL inference for Python parity checks. // Loads an input PNG (or sample directory), packs 20-channel features, runs the // CNNv3Effect (5 compute passes), and saves the RGBA16Float output as PNG. #if defined(STRIP_ALL) #error "cnn_test requires STRIP_ALL=OFF (tool builds only)" #endif #include "cnn_v3_effect.h" #include "generated/assets.h" #include "gpu/gpu.h" #include "gpu/sequence.h" #include "gpu/shader_composer.h" #include "tests/common/webgpu_test_fixture.h" #include "util/asset_manager.h" #include "util/mini_math.h" #include "stb_image.h" #include "stb_image_write.h" #include #include #include #include #include #include // --------------------------------------------------------------------------- // F16 / pack helpers (match WGSL pack2x16float / pack4x8unorm) // --------------------------------------------------------------------------- static uint16_t f32_to_f16(float f) { uint32_t b; memcpy(&b, &f, 4); uint32_t sign = (b >> 16) & 0x8000u; int32_t exp = (int32_t)((b >> 23) & 0xFFu) - 127 + 15; uint32_t mant = b & 0x7FFFFFu; if (exp <= 0) return (uint16_t)sign; if (exp >= 31) return (uint16_t)(sign | 0x7C00u); return (uint16_t)(sign | ((uint32_t)exp << 10) | (mant >> 13)); } // Low 16 bits = a, high 16 bits = b (matches WGSL pack2x16float(vec2f(a,b))) static uint32_t pack2x16f(float a, float b) { return (uint32_t)f32_to_f16(a) | ((uint32_t)f32_to_f16(b) << 16); } // RGBA as u8 packed into u32 (matches WGSL pack4x8unorm) static uint32_t pack4x8u(float a, float b, float c, float d) { auto u8 = [](float v) -> uint32_t { int i = (int)(v * 255.0f + 0.5f); if (i < 0) i = 0; if (i > 255) i = 255; return (uint32_t)i; }; return u8(a) | (u8(b) << 8) | (u8(c) << 16) | (u8(d) << 24); } // --------------------------------------------------------------------------- // Oct-decode [0,1] → unit normal (matches Python cnn_v3_utils.oct_decode) // --------------------------------------------------------------------------- static void oct_decode_01(float nx01, float ny01, float* out_x, float* out_y, float* out_z) { float fx = nx01 * 2.0f - 1.0f; float fy = ny01 * 2.0f - 1.0f; float fz = 1.0f - fabsf(fx) - fabsf(fy); if (fz < 0.0f) { float sx = fx >= 0.0f ? 1.0f : -1.0f; float sy = fy >= 0.0f ? 1.0f : -1.0f; fx = (1.0f - fabsf(fy)) * sx; fy = (1.0f - fabsf(fx)) * sy; } float len = sqrtf(fx*fx + fy*fy + fz*fz); if (len < 1e-8f) len = 1e-8f; *out_x = fx / len; *out_y = fy / len; *out_z = fz / len; } // --------------------------------------------------------------------------- // Mip helpers — matching Python pyrdown + nearest-upsample // --------------------------------------------------------------------------- // Compute mip1 and mip2 for each pixel using the Python convention: // mip1_small[y2][x2] = avg(rgb[2y2..2y2+1][2x2..2x2+1]) (half-res) // mip2_small[y4][x4] = avg(mip1[2y4..2y4+1][2x4..2x4+1]) (quarter-res) // Nearest upsample: mip1[y][x] = mip1_small[y/2][x/2], etc. // Output: mip1_out and mip2_out are (H*W*3) float arrays in row-major order. static void compute_mips(const float* rgb, int w, int h, std::vector& mip1_out, std::vector& mip2_out) { const int w2 = w / 2, h2 = h / 2; const int w4 = w / 4, h4 = h / 4; std::vector m1(w2 * h2 * 3); for (int y2 = 0; y2 < h2; ++y2) { for (int x2 = 0; x2 < w2; ++x2) { for (int c = 0; c < 3; ++c) { int y0 = y2 * 2, x0 = x2 * 2; float v = rgb[(y0 * w + x0 ) * 3 + c] + rgb[(y0 * w + x0+1) * 3 + c] + rgb[((y0+1) * w + x0 ) * 3 + c] + rgb[((y0+1) * w + x0+1) * 3 + c]; m1[(y2 * w2 + x2) * 3 + c] = v * 0.25f; } } } std::vector m2(w4 * h4 * 3); for (int y4 = 0; y4 < h4; ++y4) { for (int x4 = 0; x4 < w4; ++x4) { for (int c = 0; c < 3; ++c) { int y0 = y4 * 2, x0 = x4 * 2; float v = m1[(y0 * w2 + x0 ) * 3 + c] + m1[(y0 * w2 + x0+1) * 3 + c] + m1[((y0+1) * w2 + x0 ) * 3 + c] + m1[((y0+1) * w2 + x0+1) * 3 + c]; m2[(y4 * w4 + x4) * 3 + c] = v * 0.25f; } } } // Nearest upsample to full-res mip1_out.resize(w * h * 3); mip2_out.resize(w * h * 3); for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { int i = (y * w + x) * 3; int i1 = ((y/2) * w2 + (x/2)) * 3; int i2 = ((y/4) * w4 + (x/4)) * 3; mip1_out[i ] = (y/2 < h2 && x/2 < w2) ? m1[i1 ] : 0.0f; mip1_out[i+1] = (y/2 < h2 && x/2 < w2) ? m1[i1+1] : 0.0f; mip1_out[i+2] = (y/2 < h2 && x/2 < w2) ? m1[i1+2] : 0.0f; mip2_out[i ] = (y/4 < h4 && x/4 < w4) ? m2[i2 ] : 0.0f; mip2_out[i+1] = (y/4 < h4 && x/4 < w4) ? m2[i2+1] : 0.0f; mip2_out[i+2] = (y/4 < h4 && x/4 < w4) ? m2[i2+2] : 0.0f; } } } // --------------------------------------------------------------------------- // Feature packing: RGB float arrays → feat_tex0 / feat_tex1 (rgba32uint) // // feat_tex0 (4 u32, f16 pairs — matches load_feat in cnn_v3_enc0.wgsl): // [0] albedo.r | albedo.g // [1] albedo.b | normal.x (oct, [0,1] — training format) // [2] normal.y | depth // [3] dzdx | dzdy // // feat_tex1 (4 u32, u8norm — channel order from cnn_v3_enc0.wgsl load_feat): // [0] mat_id | prev.r | prev.g | prev.b // [1] mip1.r | mip1.g | mip1.b | mip2.r // [2] mip2.g | mip2.b | dif | transp // [3] 0 // // Note: normal.xy stored in [0,1] (training format), NOT remapped to [-1,1] // like gbuf_pack.wgsl does at runtime. This matches infer_cnn_v3.py. // --------------------------------------------------------------------------- struct FeatureImages { int w, h; std::vector albedo; // w*h*3 [0,1] std::vector normal; // w*h*2 [0,1] oct-encoded std::vector depth; // w*h [0,1] std::vector matid; // w*h [0,1] std::vector shadow; // w*h [0,1] std::vector transp; // w*h [0,1] }; static void pack_features(const FeatureImages& img, std::vector& feat0, // w*h*4 u32 std::vector& feat1) // w*h*4 u32 { const int W = img.w, H = img.h; feat0.resize(W * H * 4); feat1.resize(W * H * 4); std::vector mip1, mip2; compute_mips(img.albedo.data(), W, H, mip1, mip2); static const float KEY_X = 0.408f, KEY_Y = 0.816f, KEY_Z = 0.408f; for (int y = 0; y < H; ++y) { for (int x = 0; x < W; ++x) { const int pi = y * W + x; const int i3 = pi * 3; const int i4 = pi * 4; float ar = img.albedo[i3 ]; float ag = img.albedo[i3+1]; float ab = img.albedo[i3+2]; float nx = img.normal[pi * 2 ]; // [0,1] float ny = img.normal[pi * 2 + 1]; // [0,1] float d = img.depth[pi]; // Central finite difference depth gradient int xm = (x > 0) ? x-1 : 0; int xp = (x < W-1) ? x+1 : W-1; int ym = (y > 0) ? y-1 : 0; int yp = (y < H-1) ? y+1 : H-1; float dzdx = (img.depth[y * W + xp] - img.depth[y * W + xm]) * 0.5f; float dzdy = (img.depth[yp * W + x ] - img.depth[ym * W + x ]) * 0.5f; float mat = img.matid[pi]; float shad = img.shadow[pi]; float trp = img.transp[pi]; // Diffuse = max(0, dot(oct_decode(normal), KEY_LIGHT)) * shadow float n3x, n3y, n3z; oct_decode_01(nx, ny, &n3x, &n3y, &n3z); float dif = fmaxf(0.0f, n3x*KEY_X + n3y*KEY_Y + n3z*KEY_Z) * shad; float m1r = mip1[i3 ], m1g = mip1[i3+1], m1b = mip1[i3+2]; float m2r = mip2[i3 ], m2g = mip2[i3+1], m2b = mip2[i3+2]; // prev.rgb = 0 (no temporal history) feat0[i4 ] = pack2x16f(ar, ag); feat0[i4+1] = pack2x16f(ab, nx); feat0[i4+2] = pack2x16f(ny, d ); feat0[i4+3] = pack2x16f(dzdx, dzdy); feat1[i4 ] = pack4x8u(mat, 0.0f, 0.0f, 0.0f); // mat_id, prev.rgb=0 feat1[i4+1] = pack4x8u(m1r, m1g, m1b, m2r); feat1[i4+2] = pack4x8u(m2g, m2b, dif, trp); feat1[i4+3] = 0u; } } } // --------------------------------------------------------------------------- // GPU texture helpers // --------------------------------------------------------------------------- static WGPUTexture make_feat_tex(WGPUDevice dev, int W, int H) { WGPUTextureDescriptor d = {}; d.format = WGPUTextureFormat_RGBA32Uint; d.usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst; d.dimension = WGPUTextureDimension_2D; d.size = {(uint32_t)W, (uint32_t)H, 1}; d.mipLevelCount = 1; d.sampleCount = 1; return wgpuDeviceCreateTexture(dev, &d); } static WGPUTexture make_output_tex(WGPUDevice dev, int W, int H) { WGPUTextureDescriptor d = {}; d.format = WGPUTextureFormat_RGBA16Float; d.usage = WGPUTextureUsage_StorageBinding | WGPUTextureUsage_CopySrc; d.dimension = WGPUTextureDimension_2D; d.size = {(uint32_t)W, (uint32_t)H, 1}; d.mipLevelCount = 1; d.sampleCount = 1; return wgpuDeviceCreateTexture(dev, &d); } static WGPUTextureView make_view(WGPUTexture tex, WGPUTextureFormat fmt) { WGPUTextureViewDescriptor d = {}; d.format = fmt; d.dimension = WGPUTextureViewDimension_2D; d.mipLevelCount = 1; d.arrayLayerCount = 1; return wgpuTextureCreateView(tex, &d); } static void upload_tex(WGPUQueue queue, WGPUTexture tex, const uint32_t* data, int W, int H) { WGPUTexelCopyTextureInfo dst = {}; dst.texture = tex; WGPUTexelCopyBufferLayout layout = {}; layout.bytesPerRow = (uint32_t)(W * 16); layout.rowsPerImage = (uint32_t)H; WGPUExtent3D ext = {(uint32_t)W, (uint32_t)H, 1}; wgpuQueueWriteTexture(queue, &dst, data, (size_t)(W * H * 16), &layout, &ext); } // --------------------------------------------------------------------------- // RGBA16Float readback // --------------------------------------------------------------------------- static uint16_t fp16_bits_to_f16(float f) { return f32_to_f16(f); } static float fp16_bits_to_f32(uint16_t h) { uint32_t sign = (uint32_t)(h & 0x8000u) << 16; uint32_t exp = (h & 0x7C00u) >> 10; uint32_t mant = h & 0x03FFu; if (exp == 0 && mant == 0) { float r; memcpy(&r, &sign, 4); return r; } if (exp == 31) { uint32_t b = sign | 0x7F800000u | (mant << 13); float r; memcpy(&r, &b, 4); return r; } uint32_t b = sign | ((exp + 112u) << 23) | (mant << 13); float r; memcpy(&r, &b, 4); return r; } struct MapState { bool done = false; WGPUMapAsyncStatus status = {}; }; static std::vector readback_rgba16f(WGPUDevice device, WGPUQueue queue, WGPUTexture tex, int W, int H) { const uint32_t bytes_per_px = 8; const uint32_t raw_bpr = (uint32_t)(W * bytes_per_px); const uint32_t aligned_bpr = ((raw_bpr + 255u) / 256u) * 256u; const size_t buf_size = (size_t)aligned_bpr * (size_t)H; WGPUBufferDescriptor bd = {}; bd.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead; bd.size = buf_size; WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &bd); WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(device, nullptr); WGPUTexelCopyTextureInfo src = {}; src.texture = tex; WGPUTexelCopyBufferInfo dst = {}; dst.buffer = staging; dst.layout.bytesPerRow = aligned_bpr; dst.layout.rowsPerImage = (uint32_t)H; WGPUExtent3D ext = {(uint32_t)W, (uint32_t)H, 1}; wgpuCommandEncoderCopyTextureToBuffer(enc, &src, &dst, &ext); WGPUCommandBuffer cmds = wgpuCommandEncoderFinish(enc, nullptr); wgpuQueueSubmit(queue, 1, &cmds); wgpuCommandBufferRelease(cmds); wgpuCommandEncoderRelease(enc); wgpuDevicePoll(device, true, nullptr); MapState ms = {}; WGPUBufferMapCallbackInfo mi = {}; mi.mode = WGPUCallbackMode_AllowProcessEvents; mi.callback = [](WGPUMapAsyncStatus s, WGPUStringView, void* u, void*) { auto* st = (MapState*)u; st->status = s; st->done = true; }; mi.userdata1 = &ms; wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buf_size, mi); for (int i = 0; i < 200 && !ms.done; ++i) wgpuDevicePoll(device, true, nullptr); std::vector pixels(W * H * 4, 0.0f); if (ms.done && ms.status == WGPUMapAsyncStatus_Success) { const uint8_t* mapped = (const uint8_t*) wgpuBufferGetConstMappedRange(staging, 0, buf_size); if (mapped) { for (int y = 0; y < H; ++y) { const uint16_t* row = (const uint16_t*)(mapped + (size_t)y * aligned_bpr); for (int x = 0; x < W; ++x) { for (int c = 0; c < 4; ++c) pixels[(y * W + x) * 4 + c] = fp16_bits_to_f32(row[x * 4 + c]); } } } } wgpuBufferUnmap(staging); wgpuBufferRelease(staging); return pixels; } // --------------------------------------------------------------------------- // Image I/O helpers // --------------------------------------------------------------------------- static std::vector load_png_rgb(const char* path, int* out_w, int* out_h) { int w, h, ch; uint8_t* data = stbi_load(path, &w, &h, &ch, 3); if (!data) { fprintf(stderr, "Error: cannot load '%s'\n", path); return {}; } *out_w = w; *out_h = h; std::vector out(w * h * 3); for (int i = 0; i < w * h * 3; ++i) out[i] = data[i] / 255.0f; stbi_image_free(data); return out; } // Load 2-channel (RG) from RGB PNG — takes first 2 channels static std::vector load_png_rg(const char* path, int ew, int eh) { int w, h, ch; uint8_t* data = stbi_load(path, &w, &h, &ch, 3); if (!data || w != ew || h != eh) { if (data) stbi_image_free(data); fprintf(stderr, "Warning: cannot load normal '%s' — using (0.5,0.5)\n", path); std::vector def(ew * eh * 2, 0.5f); return def; } std::vector out(w * h * 2); for (int i = 0; i < w * h; ++i) { out[i * 2 ] = data[i * 3 ] / 255.0f; out[i * 2 + 1] = data[i * 3 + 1] / 255.0f; } stbi_image_free(data); return out; } // Load 16-bit greyscale PNG → [0,1] static std::vector load_png_depth16(const char* path, int ew, int eh) { int w, h, ch; uint16_t* data = stbi_load_16(path, &w, &h, &ch, 1); if (!data || w != ew || h != eh) { if (data) stbi_image_free(data); fprintf(stderr, "Warning: cannot load depth '%s' — using 0\n", path); return std::vector(ew * eh, 0.0f); } std::vector out(w * h); for (int i = 0; i < w * h; ++i) out[i] = data[i] / 65535.0f; stbi_image_free(data); return out; } // Load 8-bit greyscale PNG → [0,1] static std::vector load_png_gray(const char* path, int ew, int eh, float default_val = 0.0f) { int w, h, ch; uint8_t* data = stbi_load(path, &w, &h, &ch, 1); if (!data || w != ew || h != eh) { if (data) stbi_image_free(data); return std::vector(ew * eh, default_val); } std::vector out(w * h); for (int i = 0; i < w * h; ++i) out[i] = data[i] / 255.0f; stbi_image_free(data); return out; } static bool save_png(const char* path, const std::vector& rgba_f32, int w, int h) { std::vector rgba8(w * h * 4); for (int i = 0; i < w * h * 4; ++i) { int v = (int)(rgba_f32[i] * 255.0f + 0.5f); rgba8[i] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); } if (!stbi_write_png(path, w, h, 4, rgba8.data(), w * 4)) { fprintf(stderr, "Error: failed to write '%s'\n", path); return false; } return true; } // --------------------------------------------------------------------------- // Weight loading // --------------------------------------------------------------------------- static bool load_weights_bin(const char* path, std::vector& out) { FILE* f = fopen(path, "rb"); if (!f) { fprintf(stderr, "Error: cannot open weights '%s'\n", path); return false; } fseek(f, 0, SEEK_END); long sz = ftell(f); rewind(f); if (sz <= 0 || sz % 4 != 0) { fprintf(stderr, "Error: bad weights file size %ld\n", sz); fclose(f); return false; } out.resize((size_t)sz / 4); if ((long)fread(out.data(), 4, out.size(), f) != sz / 4) { fprintf(stderr, "Error: read failed for '%s'\n", path); fclose(f); return false; } fclose(f); return true; } // --------------------------------------------------------------------------- // Args // --------------------------------------------------------------------------- struct Args { const char* input_path = nullptr; const char* output_path = nullptr; const char* sample_dir = nullptr; const char* weights_path = nullptr; bool debug_hex = false; }; static void print_usage(const char* prog) { fprintf(stderr, "Usage: %s input.png output.png [OPTIONS]\n", prog); fprintf(stderr, "\nOPTIONS:\n"); fprintf(stderr, " --sample-dir DIR Full sample dir with albedo/normal/depth/matid/shadow/transp\n"); fprintf(stderr, " --weights FILE Load weights from cnn_v3_weights.bin\n"); fprintf(stderr, " --debug-hex Print first 8 output pixels as hex\n"); fprintf(stderr, " --help Show this help\n"); fprintf(stderr, "\nSimple mode (single PNG): geometry channels zeroed, normal=(0.5,0.5).\n"); fprintf(stderr, "FiLM is always identity (gamma=1, beta=0).\n"); fprintf(stderr, "\nNote: feature packing uses [0,1] oct-normals (training format) to match\n"); fprintf(stderr, " infer_cnn_v3.py for direct Python/WGSL comparison.\n"); } static bool parse_args(int argc, char** argv, Args* args) { if (argc < 3) return false; args->input_path = argv[1]; args->output_path = argv[2]; for (int i = 3; i < argc; ++i) { if (strcmp(argv[i], "--sample-dir") == 0 && i + 1 < argc) { args->sample_dir = argv[++i]; } else if (strcmp(argv[i], "--weights") == 0 && i + 1 < argc) { args->weights_path = argv[++i]; } else if (strcmp(argv[i], "--debug-hex") == 0) { args->debug_hex = true; } else if (strcmp(argv[i], "--help") == 0) { return false; } else { fprintf(stderr, "Error: unknown option '%s'\n", argv[i]); return false; } } return true; } // --------------------------------------------------------------------------- // Main // --------------------------------------------------------------------------- extern void InitShaderComposer(); int main(int argc, char** argv) { Args args; if (!parse_args(argc, argv, &args)) { print_usage(argv[0]); return 1; } // Init GPU WebGPUTestFixture fixture; if (!fixture.init()) { fprintf(stderr, "Error: WebGPU device unavailable\n"); return 1; } InitShaderComposer(); GpuContext ctx = fixture.ctx(); // --- Load input image --- int W, H; std::vector albedo = load_png_rgb(args.input_path, &W, &H); if (albedo.empty()) return 1; // Pad to multiples of 4 (U-Net requires 2 pooling levels) const int W4 = (W + 3) & ~3; const int H4 = (H + 3) & ~3; if (W4 != W || H4 != H) { printf("Padding %dx%d → %dx%d\n", W, H, W4, H4); std::vector padded(W4 * H4 * 3, 0.0f); for (int y = 0; y < H; ++y) for (int x = 0; x < W; ++x) for (int c = 0; c < 3; ++c) padded[(y * W4 + x) * 3 + c] = albedo[(y * W + x) * 3 + c]; albedo = std::move(padded); W = W4; H = H4; } printf("Input: %s (%dx%d)\n", args.input_path, W, H); // --- Build FeatureImages --- FeatureImages img; img.w = W; img.h = H; img.albedo = albedo; if (args.sample_dir) { printf("Mode: full (%s)\n", args.sample_dir); auto path = [&](const char* name) -> std::string { return std::string(args.sample_dir) + "/" + name; }; img.normal = load_png_rg(path("normal.png").c_str(), W, H); img.depth = load_png_depth16(path("depth.png").c_str(), W, H); img.matid = load_png_gray(path("matid.png").c_str(), W, H, 0.0f); img.shadow = load_png_gray(path("shadow.png").c_str(), W, H, 1.0f); img.transp = load_png_gray(path("transp.png").c_str(), W, H, 0.0f); } else { printf("Mode: simple (geometry zeroed, normal=(0.5,0.5))\n"); img.normal.assign(W * H * 2, 0.5f); img.depth.assign(W * H, 0.0f); img.matid.assign(W * H, 0.0f); img.shadow.assign(W * H, 1.0f); img.transp.assign(W * H, 0.0f); } // --- Pack features --- std::vector feat0, feat1; pack_features(img, feat0, feat1); // --- Create GPU textures --- WGPUTexture feat0_tex = make_feat_tex(ctx.device, W, H); WGPUTexture feat1_tex = make_feat_tex(ctx.device, W, H); WGPUTexture out_tex = make_output_tex(ctx.device, W, H); WGPUTextureView feat0_view = make_view(feat0_tex, WGPUTextureFormat_RGBA32Uint); WGPUTextureView feat1_view = make_view(feat1_tex, WGPUTextureFormat_RGBA32Uint); WGPUTextureView out_view = make_view(out_tex, WGPUTextureFormat_RGBA16Float); upload_tex(ctx.queue, feat0_tex, feat0.data(), W, H); upload_tex(ctx.queue, feat1_tex, feat1.data(), W, H); // --- Wire CNNv3Effect --- NodeRegistry registry(ctx.device, W, H); registry.set_external_view("feat0", feat0_view); registry.set_external_view("feat1", feat1_view); registry.set_external_view("cnn_out", out_view); CNNv3Effect effect(ctx, {"feat0", "feat1"}, {"cnn_out"}, 0.0f, 1000.0f); effect.declare_nodes(registry); // --- Load weights --- if (args.weights_path) { std::vector wdata; if (!load_weights_bin(args.weights_path, wdata)) return 1; effect.upload_weights(ctx.queue, wdata.data(), (uint32_t)(wdata.size() * 4)); printf("Weights: %s (%zu bytes)\n", args.weights_path, wdata.size() * 4); } else { printf("Weights: default (from assets, zero if absent)\n"); } // --- Run 5 compute passes --- WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); UniformsSequenceParams params = {}; params.resolution = {(float)W, (float)H}; params.aspect_ratio = (float)W / (float)H; effect.render(enc, params, registry); WGPUCommandBuffer cmds = wgpuCommandEncoderFinish(enc, nullptr); wgpuQueueSubmit(ctx.queue, 1, &cmds); wgpuCommandBufferRelease(cmds); wgpuCommandEncoderRelease(enc); wgpuDevicePoll(ctx.device, true, nullptr); // --- Readback --- std::vector pixels = readback_rgba16f(ctx.device, ctx.queue, out_tex, W, H); // --- Save output (crop to original size, already same if no padding) --- if (!save_png(args.output_path, pixels, W, H)) return 1; printf("Saved: %s\n", args.output_path); if (args.debug_hex) { printf("First 8 output pixels (RGBA f32 → hex):\n"); for (int i = 0; i < 8 && i < W * H; ++i) { float r = pixels[i*4 ], g = pixels[i*4+1]; float b = pixels[i*4+2], a = pixels[i*4+3]; int ri = (int)(r*255+.5f), gi = (int)(g*255+.5f); int bi = (int)(b*255+.5f), ai = (int)(a*255+.5f); ri = ri<0?0:ri>255?255:ri; gi = gi<0?0:gi>255?255:gi; bi = bi<0?0:bi>255?255:bi; ai = ai<0?0:ai>255?255:ai; printf(" [%d] 0x%02X%02X%02X%02X (%.4f %.4f %.4f %.4f)\n", i, ri, gi, bi, ai, r, g, b, a); } } // Cleanup wgpuTextureViewRelease(feat0_view); wgpuTextureViewRelease(feat1_view); wgpuTextureViewRelease(out_view); wgpuTextureRelease(feat0_tex); wgpuTextureRelease(feat1_tex); wgpuTextureRelease(out_tex); return 0; }