13 files changed, 1049 insertions, 1518 deletions
diff --git a/cmake/DemoTests.cmake b/cmake/DemoTests.cmake
index 69b9195..59859c5 100644
--- a/cmake/DemoTests.cmake
+++ b/cmake/DemoTests.cmake
@@ -196,17 +196,17 @@ add_demo_test(test_gpu_procedural GpuProceduralTest gpu
 target_link_libraries(test_gpu_procedural PRIVATE 3d gpu audio procedural util ${DEMO_LIBS})
 demo_add_asset_deps(test_gpu_procedural shaders)
 
-# CNN shader testing tool (only when STRIP_ALL is OFF and workspace is main)
+# CNN v3 shader testing tool (only when STRIP_ALL is OFF and workspace is main)
 if(NOT DEMO_STRIP_ALL AND DEMO_WORKSPACE STREQUAL "main")
     add_executable(cnn_test
         tools/cnn_test.cc
         src/tests/common/webgpu_test_fixture.cc
-        src/tests/common/offscreen_render_target.cc
         ${PLATFORM_SOURCES}
         ${GEN_DEMO_CC})
 
     target_include_directories(cnn_test PRIVATE
         ${CMAKE_CURRENT_SOURCE_DIR}/src
+        ${CMAKE_CURRENT_SOURCE_DIR}/cnn_v3/src
         ${CMAKE_CURRENT_SOURCE_DIR}/third_party
         ${CMAKE_CURRENT_BINARY_DIR}/src/generated
         ${CORE_INCLUDES})
diff --git a/cnn_v3/docs/HOWTO.md b/cnn_v3/docs/HOWTO.md
index 5cfc371..1aead68 100644
--- a/cnn_v3/docs/HOWTO.md
+++ b/cnn_v3/docs/HOWTO.md
@@ -233,12 +233,13 @@ channel-dropout training.
 
 ```bash
 python3 cnn_v3/training/pack_photo_sample.py \
-    --photo cnn_v3/training/input/photo1.jpg \
+    --photo  input/photo1.jpg \
+    --target target/photo1_styled.png \
     --output dataset/photos/sample_001/
 ```
 
-The output `target.png` defaults to the input photo (no style). Copy in
-your stylized version as `target.png` before training.
+`--target` is required and must be a stylized ground-truth image at the same
+resolution as the photo. The script writes it as `target.png` in the sample dir.
 
 ### Dataset layout
 
@@ -285,10 +286,31 @@ python3 train_cnn_v3.py \
     --patch-size 32 --detector random
 ```
 
+### Single-sample training
+
+Use `--single-sample <dir>` to train on one specific sample directory.
+Implies `--full-image` and `--batch-size 1` automatically.
+
+```bash
+# Pack input/target pair into a sample directory first
+python3 pack_photo_sample.py \
+    --photo  input/photo1.png \
+    --target target/photo1_styled.png \
+    --output dataset/simple/sample_001/
+
+# Train on that sample only
+python3 train_cnn_v3.py \
+    --single-sample dataset/simple/sample_001/ \
+    --epochs 500
+```
+
+All other flags (`--epochs`, `--lr`, `--checkpoint-dir`, `--enc-channels`, etc.) work normally.
+
 ### Key flags
 
 | Flag | Default | Notes |
 |------|---------|-------|
+| `--single-sample DIR` | — | Train on one sample dir; implies `--full-image`, `--batch-size 1` |
 | `--input DIR` | `training/dataset` | Root with `full/` or `simple/` subdirs |
 | `--input-mode` | `simple` | `simple`=photos, `full`=Blender G-buffer |
 | `--patch-size N` | `64` | Patch crop size |
@@ -587,9 +609,145 @@ Visualization panel still works.
 
 ---
 
-## 10. See Also
+## 10. Python / WGSL Parity Check (infer_cnn_v3 + cnn_test)
+
+Two complementary tools for comparing PyTorch inference against the live WGSL
+compute shaders on the same input image.
+
+### 10a. infer_cnn_v3.py — PyTorch reference inference
+
+**Location:** `cnn_v3/training/infer_cnn_v3.py`
+
+Runs the trained `CNNv3` model in Python and saves the RGBA output as PNG.
+
+**Simple mode** (single PNG, geometry zeroed):
+```bash
+cd cnn_v3/training
+python3 infer_cnn_v3.py photo.png out_python.png \
+    --checkpoint checkpoints/checkpoint_epoch_200.pth
+```
+
+**Full mode** (sample directory with all G-buffer files):
+```bash
+python3 infer_cnn_v3.py dataset/simple/sample_000/ out_python.png \
+    --checkpoint checkpoints/checkpoint_epoch_200.pth
+```
+
+**Identity FiLM** — bypass MLP, use γ=1 β=0 (matches C++ `cnn_test` default):
+```bash
+python3 infer_cnn_v3.py photo.png out_python.png \
+    --checkpoint checkpoints/checkpoint_epoch_200.pth \
+    --identity-film
+```
+
+**Options:**
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--checkpoint CKPT` | auto-find latest | Path to `.pth` checkpoint |
+| `--enc-channels C` | from checkpoint | `4,8` — must match training config |
+| `--cond F F F F F` | `0 0 0 0 0` | FiLM conditioning (beat_phase, beat_norm, audio, style0, style1) |
+| `--identity-film` | off | Bypass FiLM MLP, use γ=1 β=0 |
+| `--blend F` | `1.0` | Blend with albedo: 0=input, 1=CNN |
+| `--debug-hex` | off | Print first 8 output pixels as hex |
+
+In **simple mode**, geometry channels take neutral defaults: `normal=(0.5,0.5)` (oct-decodes
+to ≈(0,0,1)), `depth=0`, `matid=0`, `shadow=1`, `transp=0`.
+
+The checkpoint `config` dict (saved by `train_cnn_v3.py`) sets `enc_channels`
+and `film_cond_dim` automatically; `--enc-channels` is only needed if the
+checkpoint lacks a config key.
+
+---
+
+### 10b. cnn_test — WGSL / GPU reference inference
+
+**Location:** `tools/cnn_test.cc`  **Binary:** `build/cnn_test`
+
+Packs the same 20-channel feature tensor as `infer_cnn_v3.py`, uploads it to
+GPU, runs the five `CNNv3Effect` compute passes, and saves the RGBA16Float
+output as PNG.
+
+**Build** (requires `DEMO_BUILD_TESTS=ON` or `DEMO_WORKSPACE=main`):
+```bash
+cmake -B build -DDEMO_BUILD_TESTS=ON && cmake --build build -j4 --target cnn_test
+```
+
+**Simple mode:**
+```bash
+./build/cnn_test photo.png out_gpu.png --weights workspaces/main/weights/cnn_v3_weights.bin
+```
+
+**Full mode** (sample directory):
+```bash
+./build/cnn_test dataset/simple/sample_000/albedo.png out_gpu.png \
+    --sample-dir dataset/simple/sample_000/ \
+    --weights workspaces/main/weights/cnn_v3_weights.bin
+```
+
+**Options:**
+
+| Flag | Description |
+|------|-------------|
+| `--sample-dir DIR` | Load all G-buffer files (albedo/normal/depth/matid/shadow/transp) |
+| `--weights FILE` | Path to `cnn_v3_weights.bin` (uses asset-embedded weights if omitted) |
+| `--debug-hex` | Print first 8 output pixels as hex |
+| `--help` | Show usage |
+
+FiLM is always **identity** (γ=1, β=0) — matching the C++ `CNNv3Effect` default
+until GPU-side FiLM MLP evaluation is added.
+
+---
+
+### 10c. Side-by-side comparison
+
+For a pixel-accurate comparison, use `--identity-film` in Python and `--debug-hex`
+in both tools:
+
+```bash
+cd cnn_v3/training
+
+# 1. Python inference (identity FiLM)
+python3 infer_cnn_v3.py photo.png out_python.png \
+    --checkpoint checkpoints/checkpoint_epoch_200.pth \
+    --identity-film --debug-hex
+
+# 2. GPU inference (always identity FiLM)
+../../build/cnn_test photo.png out_gpu.png \
+    --weights ../../workspaces/main/weights/cnn_v3_weights.bin \
+    --debug-hex
+```
+
+Both tools print first 8 pixels in the same format:
+```
+  [0] 0x7F804000  (0.4980 0.5020 0.2510 0.0000)
+```
+
+**Expected delta:** ≤ 1/255 (≈ 4e-3) per channel, matching the parity test
+(`test_cnn_v3_parity`). Larger deltas indicate a weight mismatch — re-export
+with `export_cnn_v3_weights.py` and verify the `.bin` size is 3928 bytes.
+
+---
+
+### 10d. Feature format note
+
+Both tools pack features in **training format** ([0,1] oct-encoded normals),
+not the runtime `gbuf_pack.wgsl` format (which remaps normals to [-1,1]).
+This makes `infer_cnn_v3.py` ↔ `cnn_test` directly comparable.
+
+The live pipeline (`GBufferEffect → gbuf_pack.wgsl → CNNv3Effect`) uses [-1,1]
+normals — that is the intended inference distribution after a full training run
+with `--input-mode full` (Blender renders). For training on photos
+(`--input-mode simple`), [0,1] normals are correct since channel dropout
+teaches the network to handle absent geometry.
+
+---
+
+## 11. See Also
 
 - `cnn_v3/docs/CNN_V3.md` — Full architecture design (U-Net, FiLM, feature layout)
 - `doc/EFFECT_WORKFLOW.md` — General effect integration guide
 - `cnn_v2/docs/CNN_V2.md` — Reference implementation (simpler, operational)
 - `src/tests/gpu/test_demo_effects.cc` — GBufferEffect + GBufViewEffect tests
+- `src/tests/gpu/test_cnn_v3_parity.cc` — Zero/random weight parity tests
+- `cnn_v3/training/export_cnn_v3_weights.py` — Export trained checkpoint → `.bin`
diff --git a/cnn_v3/docs/HOW_TO_CNN.md b/cnn_v3/docs/HOW_TO_CNN.md
index 4966a61..f5f1b1a 100644
--- a/cnn_v3/docs/HOW_TO_CNN.md
+++ b/cnn_v3/docs/HOW_TO_CNN.md
@@ -58,13 +58,13 @@ Input: 20-channel G-buffer feature textures (rgba32uint)
 ```
 photos/Blender → pack → dataset/ → train_cnn_v3.py → checkpoint.pth
                                                             │
-                                           export_cnn_v3_weights.py
-                                                  ┌─────────┴──────────┐
-                                          cnn_v3_weights.bin    cnn_v3_film_mlp.bin
-                                                  │
-                                     CNNv3Effect::upload_weights()
-                                                  │
-                                         demo / HTML tool
+                                           export_cnn_v3_weights.py [--html]
+                                      ┌──────────┴────────────┬──────────────┐
+                               cnn_v3_weights.bin   cnn_v3_film_mlp.bin   weights.js
+                                      │                                  (HTML tool
+                         CNNv3Effect::upload_weights()                    defaults)
+                                      │
+                                    demo
 ```
 
 ---
@@ -107,15 +107,6 @@ The network learns the mapping `albedo → target`. If you pass the same image a
 input and target, the network learns identity (useful as sanity check, not for real
 training). Confirm `target.png` looks correct before running training.
 
-**Alternative — pack without a target yet:**
-```bash
-python3 pack_photo_sample.py \
-    --photo /path/to/photo.png \
-    --output dataset/simple/sample_001/
-# target.png defaults to a copy of the input; replace it before training:
-cp my_stylized_version.png dataset/simple/sample_001/target.png
-```
-
 **Batch packing:**
 ```bash
 for f in photos/*.png; do
@@ -284,29 +275,36 @@ The U-Net conv weights and FiLM MLP train **jointly** in a single run. No separa
 
 ### Prerequisites
 
+`train_cnn_v3.py` and `export_cnn_v3_weights.py` carry inline `uv` dependency metadata
+(`# /// script`). Use `uv run` — no manual `pip install` needed:
+
 ```bash
-pip install torch torchvision pillow numpy opencv-python
 cd cnn_v3/training
+uv run train_cnn_v3.py --input dataset/ --epochs 1 --patch-size 32 --detector random
 ```
 
-**With `uv` (no pip needed):** dependencies are declared inline in `train_cnn_v3.py`
-and installed automatically on first run:
+**Without `uv` (manual pip):**
 ```bash
+pip install torch torchvision pillow numpy opencv-python
 cd cnn_v3/training
-uv run train_cnn_v3.py --input dataset/ --epochs 1 --patch-size 32 --detector random
+python3 train_cnn_v3.py ...
 ```
 
+The pack scripts (`pack_photo_sample.py`, `pack_blender_sample.py`) and
+`gen_test_vectors.py` do **not** have uv metadata — run them with `python3` directly
+(they only need `numpy`, `pillow`, and optionally `openexr`).
+
 ### Quick-start commands
 
 **Smoke test — 1 epoch, validates end-to-end without GPU:**
 ```bash
-python3 train_cnn_v3.py --input dataset/ --epochs 1 \
+uv run train_cnn_v3.py --input dataset/ --epochs 1 \
     --patch-size 32 --detector random
 ```
 
 **Standard photo training (patch-based):**
 ```bash
-python3 train_cnn_v3.py \
+uv run train_cnn_v3.py \
     --input dataset/ \
     --input-mode simple \
     --epochs 200
@@ -314,7 +312,7 @@ python3 train_cnn_v3.py \
 
 **Blender G-buffer training:**
 ```bash
-python3 train_cnn_v3.py \
+uv run train_cnn_v3.py \
     --input dataset/ \
     --input-mode full \
     --epochs 200
@@ -322,17 +320,29 @@ python3 train_cnn_v3.py \
 
 **Full-image mode (better global coherence, slower):**
 ```bash
-python3 train_cnn_v3.py \
+uv run train_cnn_v3.py \
     --input dataset/ \
     --input-mode full \
     --full-image --image-size 256 \
     --epochs 500
 ```
 
+**Single-sample training (overfit on one input/target pair):**
+```bash
+# Pack first
+./gen_sample.sh input/photo.png target/photo_styled.png dataset/simple/sample_001/
+
+# Train — --full-image and --batch-size 1 are implied
+uv run train_cnn_v3.py \
+    --single-sample dataset/simple/sample_001/ \
+    --epochs 500
+```
+
 ### Flag reference
 
 | Flag | Default | Notes |
 |------|---------|-------|
+| `--single-sample DIR` | — | Train on one sample dir; implies `--full-image`, `--batch-size 1` |
 | `--input DIR` | `training/dataset` | Dataset root; always set explicitly |
 | `--input-mode` | `simple` | `simple`=photos, `full`=Blender G-buffer |
 | `--epochs N` | 200 | 500 recommended for full-image mode |
@@ -340,7 +350,8 @@ python3 train_cnn_v3.py \
 | `--lr F` | 1e-3 | Reduce to 1e-4 if loss oscillates or NaN |
 | `--patch-size N` | 64 | Smaller = faster epoch, less spatial context |
 | `--patches-per-image N` | 256 | Reduce for small datasets |
-| `--detector` | `harris` | `random` for smoke tests; `shi-tomasi` as alternative |
+| `--detector` | `harris` | `random` for smoke tests; also `shi-tomasi`, `fast`, `gradient` |
+| `--patch-search-window N` | 0 | Search ±N px in target to find best alignment (grayscale MSE) per patch; 0=disabled. Use when source and target are not perfectly co-registered (e.g. photo + hand-painted target). Offsets cached at dataset init. |
 | `--channel-dropout-p F` | 0.3 | Lower if all samples have geometry (Blender only) |
 | `--full-image` | off | Resize full image instead of patch crops |
 | `--image-size N` | 256 | Resize target; only used with `--full-image` |
@@ -348,6 +359,7 @@ python3 train_cnn_v3.py \
 | `--film-cond-dim N` | 5 | Must match `CNNv3FiLMParams` field count in C++ |
 | `--checkpoint-dir DIR` | `checkpoints/` | Set per-experiment |
 | `--checkpoint-every N` | 50 | 0 to disable intermediate checkpoints |
+| `--resume [CKPT]` | — | Resume from the given checkpoint path; if no path is given, uses the latest checkpoint in `--checkpoint-dir` |
 
 ### Architecture at startup
 
@@ -454,14 +466,27 @@ The final checkpoint is always written even if `--checkpoint-every 0`.
 
 ## 3. Exporting Weights
 
-Converts a trained `.pth` checkpoint to two raw binary files for the C++ runtime.
+Converts a trained `.pth` checkpoint to two raw binary files for the C++ runtime,
+and optionally updates the HTML tool's embedded defaults.
 
+**Standard export (C++ runtime only):**
 ```bash
 cd cnn_v3/training
-python3 export_cnn_v3_weights.py checkpoints/checkpoint_epoch_200.pth \
+uv run export_cnn_v3_weights.py checkpoints/checkpoint_epoch_200.pth \
     --output ../../workspaces/main/weights/
 ```
 
+**Export + update HTML tool defaults (`cnn_v3/tools/weights.js`):**
+```bash
+uv run export_cnn_v3_weights.py checkpoints/checkpoint_epoch_200.pth \
+    --output ../../workspaces/main/weights/ \
+    --html
+```
+
+`--html` base64-encodes both `.bin` files and rewrites `cnn_v3/tools/weights.js`
+so the HTML tool loads the new weights as its embedded defaults at startup.
+Use `--html-output PATH` to write to a different `weights.js` location.
+
 Output files are registered in `workspaces/main/assets.txt` as:
 ```
 WEIGHTS_CNN_V3, BINARY, weights/cnn_v3_weights.bin, "CNN v3 conv weights (f16, 3928 bytes)"
@@ -543,10 +568,12 @@ It owns:
 
 ```
 SEQUENCE 0 0 "Scene with CNN v3"
-  EFFECT + GBufferEffect prev_cnn -> gbuf_feat0 gbuf_feat1  0 60
-  EFFECT + CNNv3Effect   gbuf_feat0 gbuf_feat1 -> sink       0 60
+  EFFECT + GBufferEffect source -> gbuf_feat0 gbuf_feat1  0 60
+  EFFECT + CNNv3Effect   gbuf_feat0 gbuf_feat1 -> sink    0 60
 ```
 
+Temporal feedback (`prev_cnn`) is wired automatically by `wire_dag()` — no explicit input needed in the `.seq` file.
+
 Or direct C++:
 ```cpp
 #include "cnn_v3/src/cnn_v3_effect.h"
@@ -636,8 +663,8 @@ Do not reference them from outside the effect unless debugging.
 
 ```bash
 cmake -B build -DCMAKE_BUILD_TYPE=Release
-cmake --build build -j$(nproc)
-./build/demo
+cmake --build build -j4
+./build/demo64k
 ```
 
 ### Expected visual output
@@ -733,13 +760,14 @@ If results drift after shader edits, verify these invariants match the Python re
 
 ## 7. HTML WebGPU Tool
 
-**Location:** `cnn_v3/tools/` — three files, no build step.
+**Location:** `cnn_v3/tools/` — four files, no build step.
 
 | File | Lines | Contents |
 |------|-------|----------|
-| `index.html` | 147 | HTML + CSS |
-| `shaders.js` | 252 | WGSL shader constants, weight-offset constants |
-| `tester.js` | 540 | `CNNv3Tester` class, event wiring |
+| `index.html` | 168 | HTML + CSS |
+| `shaders.js` | 312 | WGSL shader constants, weight-offset constants |
+| `tester.js` | 913 | `CNNv3Tester` class, inference pipeline, layer viz |
+| `weights.js` | 7 | Embedded default weights (base64); auto-generated by `--html` |
 
 ### Usage
 
@@ -750,32 +778,27 @@ python3 -m http.server 8080
 # Open: http://localhost:8080/cnn_v3/tools/
 ```
 
-Or on macOS with Chrome:
+Weights are **loaded automatically at startup**: the tool first tries to fetch the latest
+`workspaces/main/weights/*.bin` over HTTP (this works when served from the repo root)
+and falls back to the base64 defaults embedded in `weights.js` if the fetch fails.
+Use the **↺ Reload** button to re-fetch after updating weights on disk.
+
+To update the embedded defaults after a training run, use `--html` (§3):
 ```bash
-open -a "Google Chrome" --args --allow-file-access-from-files
-open cnn_v3/tools/index.html
+uv run export_cnn_v3_weights.py checkpoints/checkpoint.pth \
+    --output ../../workspaces/main/weights/ --html
 ```
 
 ### Workflow
 
-1. **Drop `cnn_v3_weights.bin`** onto the left "weights" drop zone.
-2. **Drop a PNG or video** onto the centre canvas → CNN runs immediately.
-3. _(Optional)_ **Drop `cnn_v3_film_mlp.bin`** → FiLM sliders become active.
-4. Adjust **beat_phase / beat_norm / audio_int / style_p0 / style_p1** sliders → reruns on change.
-5. Click layer buttons (**Feat · Enc0 · Enc1 · BN · Dec1 · Output**) in the right panel to inspect activations.
-6. **Save PNG** to export the current output.
+1. **Drop a PNG or video** onto the canvas → CNN runs immediately (weights pre-loaded).
+2. Adjust **beat_phase / beat_norm / audio_int / style_p0 / style_p1** sliders.
+3. Click layer buttons (**Feat · Enc0 · Enc1 · BN · Dec1 · Output**) to inspect activations.
+4. **Save PNG** to export the current output.
+5. _(Optional)_ Drop updated `.bin` files onto the left panel to override embedded weights.
 
 Keyboard: `[SPACE]` toggle original · `[D]` diff×10.
 
-### Input files
-
-| File | Format | Notes |
-|------|--------|-------|
-| `cnn_v3_weights.bin` | raw u32 (no header) | 982 u32 = 1964 f16 = ~3.9 KB |
-| `cnn_v3_film_mlp.bin` | raw f32 | 776 f32 = 3.1 KB; optional — identity FiLM used if absent |
-
-Both produced by `export_cnn_v3_weights.py` (§3).
-
 ### Texture chain
 
 | Texture | Format | Size |
@@ -816,7 +839,7 @@ all geometric channels (normal, depth, depth_grad, mat_id, prev) = 0.
 | `cnn_v3/training/pack_photo_sample.py` | Photo → zeroed-geometry sample directory |
 | `cnn_v3/training/cnn_v3_utils.py` | Dataset class, feature assembly, channel dropout, salient-point detection |
 | `cnn_v3/training/train_cnn_v3.py` | CNNv3 model definition, training loop, CLI |
-| `cnn_v3/training/export_cnn_v3_weights.py` | Checkpoint → `cnn_v3_weights.bin` + `cnn_v3_film_mlp.bin` |
+| `cnn_v3/training/export_cnn_v3_weights.py` | Checkpoint → `cnn_v3_weights.bin` + `cnn_v3_film_mlp.bin`; `--html` rewrites `weights.js` |
 | `cnn_v3/training/gen_test_vectors.py` | NumPy reference forward pass + C header generator |
 | `cnn_v3/test_vectors.h` | Compiled-in test vectors (auto-generated, do not edit) |
 | `cnn_v3/src/cnn_v3_effect.h` | C++ class, Params structs, `CNNv3FiLMParams` API |
@@ -827,6 +850,7 @@ all geometric channels (normal, depth, depth_grad, mat_id, prev) = 0.
 | `cnn_v3/tools/index.html` | HTML tool — UI shell + CSS |
 | `cnn_v3/tools/shaders.js` | HTML tool — inline WGSL shaders + weight-offset constants |
 | `cnn_v3/tools/tester.js` | HTML tool — CNNv3Tester class, inference pipeline, layer viz |
+| `cnn_v3/tools/weights.js` | HTML tool — embedded default weights (base64, auto-generated) |
 | `cnn_v2/tools/cnn_v2_test/index.html` | HTML tool reference pattern (v2) |
 
 ---
diff --git a/cnn_v3/tools/index.html b/cnn_v3/tools/index.html
index 26fee9b..6c7b406 100644
--- a/cnn_v3/tools/index.html
+++ b/cnn_v3/tools/index.html
@@ -162,6 +162,7 @@ video{display:none}
 </div>
 
 <script src="shaders.js"></script>
+<script src="weights.js"></script>
 <script src="tester.js"></script>
 </body>
 </html>
diff --git a/cnn_v3/tools/tester.js b/cnn_v3/tools/tester.js
index 0412cae..81c869d 100644
--- a/cnn_v3/tools/tester.js
+++ b/cnn_v3/tools/tester.js
@@ -52,29 +52,34 @@ class CNNv3Tester {
   async preload() {
     const base = '../../workspaces/main/weights/';
     const files = [
-      {url: base+'cnn_v3_weights.bin',  isFilm: false},
-      {url: base+'cnn_v3_film_mlp.bin', isFilm: true},
+      {url: base+'cnn_v3_weights.bin',  isFilm: false, b64: CNN_V3_WEIGHTS_B64},
+      {url: base+'cnn_v3_film_mlp.bin', isFilm: true,  b64: CNN_V3_FILM_MLP_B64},
     ];
-    for (const {url, isFilm} of files) {
+    for (const {url, isFilm, b64} of files) {
+      let buf = null;
+      const name = url.split('/').pop();
       try {
         const r = await fetch(url);
-        if (!r.ok) { this.log(`preload skip: ${url.split('/').pop()} (${r.status})`); continue; }
-        const buf = await r.arrayBuffer();
-        const name = url.split('/').pop();
-        if (isFilm) {
-          this.filmMlp = this.parseFilm(buf);
-          const el = document.getElementById('fDrop');
-          el.textContent = `✓ ${name}`; el.classList.add('ok');
-          document.getElementById('fSt').textContent = 'FiLM MLP loaded';
-          document.getElementById('fSt').style.color = '#28a745';
-        } else {
-          this.weightsU32 = this.parseWeights(buf); this.weightsBuffer = buf;
-          if (this.weightsGPU) { this.weightsGPU.destroy(); this.weightsGPU = null; }
-          const el = document.getElementById('wDrop');
-          el.textContent = `✓ ${name}`; el.classList.add('ok');
-        }
-        this.log(`Preloaded: ${name}`);
-      } catch(e) { this.log(`preload error (${url.split('/').pop()}): ${e.message}`, 'err'); }
+        if (r.ok) { buf = await r.arrayBuffer(); this.log(`Preloaded: ${name}`); }
+      } catch(_) {}
+      if (!buf) {
+        const s = atob(b64); const u = new Uint8Array(s.length);
+        for (let i = 0; i < s.length; i++) u[i] = s.charCodeAt(i);
+        buf = u.buffer;
+        this.log(`Loaded embedded: ${name}`);
+      }
+      if (isFilm) {
+        this.filmMlp = this.parseFilm(buf);
+        const el = document.getElementById('fDrop');
+        el.textContent = `✓ ${name}`; el.classList.add('ok');
+        document.getElementById('fSt').textContent = 'FiLM MLP loaded';
+        document.getElementById('fSt').style.color = '#28a745';
+      } else {
+        this.weightsU32 = this.parseWeights(buf); this.weightsBuffer = buf;
+        if (this.weightsGPU) { this.weightsGPU.destroy(); this.weightsGPU = null; }
+        const el = document.getElementById('wDrop');
+        el.textContent = `✓ ${name}`; el.classList.add('ok');
+      }
     }
     if (this.weightsU32) {
       if (this.image || this.isVideo) this.run();
diff --git a/cnn_v3/tools/weights.js b/cnn_v3/tools/weights.js
new file mode 100644
index 0000000..dde1ed4
--- /dev/null
+++ b/cnn_v3/tools/weights.js
@@ -0,0 +1,4 @@
+'use strict';
+// Auto-generated by export_cnn_v3_weights.py --html  — do not edit by hand.
+const CNN_V3_WEIGHTS_B64='ias6I32xLDG5Masbdq4qIz+xrLQcshe3Ja1drluwb7crtHi38DZ8OL02eTh4Oe44HDTzN381TpwQqDCpCiP2pjipZywPL7CjNipXJc2qwiraJoetwijzphmqfCimJRgsX6tvqeuie6cRqoMpBhvSpbUiWSMFIlqrzCnHDiSiE6zJpYshR6udJMAmdRSkqHMVq6v0J68o6SL3p/mroia3I0uqEKobrMSdOqY5LAmgACqWqjch/SpcopUq6iJkJpEs6CumqH+lYqvLqjUs0ip5oKAkdqq9CTcnfamjJgSp36TmLZAsQLHTotyn27QPs1a0L7RwtOGwerXMswizdDeQOPw4BDj+OB85ADUuNhU4r7QGsAG0b7PsrP6ynrUosfuzALX4tNi3yrbNtDu2RbhRtSy4/TTgNrE1DDV2NyE2ijIMNqIyMibho5KvrSA1olmnQy31LbSm1KZPLHgpYqytLEKsuqVeHq+sWTWQObez6ze1PLI0krIhOdgsdaEKN6C2EzOBOzQ2WLePNx8wMrTRMiW4J7LMOBcvrrq/LkWtdq1vLIIgdKqAJSAvxa0SKXsoR6/QnXosIarzLJwjxa/VKmclhabVKL4lTSo7LNopdaspqzIp3Z4oqU2pNyxpqQ6fWKvCq3cmtqksmaEgF6hfoI2fc6rSJ0SoxxpYoRGfvijcqTqmEiNpLHKa6yDtKqUsgaj5IVsnopQupcSsNabjKqkq4SuGqjWIeavyItemJhvpK2WsI6snKwKruqijqY+sZzM/NsGzjDWZOTU0oKnJNmMtI7DBMSyx4C9JOZs0UbTvMl+nd7OMJYa1z6jONRQwy7d/EQir56pzMUqvhCzONFIj8rJVJZOvU60gLiuwMyxgNSwh9rPcLpiuDazUMWGs/6iENDsw1bSUKHCvsLFKL4UtSqloIxUw57DtoHEtYayELMEiLCDlqpymv6nIqhWqIq+9sfOwH7JjtLG0HrJAsLety7RKtbm0ibSysym0ebE1r4cdSzRJNYw0LzWVN6s3ZThgOOA4m6uKLSqsuCuOM1sucrK7pp+qPLF2J8+sIR2BM5uaH7ARLzGtn6eSK90npywwK1aoZKV6rC0qyxREq6asNCmZJVGo2is9qZ2VhSpLmYosHiZ0o3WWqCJ/qSAoXiY7Kk0owJ+QK+SoR6vnptEV3apbqVglNizUoiWigqZupOKdCayWKW8r1imMqvYiyyu6pOYr8imLKd4pGyyAK0ekOagyp1Mo36/ur8q0wa+Zsrmx567Er3e0FbXLtKO0Fa/1suazq611rHWupzVoNuY12ze3OLk32DgVOac5N7a1s5G0SrVpsZO0VLXJsvGxVLQ1tB20YbWGsTawL7JDsMSwtTXBN0E3AzhuOGM4GDcVOYM5hrGVLaSmV6DHMwowdLO9Lw2wtiHHp3ms1SUyKiccY6Z6qWciQhnhqO8o1CY9rNclpKwpKoYjlyDzo/WrwiQdpcCrXiqmqh4qgZ0TJvynaCzGnUCsTyWfLL+nqSpMHYkkOaTpmian9iiRmL0q4SWtJd4n9KiXqQKmrhVtKAusHZ/KKJGqISYXJxUhXCkEmKGsVSorpK0sDKxUpygmHCA+KWqquyfzqoqosabFqi2hoCsZp7or5ilzKtsmrxmCo2is0ycBJjUsvKyzrD8o9KkGq+aqtSz4qMYlcKTuKLOo4yClIAshSKzLJIKoDqCYpxwshqwSingh55sqkSQrgqa+JH4qQSVRKBId0JUYrDWdciwRIderTKW9LGasCSyuKpKnrKriqomXayzqJn6oMh5OIUwr4R1EKOSkaqbBqnUqoJ0mK9KfXiugH8ckgyyIqsKr8ic8ojklvCYcq86nKiaoKpeooqusoQ+s3Si6oQQfvaCgK88qviqtq0ShcBhLJxKstaXCqgEYTSrrnM2goK9ft+m5NCcrHicvOjDPoWMztSiasJ4lU7U/N0A2rbSlLO426i/ps
y60Qrgtti61kLDcsEmzRqfYNbMbVzc/sGYnTq6LrjIsIzDgrZAqQa+/sEcpE5jkLrCuya6uqGqkabBbIWunbyzBroyw9JkIMDkp7K7NryUoiyrIKC2wqCtvrUIo8q2ArdOtkCrdrgKuRLFXMGwtiC8+r78wSrCGsIwwRy7MMPUvKbCZqmWwzSkqIdEwOyWYKQCxX621LHmwDCFhK6UwERn4LFcrOS5SrWSnEa7qqMUmnC9uin+u1jBzN6stMjaAsf24abC+MJuxVbGZNIKrZDiBsM+xbzVOLZKzrzQtOsW3rqkiMUC8PKnlwZW9TDEMpLem4DBfrRwvQ61PLnKg7q4KtiAy0TTtqWg0jTFzMn01kjGFtX4vvjVZuW0qJTPVtNIwxzNzuNMlF6uWudsZgLaQuBW0qLrNJLEweDDdplgkAa5TrAgxbKa5MyoznTp4K7Ez3zhhqZMxHTT4Mvkl37LXr0KmILF+NrIsA6oCvMexUT3Pvw69sjjowffAyLEPsEgfMZOMqmkwB5iuptgv7K8KuKyxFaBZupe2P7G2u6a2ma6ELwc0lbEeL6I0LiwKLrM0+SY1O3Q5ITdVsJ+zSTN8vOe0iCZBI8cwTjFAKo0vYiyhrCOtvLB6rAgyfTb8tHoc3KuOrycy1TSSONswxrWINDOx2bfWLFavDrH8MsY3G7QppUM7JsAuOOs8o79nozyvW6+3LLcwZSLboPKrKCtbJH+ovKt6tV0q8rKWtFut2rgSNaozfrzPvIg0HTXvJLkvViTFL4ktfjDbNlsmIbUYKXc1ka/ROE23XLVmKlQxkat9MeK0MrAtN982nrgRMa43YjIXtUU276BsuD24Izj4OXMwQLSLLl0x8zI+uJ4xp6tctGg0RDBsNyM0VrpHuRO71DEmKaq8tadjIzy0lb0krDu6QTZxsBKudbkrtIc4GjiHORQ9Bjf3Nzk6PzVQOWk8o6gHIxsuXS0tMoczyTLoJoE1kDSdOC07MzO/N5w4XCihM+c4sDJBLls4EjFVrJ4y/jScMKsxc67nqHa2h7OcsvG3TbdbtpG47rNnNOA0aDBGNnQ1KjH9N1I3/LQ/uGS8brktur+6ji3mta6pir0Ruti9qLLMLla637i7tuS7PbESrfkf2TRVKBEsgTYppricvSxdqDkgOawrKqIoHqzmqOQpGStPKkqpIiQVodasTKhUKAgq9zEhJE6vRi1PKZUrfCyZLkk2Oi5BMewsW7FBsWk0JC+BrkouNq5VqZayjiqpLauwtTDnLtKzobX7IqOmkKfUKuUr+LaWsiuyHqupNLCt9KpBL6Om1zEUM+g2fLvgtvq8YblxuSy5I7gGtLK8UD0KOaA6jqwaLbo87zE+uZ6p5Tdnscu3TTZnMd80/y1Kp7q4HDFHMIWTSjKPLQgxR7ImtPC0DrPiNpMxz7mjtPazZLIvr2QuzTimLj23wjT1phOylzpkpU+04bsNuwc5n7kau52+p7T4uku5TbF0Phw+qzF5PPM70SkuP0M+pyYCKpcxFyBerTgzCK51sSg0VCi6KampeiwHIIYpZirjKU0jKqUjH/weJKAnI14rr6sapGgsHTPsM4Kw96J2NEC0NzYpNa223zYMN2s2I7TeMxStDyfENpMx6TBTsiuzDTEHrUYoJaXBrHirNi4PNQ46obXxuGOwSilyqPY4RLVetoe0dzDtsN21ArFet4+04z7PNBU30DsLuW67QDAuvCS8BbgtMfY277XitNkzaB9WLho5kzccs2+4ZTkBKyG2lzufMGW3Aj0OprcxQDi9tjukFTeYtW4vCLyhNni8lrqLPCa4OLwPNGq9IDUotpG7p6gmuZe8HDVNukq7xLzwpJK8kDDEO2M0Z7AGOJIx6r1DJHG0W7TgOcMwr7lRNQCy6yrjKqO3p6mYLs2zKCBsKl+xPqxPongsKKQuK3MsbiwUrdsrVCgqJv8s6yq5pBAiYyRnrGylCLUcsKm3K6mKrlG00zZhNxAxrTTtI
jqwjikDsKI1lCyqs24h3ahiKTa3jbCRsO22ECQCr963R7LqNOqqva5jN1st9ykBOHgf+bLKmWiwFbM1pXOxCzi/q8M1ni+ssbAwwzTgMIk3kjwCPFY9MbFXo3C09bSksnK0mK5hsCesTbTVtaS07DTEpkYoCTydOlI89K0ltYeyJSymrHusIDhZNkA2/zBFMymwBLE/rBmxzLrVuTG79661ItEiNCxTsBs2cjqIMu81ma4VN4829rDzp6YwzLtavGO6AbtIMZ63C7u1LO+5w76EuQ693Le6Mlom3LT1NIqi8rJFNtKliSjXpxefyiIOKE8fnSj0KpusFSCWqCklBiQjKeMj5ayBnYYphiUiqFsw4i8CtHU0dqL2LEY1DSl4M5g2GbZosbUwyTFFMyy027SQri+u9pXuMTmvpbDYr0Gxf6iOJxyseaWKsM4jErSosJit4qsrLHmyEjA2I+MyBCkLOJI0PzNYMT41OTRSMOwx0jUIo80wcTFzpDIsAzMFtm20ALNms8mrIalPtDexMLKCNqguODZENfQukzQ7NKMJwi7CKzyzD7bWNKUrd6SGN641bzTyMd6xjRwYrM6z67P+MUKchZvgJ8+2C6pHsh+48a8PrWm1IbRdsD2wuK/Nr8eva7J1rae2t7ObLr4dca47q6Wls6b5rGoqOS68r8mxW7boIPWvwbLToXCuwbDcNXk0MTOtMwgr8ShONF8x5TF5tp+uTLZQtYevpLRutIeW+65GpysyOzY1tZenHiiZt0m1CrUkr/8uYZIMMDU1/jKrtIUlmSsnnDw21yp6L0c44DD0L5k1MTNBMJow5jH/MbYyPC96LRI2XjUyq28mvRpBrL6l2a4GJRWWS65srz2yGraAKqSyh7F8qmGhIbLmNTo0BzPWMjkurShmNN8wpzFbtn2uL7ZOtcGu1bSltEWewK/4rI0ycDaYtB2qkZ0zt4m1b7RFsykvCCxEMQw2NDPFtGoxvq7BJas2iR4iL8k30zEJL6k14jMbLnwvDi0JLaIvHy/6KuA0bTRFI0gtqyxloqitLRjaKbYk8izCNeo1LzQyOEE3NjcMNtU35DXXLwAz4jCJOAY6njTkOEQ43zA1Ns43ITfhN8A3BDjrNGs2zTUBM7w1mTT8Nh44yTc7Nl03TDcNNc4zdTXQMgU0KjO8NIUyZTQ4OXU4TzjcOPM3xDjLOEQ5wThuMhwphTHFJO4xQDNVL0kxOy8wLzgiIas2LlEvay2YHyittiW9sto0jjFhOw==';
+const CNN_V3_FILM_MLP_B64='3JR3PmW94L1BYem+rRCuvtBlqz0Wsa49ZSGRPVixHjxR64y+EqMcvtQAMjypcGE+37fhvtL8lD7HpeW++M6cPvIjYr4f1j8+OWUovpNyqr6x5VE9cBP4POZUSzxw/bg83+JnPX5qAr8zwDK+TzKQPWkdmT5fmsC+ZnXMvi8piz3DTba+DqKNviGjvb7BAyW+y7ajvUgKzbyCNrK96745vj5/p70s9GC9i6GKvKQiCr3Z4c298O8FvxLSTb5I0Qo+HuiqvcieCT6DHGy8sd7GuwdUx7s5zA68eC27u48Lv71xkLq+HtTNvsV13b5meBE+O0eAPHpPmTzOJY088l9vPDUwrjyf0MG+NX4dvseuz77AXnI93E3uPSkEYL6XE5O+K95EPqzOVL6lAeq9yk39vCEhRr4QcIi+KhVZvogH870TRLG+acdtvvqzB7/jigO//eUMP+Lzr74MwKq80fz9vHYujD6E6K2+IwUXP+3jjr4KnVQ++F1cvvz9Qb6snIq7lurPPKbmCj6ZqnA+t5s7vgDEsj6NZaM+rq89PvmRhD7JNWM+GUGHPVz1ijziwBA+hMBIPVA+trwvHM89wFALPmqfyz1bBzs+s4BAvhyRur2zA7o+ahQPvSyNfbsT5E09+WJ9Pkl/+L1LkoQ+fqxZvjvHFz4AcIe5KRgQPqFuJj5eOrQ+Sc2WPsEWi73F3BS+lgMtP0H/AD74d8O7+PNpPt7zGz+GB3e+VmggPxhtUD7bZgA/MvQ6PiZdTL4079s+MJFfvdSUmb2MaRE+PHRXPmQl0L0a2jS+EJsUPVj5r71eclY+igMBPiCd3b1qwy++HHcjvrCyBD6wM3C9yjhpPsmzBL2gpqw8xE5qPm9A/z1jkBA++eOSPe/oHT3QBTu9P7UMPgoWXT7B9W8+Bv9MvrdjSTxANMS7xJgzPvKzlz3uup09gGQTPuom+z0AxP09XsYNvmSFbLxv+3U+1wZzvlnPCb45cGU9ujnsviDN/zwg8+69KMKqPRl+hz6ddUG+6VukvfT1ID5rWR++w0nmvd7YI7+J4RO9T5ZuvQWiCr43WRO/JIsSPkaEJL/oZAS9wH0qv+pOLT7qoni+M9oSv4Dx4TuAkyW7mEFYPZhRib3QUqQ9hPVJvrCL5TwA3gC97EwGPhSV8L3k4x2+gMNXPJaPD77Q5y29pCWUvaS0r71+GIs9ECCDPpNcYjwyNvU8fwwRv8zjb74fQzE+/BFVvg3a/L5xX0Y++qYCv3ALgb1e4SS/QA19vh8MzL3fI4q+WBK9PZaOI75gG0i+TBvOvcINGT7gwTE83Ep8vg17e760kWs+UodWPpZrdD5Ms+49sENrPsYZKj76mFK+GAtyu1AEUz7baU0+gWfbPetWFTxZCoC9SA1YvnoDlj2HIvu91GgMvhyvxz1W+D4+GApiPabLO7wA5Ko644UCvgxTGz3LnI27LQgRvrnhw7y/McS9rXb0PkKb072Wi0a+F9PdPba+Az/hZR6+/8n4PnKPfj6Vx+c+8EacPIonVb4HE9k+PWtqvRQXJ760rxc9XHonvrSjiL7Y6Wk+X78oPXDCGT67Nny++E5iPXdDHT1sqDi+B4ItvoAH3r2zzdq85XhePIZolj5J0hY+s/isPaatrb7AdNs+8r+2vd8pDD6qzMM9eouUPsOeMz3BcII+FvpZPiF1pj6WdUG+j/devalgoTypoga9c0LEPc3fur2xte29np00viIJ5bzvYS49eWp6viSBDr6IYUe9SiOpvoBpdr3doXK+wDKou59EljxBZdC9oQC+PFTUWz0YBpY+T+5gvDaGIT+RZBs84iqJPiO15z7wWw8/nFHsvdvGID+Q2FE9OojpPqxLqD02EkS+dbfNPtnUlz2Ukgi+AiD9PPMD+z2V2i49j6osvt2EcD5sotK9a/FvvWeJ1j2C5vM9cPZ0vWTyQz72hWM+X9dMPa7bQrwyUZQ9GB83
PfxbHD4g9FU+dX8XPjALx7yEcb+9up8MvhaPkD3QiYc91xN6vR4cT76kCnu+Ws0VvmA9W7yv4G89ZvsgPiVaor2ok/k9BIWJPZ5+7rno6QM+fbTrvKHqZj3NOOy8MI0HPXi5VL7iUmQ+KlTCvRSdSb7/IpU8fpXYvZHMgT2K2Us+UShZPo5z/70zGWA897+RvtLUiL4dUoi9n2wGvjaqzzznj/Q9UE0DvtqEub2WgXu++SI3PoQMk76Qa7E9PZeHvEyiXD5pKze+zpu/PepKg75irr286FJiPhIvw7tmUfk9nmWhPvCF5b2dbwU+KgwYPr6Egr5wVR6+Q0mBPoJx37xmKsg7nEbsvR+DJb0fjcw8ZlI7vTQInDw5elu+z1rWvRbWw72wLds8Qa3xvciA37008EG8EFEcvpXUKz7zSxw9tTgLPtwUgL2By/Q+Zv2+PQGEa75hnzk+mOKSPumBOj7TR6k9EBtfPuuk5z4kzPo95mqtvduNIz5nbhQ8WXzxPfMPXj6bNCU+43hzPmlAxr1LbM298vepPXV0CTyGzA4+uuxMvQIuXr4wo4Y+gG/9O73WCT6+VY29QlN9Pcs5Pj60l129snY8vjA9jD6cniU+2xM+vuAR9zshZ9Q9DufPPXYA3T3mliM+hMWzvZxOX76a5wQ+z5MvvT7p0r1+YLA98dwXPoI7ED5IrOy+V2TKPfAEEr7pdiy+rWIkvu7Z9L2RqRG+QBKju/A1Fr6QulK+rIGmvkLYHT6WOQa+4hGuvRn3o7u3Xy490raePrChM75vM20+B3g0vWQsMz5wWe69eI1IPgDCszoM0eA9aChGvvWVtb2gl/49XO2APacEGT57JqQ8mm7yPQmfij7BtZ4997PLPU49RT6skDs+6cy5va3cT72YRT69OqFpPpBEET1Ba/q9pWadPG+tVj7+1T8+0OgJvqrNSDy4bZE+90RAPljRlT6BR7Q+SKUhPoFsMz0BgVU9eDulve35Oz6Aksi7mSxxvWnAFD44KjU8FoCRvlp8Mb4+d/Q4mNqnPce5yTyQvcs+hfypPc51Ar6/P+Q92RvPPQBLmjuyIsG+eEwnvikEjT7Dtq6+O61Fvu3Ohrz7FRU+FzGvvYL5nj5chL+9eQcRvsfvhz0de4k+lwiNvuAUMj7GkTg+5JKKPeASCT7EhLy7Q++5PVd2o70jiaO9HvEzPZ/UK74CcQE/7tV3vnItBbyarB8+7oEcPvcT/L0UN/8+5ODOPZ/Tpz7gIQG+O1sZPpgroj4sATw9zVYjvvjOVD59ezc+9p8aviKffD1qmzA8R7zGvaI9rr0L/6g9hefZvTgVY77bpta9LMzuvcA/eD43miY+nTOWOa53Nb684CC+rLrmvXRhVT4IuxC+DNO0PYknQr27t/U9Y0PvPRpzCz7kGdQ9ql7BPcBAiLto9qe9HrjlvXb5T7335Zy9vFgju8NHVb5fvWo+GRQnvoaXXD3Vtiw9nonVPeokEbyDXYc+YLW7PJdgAz6isls+Qk7GveS5xr1Gits+cohJvXRoeTuwqR2+3wgHP+tNBr0B4GG9HkO3PhHJaj8QvZU9lbHfPmwI7j1ZgGI/WPXrvZutJz21hbI+N+YEPezs6bvmUFY+dfZbPd/Rnz7izxK+GquBPbJHZb0HPzs+jrptPVg8Aj8wAVi9N6jgPv5Jbr6MM00+MqIxPrwAZz0oERA+fEoLvudJH71ysgY/vhOYPaEPDr7asVQ+D8quPqWcmj1ewQg/Rqc7vrgRtz5gSTc+/Km0vEqH4j3BZ6M97evKvTmcJb5Htp492/KfPsn4vj2lx069ZroOPqXGXz74XFY7cRZWPiJgdz56er4+MAZfvg6wE71r3yc7uwqFPs7Ny71CA4o+k+GLPs7aej2b4789izC8vfOUjz5D2OU+RhUgvhVKgT5ak20+yOWBPjh0Q73A3qS9qYawvbt4YT7O1dI+mQoQP0LcDr6YwiY+yi7vvVpA8b78DAi+4jt7
v5U5P75FGvu9lpeTPnhZ/r7zTIg+Hx+Avj7xOD/oW5o+D35evsYRHr5BbGi+3fQnPX3iyT2Ob3s+T6bzPUKchD5O6d6+IxjNPjWTW7wa0vk+n14dvhmT5z5tW/Y+8yOQvoRtZD6JwbU93pZLP1mnjz4C1bs9VUXBPqtl7z0=';
diff --git a/cnn_v3/training/cnn_v3_utils.py b/cnn_v3/training/cnn_v3_utils.py
index bef4091..50707a2 100644
--- a/cnn_v3/training/cnn_v3_utils.py
+++ b/cnn_v3/training/cnn_v3_utils.py
@@ -286,7 +286,8 @@ class CNNv3Dataset(Dataset):
                  channel_dropout_p: float = 0.3,
                  detector: str = 'harris',
                  augment: bool = True,
-                 patch_search_window: int = 0):
+                 patch_search_window: int = 0,
+                 single_sample: str = ''):
         self.patch_size          = patch_size
         self.patches_per_image   = patches_per_image
         self.image_size          = image_size
@@ -296,16 +297,18 @@ class CNNv3Dataset(Dataset):
         self.augment             = augment
         self.patch_search_window = patch_search_window
 
-        root       = Path(dataset_dir)
-        subdir     = 'full' if input_mode == 'full' else 'simple'
-        search_dir = root / subdir
-        if not search_dir.exists():
-            search_dir = root
-
-        self.samples = sorted([
-            d for d in search_dir.iterdir()
-            if d.is_dir() and (d / 'albedo.png').exists()
-        ])
+        if single_sample:
+            self.samples = [Path(single_sample)]
+        else:
+            root       = Path(dataset_dir)
+            subdir     = 'full' if input_mode == 'full' else 'simple'
+            search_dir = root / subdir
+            if not search_dir.exists():
+                search_dir = root
+            self.samples = sorted([
+                d for d in search_dir.iterdir()
+                if d.is_dir() and (d / 'albedo.png').exists()
+            ])
         if not self.samples:
             raise RuntimeError(f"No samples found in {search_dir}")
 
diff --git a/cnn_v3/training/export_cnn_v3_weights.py b/cnn_v3/training/export_cnn_v3_weights.py
index 99f3a81..edf76e2 100644
--- a/cnn_v3/training/export_cnn_v3_weights.py
+++ b/cnn_v3/training/export_cnn_v3_weights.py
@@ -31,6 +31,7 @@ Usage
 """
 
 import argparse
+import base64
 import struct
 import sys
 from pathlib import Path
@@ -158,13 +159,40 @@ def export_weights(checkpoint_path: str, output_dir: str) -> None:
     print(f"\nDone → {out}/")
 
 
+_WEIGHTS_JS_DEFAULT = Path(__file__).parent.parent / 'tools' / 'weights.js'
+
+
+def update_weights_js(weights_bin: Path, film_mlp_bin: Path,
+                      js_path: Path = _WEIGHTS_JS_DEFAULT) -> None:
+    """Base64-encode both .bin files into a JS constants file at js_path (written as UTF-8)."""
+    w_b64 = base64.b64encode(weights_bin.read_bytes()).decode('ascii')
+    f_b64 = base64.b64encode(film_mlp_bin.read_bytes()).decode('ascii')
+    js_path.write_text(
+        "'use strict';\n"
+        "// Auto-generated by export_cnn_v3_weights.py --html  — do not edit by hand.\n"
+        f"const CNN_V3_WEIGHTS_B64='{w_b64}';\n"
+        f"const CNN_V3_FILM_MLP_B64='{f_b64}';\n",
+        encoding='utf-8')  # header has a non-ASCII dash; do not depend on the locale codec
+    print(f"\nweights.js  →  {js_path}")
+    print(f"  CNN_V3_WEIGHTS_B64   {len(w_b64)} chars  ({weights_bin.stat().st_size} bytes)")
+    print(f"  CNN_V3_FILM_MLP_B64  {len(f_b64)} chars  ({film_mlp_bin.stat().st_size} bytes)")
+
+
 def main() -> None:
     p = argparse.ArgumentParser(description='Export CNN v3 trained weights to .bin')
     p.add_argument('checkpoint', help='Path to .pth checkpoint file')
     p.add_argument('--output', default='export',
                    help='Output directory (default: export/)')
+    p.add_argument('--html', action='store_true',
+                   help=f'Also update {_WEIGHTS_JS_DEFAULT} with base64-encoded weights')
+    p.add_argument('--html-output', default=None, metavar='PATH',
+                   help='Override default weights.js path (implies --html)')
     args = p.parse_args()
     export_weights(args.checkpoint, args.output)
+    if args.html or args.html_output:
+        out = Path(args.output)
+        js_path = Path(args.html_output) if args.html_output else _WEIGHTS_JS_DEFAULT
+        update_weights_js(out / 'cnn_v3_weights.bin', out / 'cnn_v3_film_mlp.bin', js_path)
 
 
 if __name__ == '__main__':
diff --git a/cnn_v3/training/infer_cnn_v3.py b/cnn_v3/training/infer_cnn_v3.py
new file mode 100644
index 0000000..ca1c72a
--- /dev/null
+++ b/cnn_v3/training/infer_cnn_v3.py
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.10"
+# dependencies = ["torch", "numpy", "pillow", "opencv-python"]
+# ///
+"""CNN v3 PyTorch inference — compare with cnn_test (WGSL/GPU output).
+
+Simple mode (single PNG):  albedo = photo, geometry channels zeroed.
+Full mode (sample dir):    loads all G-buffer files via assemble_features.
+
+Usage:
+  python3 infer_cnn_v3.py photo.png out.png --checkpoint checkpoints/ckpt.pth
+  python3 infer_cnn_v3.py sample_000/ out.png --checkpoint ckpt.pth
+  python3 infer_cnn_v3.py photo.png out.png --checkpoint ckpt.pth --identity-film
+  python3 infer_cnn_v3.py photo.png out.png --checkpoint ckpt.pth --cond 0.5 0.0 0.8 0.0 0.0
+"""
+
+import argparse
+import sys
+from pathlib import Path
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from PIL import Image
+
+sys.path.insert(0, str(Path(__file__).parent))
+from train_cnn_v3 import CNNv3
+from cnn_v3_utils import assemble_features, load_rgb, load_rg, load_depth16, load_gray
+
+
+# ---------------------------------------------------------------------------
+# Feature loading
+# ---------------------------------------------------------------------------
+
+def load_sample_dir(sample_dir: Path) -> np.ndarray:
+    """Build the (H,W,20) f32 feature array from one sample directory's G-buffer PNGs."""
+    # Loaders run in the same order as the assemble_features arguments.
+    albedo = load_rgb(sample_dir / 'albedo.png')
+    normal = load_rg(sample_dir / 'normal.png')
+    depth  = load_depth16(sample_dir / 'depth.png')
+    matid  = load_gray(sample_dir / 'matid.png')
+    shadow = load_gray(sample_dir / 'shadow.png')
+    transp = load_gray(sample_dir / 'transp.png')
+    return assemble_features(albedo, normal, depth, matid, shadow, transp)
+
+
+def load_simple(image_path: Path) -> np.ndarray:
+    """Turn a plain photo into the (H,W,20) f32 feature array.
+
+    Only albedo carries image data; every other input is a constant plane:
+    normal=(0.5,0.5), depth=0, matid=0, shadow=1.0 (fully lit), transp=0.0 (opaque).
+    """
+    photo = load_rgb(image_path)
+    plane = photo.shape[:2]  # (H, W)
+    # Three independent zero planes: depth, material id, transparency.
+    depth, matid, transp = (np.zeros(plane, dtype=np.float32) for _ in range(3))
+    flat_normal = np.full((*plane, 2), 0.5, dtype=np.float32)
+    lit = np.ones(plane, dtype=np.float32)
+    return assemble_features(photo, flat_normal, depth, matid, lit, transp)
+
+
+
+# ---------------------------------------------------------------------------
+# Inference
+# ---------------------------------------------------------------------------
+
+def pad_to_multiple(feat: np.ndarray, m: int = 4) -> tuple:
+    """Zero-pad (H,W,C) at bottom/right so H and W divide by m; returns (array, (ph, pw))."""
+    rows, cols = feat.shape[:2]
+    ph, pw = -rows % m, -cols % m  # rows/cols to append; 0 when already aligned
+    if (ph, pw) == (0, 0):
+        return feat, (0, 0)  # already aligned: hand back the same array object
+    widths = ((0, ph), (0, pw), (0, 0))
+    return np.pad(feat, widths), (ph, pw)
+
+
+def run_identity_film(model: CNNv3, feat: torch.Tensor) -> torch.Tensor:
+    """Forward with identity FiLM (γ=1, β=0). Matches C++ cnn_test default.
+
+    Calls the model's enc0, enc1, bottleneck, dec1 and dec0 sub-modules directly,
+    with ReLU after each and a final sigmoid; no conditioning input is used.
+    """
+    skip0 = F.relu(model.enc0(feat))
+
+    x     = F.avg_pool2d(skip0, 2)
+    skip1 = F.relu(model.enc1(x))
+
+    x = F.relu(model.bottleneck(F.avg_pool2d(skip1, 2)))
+
+    x = F.relu(model.dec1(
+        torch.cat([F.interpolate(x, scale_factor=2, mode='nearest'), skip1], dim=1)
+    ))
+
+    x = F.relu(model.dec0(
+        torch.cat([F.interpolate(x, scale_factor=2, mode='nearest'), skip0], dim=1)
+    ))
+
+    return torch.sigmoid(x)
+
+
+# ---------------------------------------------------------------------------
+# Output helpers
+# ---------------------------------------------------------------------------
+
+def save_png(path: Path, out: np.ndarray) -> None:
+    """Write a float (H,W,4) RGBA image as 8-bit PNG: clamp to [0,1], scale by 255, round."""
+    quantized = np.clip(out, 0.0, 1.0) * 255.0 + 0.5  # +0.5 so the uint8 cast rounds
+    Image.fromarray(quantized.astype(np.uint8), 'RGBA').save(path)
+
+
+def print_debug_hex(out: np.ndarray, n: int = 8) -> None:
+    """Dump the leading n RGBA pixels, each as an 8-digit hex word plus its float values."""
+    rows = out.reshape(-1, 4)
+    for i, (r, g, b, a) in enumerate(rows[:max(n, 0)]):
+        # Nearest 8-bit value per channel, for the hex column.
+        ri, gi, bi, ai = (int(c*255+.5) for c in (r, g, b, a))
+        print(f'  [{i}] 0x{ri:02X}{gi:02X}{bi:02X}{ai:02X}'
+              f'  ({r:.4f} {g:.4f} {b:.4f} {a:.4f})')
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main():
+    """CLI entry point: load features, run the checkpointed model, write the output PNG."""
+    p = argparse.ArgumentParser(description='CNN v3 PyTorch inference')
+    p.add_argument('input',  help='Input PNG or sample directory')
+    p.add_argument('output', help='Output PNG')
+    p.add_argument('--checkpoint', '-c', metavar='CKPT',
+                   help='Path to .pth checkpoint (auto-finds latest if omitted)')
+    p.add_argument('--enc-channels', default='4,8',
+                   help='Encoder channels (default: 4,8 — must match checkpoint)')
+    p.add_argument('--cond', nargs=5, type=float, metavar='F', default=[0.0]*5,
+                   help='FiLM conditioning: 5 floats (beat_phase beat_norm audio style0 style1)')
+    p.add_argument('--identity-film', action='store_true',
+                   help='Bypass FiLM MLP, use γ=1 β=0 (matches C++ cnn_test default)')
+    p.add_argument('--blend', type=float, default=1.0,
+                   help='Blend with input albedo: 0=input 1=CNN (default 1.0)')
+    p.add_argument('--debug-hex', action='store_true',
+                   help='Print first 8 output pixels as hex')
+    args = p.parse_args()
+
+    # --- Feature loading ---
+    inp = Path(args.input)
+    if inp.is_dir():
+        print(f'Mode: full  ({inp})')
+        feat = load_sample_dir(inp)
+        albedo_rgb = load_rgb(inp / 'albedo.png')
+    else:
+        print(f'Mode: simple  ({inp})')
+        feat = load_simple(inp)
+        albedo_rgb = load_rgb(inp)
+    orig_h, orig_w = feat.shape[:2]
+
+    feat_padded, (ph, pw) = pad_to_multiple(feat, 4)
+    H, W = feat_padded.shape[:2]
+    if ph or pw:
+        print(f'Padded {orig_w}×{orig_h} → {W}×{H}')
+    else:
+        print(f'Resolution: {W}×{H}')
+
+    # --- Load checkpoint ---
+    if args.checkpoint:
+        ckpt_path = Path(args.checkpoint)
+    else:
+        ckpts = sorted(Path('checkpoints').glob('checkpoint_epoch_*.pth'),
+                       key=lambda f: int(f.stem.split('_')[-1]))
+        if not ckpts:
+            print('Error: no checkpoint found; use --checkpoint', file=sys.stderr)
+            sys.exit(1)
+        ckpt_path = ckpts[-1]
+    print(f'Checkpoint: {ckpt_path}')
+
+    ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=False)  # NOTE: full unpickle; trusted files only
+    cfg  = ckpt.get('config', {})
+    enc_channels  = cfg.get('enc_channels',  [int(c) for c in args.enc_channels.split(',')])
+    film_cond_dim = cfg.get('film_cond_dim', 5)
+    print(f'Architecture: enc={enc_channels}  film_cond_dim={film_cond_dim}')
+
+    model = CNNv3(enc_channels=enc_channels, film_cond_dim=film_cond_dim)
+    model.load_state_dict(ckpt['model_state_dict'])
+    model.eval()
+
+    # --- Inference ---
+    feat_t = torch.from_numpy(feat_padded).permute(2, 0, 1).unsqueeze(0)  # (1,20,H,W)
+    cond_t = torch.tensor([args.cond], dtype=torch.float32)               # (1,5)
+
+    with torch.no_grad():
+        if args.identity_film:
+            print('FiLM: identity (γ=1, β=0)')
+            out_t = run_identity_film(model, feat_t)
+        else:
+            print(f'FiLM cond: {args.cond}')
+            out_t = model(feat_t, cond_t)
+
+    # (1,4,H,W) → crop padding → (orig_h, orig_w, 4)
+    out = out_t[0].permute(1, 2, 0).numpy()[:orig_h, :orig_w, :]
+
+    # Optional blend with albedo
+    if args.blend < 1.0:
+        # Albedo is RGB: append an opaque alpha plane so it can mix with the RGBA output.
+        ones = np.ones((orig_h, orig_w, 1), dtype=np.float32)
+        src_rgba = np.concatenate([albedo_rgb[:orig_h, :orig_w], ones], axis=-1)
+        out = src_rgba * (1.0 - args.blend) + out * args.blend
+
+    # --- Save ---
+    out_path = Path(args.output)
+    save_png(out_path, out)
+    print(f'Saved: {out_path}')
+
+    if args.debug_hex:
+        print('First 8 output pixels (RGBA):')
+        print_debug_hex(out)
+
+if __name__ == '__main__':
+    main()
diff --git a/cnn_v3/training/train_cnn_v3.py b/cnn_v3/training/train_cnn_v3.py
index de10d6a..31cfd9d 100644
--- a/cnn_v3/training/train_cnn_v3.py
+++ b/cnn_v3/training/train_cnn_v3.py
@@ -104,6 +104,10 @@ def train(args):
     enc_channels = [int(c) for c in args.enc_channels.split(',')]
     print(f"Device: {device}")
 
+    if args.single_sample:
+        args.full_image  = True
+        args.batch_size  = 1
+
     dataset = CNNv3Dataset(
         dataset_dir=args.input,
         input_mode=args.input_mode,
@@ -115,6 +119,7 @@ def train(args):
         detector=args.detector,
         augment=True,
         patch_search_window=args.patch_search_window,
+        single_sample=args.single_sample,
     )
     loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True,
                         num_workers=0, drop_last=False)
@@ -222,6 +227,8 @@ def main():
     p = argparse.ArgumentParser(description='Train CNN v3 (U-Net + FiLM)')
 
     # Dataset
+    p.add_argument('--single-sample', default='', metavar='DIR',
+                   help='Train on a single sample directory; implies --full-image and --batch-size 1')
     p.add_argument('--input', default='training/dataset',
                    help='Dataset root (contains full/ or simple/ subdirs)')
     p.add_argument('--input-mode', default='simple', choices=['simple', 'full'],
diff --git a/tools/cnn_test.cc b/tools/cnn_test.cc
index e5e2d26..beeef8f 100644
--- a/tools/cnn_test.cc
+++ b/tools/cnn_test.cc
@@ -1,21 +1,16 @@
-// CNN shader testing tool for offline validation
-// Tests trained CNN shaders on input PNG with GPU readback
+// CNN v3 shader testing tool — offline WGSL inference for Python parity checks.
+// Loads an input PNG (or sample directory), packs 20-channel features, runs the
+// CNNv3Effect (5 compute passes), and saves the RGBA16Float output as PNG.
 
 #if defined(STRIP_ALL)
 #error "cnn_test requires STRIP_ALL=OFF (tool builds only)"
 #endif
 
-#include "effects/shaders.h"
+#include "cnn_v3_effect.h"
 #include "generated/assets.h"
-#include "gpu/bind_group_builder.h"
 #include "gpu/gpu.h"
-#include "gpu/pipeline_builder.h"
-#include "gpu/post_process_helper.h"
-#include "gpu/sampler_cache.h"
+#include "gpu/sequence.h"
 #include "gpu/shader_composer.h"
-#include "gpu/texture_readback.h"
-#include "platform/platform.h"
-#include "tests/common/offscreen_render_target.h"
 #include "tests/common/webgpu_test_fixture.h"
 #include "util/asset_manager.h"
 #include "util/mini_math.h"
@@ -27,1551 +22,638 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <string>
 #include <vector>
 
-// CNN v1 structures
-struct CNNv1LayerParams {
-  int layer_index;
-  float blend_amount;
-  float _pad[2];
-};
-static_assert(sizeof(CNNv1LayerParams) == 16);
+// ---------------------------------------------------------------------------
+// F16 / pack helpers (match WGSL pack2x16float / pack4x8unorm)
+// ---------------------------------------------------------------------------
 
-// Helper to get asset string or empty string
-static const char* SafeGetAsset(AssetId id) {
-  const uint8_t* data = GetAsset(id);
-  return data ? (const char*)data : "";
+static uint16_t f32_to_f16(float f) {
+  uint32_t b;
+  memcpy(&b, &f, 4);                    // reinterpret the float's bit pattern
+  uint32_t sign = (b >> 16) & 0x8000u;  // f32 sign bit moved to f16 bit 15
+  int32_t  exp  = (int32_t)((b >> 23) & 0xFFu) - 127 + 15;  // rebias 8-bit -> 5-bit exponent
+  uint32_t mant = b & 0x7FFFFFu;
+  if (exp <= 0)  return (uint16_t)sign;  // below the f16 normal range: flush to signed zero
+  if (exp >= 31) return (uint16_t)(sign | 0x7C00u | ((exp == 143 && mant) ? 0x200u : 0u));  // Inf; NaN stays NaN
+  return (uint16_t)(sign | ((uint32_t)exp << 10) | (mant >> 13));  // truncates the low 13 mantissa bits
 }
 
-// Command-line arguments
-struct Args {
-  const char* input_path = nullptr;
-  const char* output_path = nullptr;
-  float blend = 1.0f;
-  bool output_png = true; // Default to PNG
-  const char* save_intermediates = nullptr;
-  int num_layers = 3;                 // Default to 3 layers
-  bool debug_hex = false;             // Print first 8 pixels as hex
-  int cnn_version = 1;                // 1=CNNEffect, 2=CNNv2Effect
-  const char* weights_path = nullptr; // Optional .bin weights file
-  bool cnn_version_explicit =
-      false; // Track if --cnn-version was explicitly set
-};
-
-// Parse command-line arguments
-static bool parse_args(int argc, char** argv, Args* args) {
-  if (argc < 3) {
-    return false;
-  }
-
-  args->input_path = argv[1];
-  args->output_path = argv[2];
-
-  for (int i = 3; i < argc; ++i) {
-    if (strcmp(argv[i], "--blend") == 0 && i + 1 < argc) {
-      args->blend = atof(argv[++i]);
-      if (args->blend < 0.0f || args->blend > 1.0f) {
-        fprintf(stderr, "Error: blend must be in range [0.0, 1.0]\n");
-        return false;
-      }
-    } else if (strcmp(argv[i], "--format") == 0 && i + 1 < argc) {
-      ++i;
-      if (strcmp(argv[i], "ppm") == 0) {
-        args->output_png = false;
-      } else if (strcmp(argv[i], "png") == 0) {
-        args->output_png = true;
-      } else {
-        fprintf(stderr, "Error: unknown format '%s' (use 'png' or 'ppm')\n",
-                argv[i]);
-        return false;
-      }
-    } else if (strcmp(argv[i], "--save-intermediates") == 0 && i + 1 < argc) {
-      args->save_intermediates = argv[++i];
-    } else if (strcmp(argv[i], "--layers") == 0 && i + 1 < argc) {
-      args->num_layers = atoi(argv[++i]);
-      if (args->num_layers < 1 || args->num_layers > 10) {
-        fprintf(stderr, "Error: layers must be in range [1, 10]\n");
-        return false;
-      }
-    } else if (strcmp(argv[i], "--debug-hex") == 0) {
-      args->debug_hex = true;
-    } else if (strcmp(argv[i], "--cnn-version") == 0 && i + 1 < argc) {
-      args->cnn_version = atoi(argv[++i]);
-      args->cnn_version_explicit = true;
-      if (args->cnn_version < 1 || args->cnn_version > 2) {
-        fprintf(stderr, "Error: cnn-version must be 1 or 2\n");
-        return false;
-      }
-    } else if (strcmp(argv[i], "--weights") == 0 && i + 1 < argc) {
-      args->weights_path = argv[++i];
-    } else if (strcmp(argv[i], "--help") == 0) {
-      return false;
-    } else {
-      fprintf(stderr, "Error: unknown option '%s'\n", argv[i]);
-      return false;
-    }
-  }
-
-  // Force CNN v2 when --weights is specified
-  if (args->weights_path) {
-    if (args->cnn_version_explicit && args->cnn_version != 2) {
-      fprintf(stderr,
-              "WARNING: --cnn-version %d ignored (--weights forces CNN v2)\n",
-              args->cnn_version);
-    }
-    args->cnn_version = 2;
-
-    // Warn if --layers was specified (binary file config takes precedence)
-    if (args->num_layers != 3) { // 3 is the default
-      fprintf(stderr,
-              "WARNING: --layers %d ignored (--weights loads layer config from "
-              ".bin)\n",
-              args->num_layers);
-    }
-  }
-
-  return true;
+// Two f16 halves in one u32: a in bits 0-15, b in bits 16-31 (as WGSL pack2x16float(vec2f(a,b)))
+static uint32_t pack2x16f(float a, float b) {
+  return ((uint32_t)f32_to_f16(b) << 16) | (uint32_t)f32_to_f16(a);
 }
 
-// Print usage
-static void print_usage(const char* prog) {
-  fprintf(stderr, "Usage: %s input.png output.png [OPTIONS]\n", prog);
-  fprintf(stderr, "\nOPTIONS:\n");
-  fprintf(stderr,
-          "  --blend F                Final blend amount (0.0-1.0, default: "
-          "1.0)\n");
-  fprintf(stderr, "  --format ppm|png         Output format (default: png)\n");
-  fprintf(stderr,
-          "  --layers N               Number of CNN layers (1-10, default: 3, "
-          "ignored with --weights)\n");
-  fprintf(stderr,
-          "  --save-intermediates DIR Save intermediate layers to directory\n");
-  fprintf(stderr,
-          "  --debug-hex              Print first 8 pixels as hex (debug)\n");
-  fprintf(stderr,
-          "  --cnn-version N          CNN version: 1 (default) or 2 (ignored "
-          "with --weights)\n");
-  fprintf(stderr,
-          "  --weights PATH           Load weights from .bin (forces CNN v2, "
-          "overrides layer config)\n");
-  fprintf(stderr, "  --help                   Show this help\n");
+// Four floats quantised to u8 and packed a|b<<8|c<<16|d<<24 (matches WGSL pack4x8unorm)
+static uint32_t pack4x8u(float a, float b, float c, float d) {
+  auto u8 = [](float v) -> uint32_t {
+    // Clamp before the cast: float->int is undefined for NaN / out-of-range values.
+    if (!(v > 0.0f)) return 0u;  // also true for NaN
+    if (v >= 1.0f) return 255u;
+    return (uint32_t)(v * 255.0f + 0.5f);
+  };
+  return u8(a) | (u8(b) << 8) | (u8(c) << 16) | (u8(d) << 24);
 }
 
-// Load PNG and upload to GPU texture
-static WGPUTexture load_texture(WGPUDevice device, WGPUQueue queue,
-                                const char* path, int* out_width,
-                                int* out_height) {
-  int width, height, channels;
-  uint8_t* data = stbi_load(path, &width, &height, &channels, 4);
-  if (!data) {
-    fprintf(stderr, "Error: failed to load image '%s'\n", path);
-    return nullptr;
-  }
-
-  *out_width = width;
-  *out_height = height;
+// ---------------------------------------------------------------------------
+// Oct-decode [0,1] → unit normal (matches Python cnn_v3_utils.oct_decode)
+// ---------------------------------------------------------------------------
 
-  // Create texture
-  const WGPUTextureDescriptor texture_desc = {
-      .usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst |
-               WGPUTextureUsage_RenderAttachment,
-      .dimension = WGPUTextureDimension_2D,
-      .size = {(uint32_t)(width), (uint32_t)(height), 1},
-      .format = WGPUTextureFormat_BGRA8Unorm,
-      .mipLevelCount = 1,
-      .sampleCount = 1,
-  };
-  WGPUTexture texture = wgpuDeviceCreateTexture(device, &texture_desc);
-  if (!texture) {
-    fprintf(stderr, "Error: failed to create texture\n");
-    stbi_image_free(data);
-    return nullptr;
+static void oct_decode_01(float nx01, float ny01,
+                           float* out_x, float* out_y, float* out_z) {
+  float fx = nx01 * 2.0f - 1.0f;  // [0,1] -> [-1,1]
+  float fy = ny01 * 2.0f - 1.0f;
+  float fz = 1.0f - fabsf(fx) - fabsf(fy);
+  if (fz < 0.0f) {
+    // Lower hemisphere: unfold. The new fy must use the pre-fold fx, so stage the new fx.
+    const float ux = (1.0f - fabsf(fy)) * (fx >= 0.0f ? 1.0f : -1.0f);
+    fy = (1.0f - fabsf(fx)) * (fy >= 0.0f ? 1.0f : -1.0f);
+    fx = ux;
   }
+  float len = sqrtf(fx*fx + fy*fy + fz*fz);
+  if (len < 1e-8f) len = 1e-8f;  // avoid dividing by zero
+  *out_x = fx / len;
+  *out_y = fy / len;
+  *out_z = fz / len;
+}
 
-  // Convert RGBA → BGRA
-  std::vector<uint8_t> bgra_data(width * height * 4);
-  for (int i = 0; i < width * height; ++i) {
-    bgra_data[i * 4 + 0] = data[i * 4 + 2]; // B
-    bgra_data[i * 4 + 1] = data[i * 4 + 1]; // G
-    bgra_data[i * 4 + 2] = data[i * 4 + 0]; // R
-    bgra_data[i * 4 + 3] = data[i * 4 + 3]; // A
-  }
+// ---------------------------------------------------------------------------
+// Mip helpers — matching Python pyrdown + nearest-upsample
+// ---------------------------------------------------------------------------
 
-  // Upload to GPU
-  const WGPUTexelCopyTextureInfo dst = {.texture = texture, .mipLevel = 0};
-  const WGPUTexelCopyBufferLayout layout = {
-      .bytesPerRow = (uint32_t)(width * 4), .rowsPerImage = (uint32_t)(height)};
-  const WGPUExtent3D size = {(uint32_t)(width), (uint32_t)(height), 1};
-  wgpuQueueWriteTexture(queue, &dst, bgra_data.data(), bgra_data.size(),
-                        &layout, &size);
+// Compute mip1 and mip2 for each pixel using the Python convention:
+//   mip1_small[y2][x2] = avg(rgb[2y2..2y2+1][2x2..2x2+1])   (half-res)
+//   mip2_small[y4][x4] = avg(mip1[2y4..2y4+1][2x4..2x4+1])  (quarter-res)
+//   Nearest upsample: mip1[y][x] = mip1_small[y/2][x/2], etc.
+// Output: mip1_out and mip2_out are (H*W*3) float arrays in row-major order.
 
-  stbi_image_free(data);
-  return texture;
-}
+static void compute_mips(const float* rgb, int w, int h,
+                          std::vector<float>& mip1_out,
+                          std::vector<float>& mip2_out) {
+  const int w2 = w / 2, h2 = h / 2;  // half-res size (integer division, rounds down)
+  const int w4 = w / 4, h4 = h / 4;  // quarter-res size (integer division, rounds down)
 
-// Load PNG alpha channel as depth texture (or 1.0 if no alpha)
-static WGPUTexture load_depth_from_alpha(WGPUDevice device, WGPUQueue queue,
-                                         const char* path, int width,
-                                         int height) {
-  int w, h, channels;
-  uint8_t* data = stbi_load(path, &w, &h, &channels, 4);
-  if (!data || w != width || h != height) {
-    fprintf(stderr, "Error: failed to load depth from '%s'\n", path);
-    if (data)
-      stbi_image_free(data);
-    return nullptr;
+  std::vector<float> m1(w2 * h2 * 3);  // half-res: 2x2 box average of rgb
+  for (int y2 = 0; y2 < h2; ++y2) {
+    for (int x2 = 0; x2 < w2; ++x2) {
+      for (int c = 0; c < 3; ++c) {
+        int y0 = y2 * 2, x0 = x2 * 2;  // top-left source pixel of the 2x2 block
+        float v = rgb[(y0   * w + x0  ) * 3 + c]
+                + rgb[(y0   * w + x0+1) * 3 + c]
+                + rgb[((y0+1) * w + x0  ) * 3 + c]
+                + rgb[((y0+1) * w + x0+1) * 3 + c];
+        m1[(y2 * w2 + x2) * 3 + c] = v * 0.25f;
+      }
+    }
   }
 
-  // Extract alpha channel (or use 1.0 if original was RGB)
-  std::vector<float> depth_data(width * height);
-  bool has_alpha = (channels == 4);
-  for (int i = 0; i < width * height; ++i) {
-    // Alpha is in data[i*4+3] (0-255), convert to float [0, 1]
-    // If no alpha channel, default to 1.0 (far plane)
-    depth_data[i] = has_alpha ? (data[i * 4 + 3] / 255.0f) : 1.0f;
+  std::vector<float> m2(w4 * h4 * 3);  // quarter-res: 2x2 box average of m1
+  for (int y4 = 0; y4 < h4; ++y4) {
+    for (int x4 = 0; x4 < w4; ++x4) {
+      for (int c = 0; c < 3; ++c) {
+        int y0 = y4 * 2, x0 = x4 * 2;  // top-left m1 texel of the 2x2 block
+        float v = m1[(y0   * w2 + x0  ) * 3 + c]
+                + m1[(y0   * w2 + x0+1) * 3 + c]
+                + m1[((y0+1) * w2 + x0  ) * 3 + c]
+                + m1[((y0+1) * w2 + x0+1) * 3 + c];
+        m2[(y4 * w4 + x4) * 3 + c] = v * 0.25f;
+      }
+    }
   }
-  stbi_image_free(data);
 
-  // Create R32Float depth texture
-  const WGPUTextureDescriptor depth_desc = {
-      .usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst,
-      .dimension = WGPUTextureDimension_2D,
-      .size = {(uint32_t)(width), (uint32_t)(height), 1},
-      .format = WGPUTextureFormat_R32Float,
-      .mipLevelCount = 1,
-      .sampleCount = 1,
-  };
-  WGPUTexture depth_texture = wgpuDeviceCreateTexture(device, &depth_desc);
-  if (!depth_texture) {
-    fprintf(stderr, "Error: failed to create depth texture\n");
-    return nullptr;
+  // Nearest upsample to full-res
+  mip1_out.resize(w * h * 3);
+  mip2_out.resize(w * h * 3);
+  for (int y = 0; y < h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int i = (y * w + x) * 3;
+      int i1 = ((y/2) * w2 + (x/2)) * 3;  // source texel in m1
+      int i2 = ((y/4) * w4 + (x/4)) * 3;  // source texel in m2
+      mip1_out[i  ] = (y/2 < h2 && x/2 < w2) ? m1[i1  ] : 0.0f;  // edge pixels with no m1 texel (odd w/h) get 0
+      mip1_out[i+1] = (y/2 < h2 && x/2 < w2) ? m1[i1+1] : 0.0f;
+      mip1_out[i+2] = (y/2 < h2 && x/2 < w2) ? m1[i1+2] : 0.0f;
+      mip2_out[i  ] = (y/4 < h4 && x/4 < w4) ? m2[i2  ] : 0.0f;  // same guard when w/h is not a multiple of 4
+      mip2_out[i+1] = (y/4 < h4 && x/4 < w4) ? m2[i2+1] : 0.0f;
+      mip2_out[i+2] = (y/4 < h4 && x/4 < w4) ? m2[i2+2] : 0.0f;
+    }
   }
-
-  // Write depth data
-  const WGPUTexelCopyTextureInfo dst = {.texture = depth_texture,
-                                        .mipLevel = 0};
-  const WGPUTexelCopyBufferLayout layout = {
-      .bytesPerRow = (uint32_t)(width * sizeof(float)),
-      .rowsPerImage = (uint32_t)(height)};
-  const WGPUExtent3D size = {(uint32_t)(width), (uint32_t)(height), 1};
-  wgpuQueueWriteTexture(queue, &dst, depth_data.data(),
-                        depth_data.size() * sizeof(float), &layout, &size);
-
-  printf("Loaded depth from alpha: %dx%d (%s alpha)\n", width, height,
-         has_alpha ? "has" : "no");
-
-  return depth_texture;
 }
 
-// Create CNN render pipeline (5 bindings)
-// Takes both intermediate format (RGBA16Float) and final format (BGRA8Unorm)
-static WGPURenderPipeline create_cnn_pipeline(WGPUDevice device,
-                                              WGPUTextureFormat format,
-                                              bool is_final_layer) {
-  const char* shader_code = SafeGetAsset(AssetId::ASSET_SHADER_CNN_LAYER);
-
-  // Debug: check if shader loaded
-  if (!shader_code || shader_code[0] == '\0') {
-    fprintf(stderr, "ERROR: CNN shader asset not loaded!\n");
-    return nullptr;
-  }
-  printf("Loaded CNN shader: %zu bytes\n", strlen(shader_code));
-
-  WGPUBindGroupLayout bgl =
-      BindGroupLayoutBuilder()
-          .sampler(0, WGPUShaderStage_Fragment)
-          .texture(1, WGPUShaderStage_Fragment)
-          .uniform(2, WGPUShaderStage_Vertex | WGPUShaderStage_Fragment)
-          .uniform(3, WGPUShaderStage_Fragment)
-          .texture(4, WGPUShaderStage_Fragment) // Original input
-          .build(device);
+// ---------------------------------------------------------------------------
+// Feature packing: RGB float arrays → feat_tex0 / feat_tex1 (rgba32uint)
+//
+// feat_tex0 (4 u32, f16 pairs — matches load_feat in cnn_v3_enc0.wgsl):
+//   [0] albedo.r | albedo.g
+//   [1] albedo.b | normal.x  (oct, [0,1] — training format)
+//   [2] normal.y | depth
+//   [3] dzdx     | dzdy
+//
+// feat_tex1 (4 u32, u8norm — channel order from cnn_v3_enc0.wgsl load_feat):
+//   [0] mat_id | prev.r | prev.g | prev.b
+//   [1] mip1.r | mip1.g | mip1.b | mip2.r
+//   [2] mip2.g | mip2.b | dif    | transp
+//   [3] 0
+//
+// Note: normal.xy stored in [0,1] (training format), NOT remapped to [-1,1]
+//       like gbuf_pack.wgsl does at runtime. This matches infer_cnn_v3.py.
+// ---------------------------------------------------------------------------
 
-  // Use appropriate format: RGBA16Float for intermediate, BGRA8Unorm for final
-  WGPUTextureFormat output_format = is_final_layer
-                                        ? WGPUTextureFormat_BGRA8Unorm
-                                        : WGPUTextureFormat_RGBA16Float;
+struct FeatureImages {
+  int w, h;                    // image size in pixels; planes below are indexed y*w + x
+  std::vector<float> albedo;   // w*h*3 [0,1], interleaved RGB
+  std::vector<float> normal;   // w*h*2 [0,1] oct-encoded, interleaved xy
+  std::vector<float> depth;    // w*h   [0,1]
+  std::vector<float> matid;    // w*h   [0,1] material id
+  std::vector<float> shadow;   // w*h   [0,1]
+  std::vector<float> transp;   // w*h   [0,1] transparency
+};
 
-  WGPURenderPipeline pipeline =
-      RenderPipelineBuilder(device)
-          .shader(shader_code) // compose=true by default
-          .bind_group_layout(bgl)
-          .format(output_format)
-          .build();
+static void pack_features(const FeatureImages& img,
+                           std::vector<uint32_t>& feat0,  // w*h*4 u32
+                           std::vector<uint32_t>& feat1)  // w*h*4 u32
+{
+  const int W = img.w, H = img.h;
+  feat0.resize(W * H * 4);
+  feat1.resize(W * H * 4);
 
-  wgpuBindGroupLayoutRelease(bgl);
-  return pipeline;
-}
+  std::vector<float> mip1, mip2;
+  compute_mips(img.albedo.data(), W, H, mip1, mip2);
 
-// Begin render pass with clear
-static WGPURenderPassEncoder begin_render_pass(WGPUCommandEncoder encoder,
-                                               WGPUTextureView view) {
-  const WGPURenderPassColorAttachment color_attachment = {
-      .view = view,
-      .depthSlice = WGPU_DEPTH_SLICE_UNDEFINED,
-      .loadOp = WGPULoadOp_Clear,
-      .storeOp = WGPUStoreOp_Store,
-      .clearValue = {0.0f, 0.0f, 0.0f, 1.0f},
-  };
-
-  const WGPURenderPassDescriptor pass_desc = {
-      .colorAttachmentCount = 1,
-      .colorAttachments = &color_attachment,
-  };
+  static const float KEY_X = 0.408f, KEY_Y = 0.816f, KEY_Z = 0.408f;
 
-  return wgpuCommandEncoderBeginRenderPass(encoder, &pass_desc);
-}
+  for (int y = 0; y < H; ++y) {
+    for (int x = 0; x < W; ++x) {
+      const int pi  = y * W + x;
+      const int i3  = pi * 3;
+      const int i4  = pi * 4;
 
-// Save PNG output
-static bool save_png(const char* path, const std::vector<uint8_t>& pixels,
-                     int width, int height) {
-  // Convert BGRA → RGBA
-  std::vector<uint8_t> rgba(width * height * 4);
-  for (int i = 0; i < width * height; ++i) {
-    rgba[i * 4 + 0] = pixels[i * 4 + 2]; // R
-    rgba[i * 4 + 1] = pixels[i * 4 + 1]; // G
-    rgba[i * 4 + 2] = pixels[i * 4 + 0]; // B
-    rgba[i * 4 + 3] = pixels[i * 4 + 3]; // A
-  }
+      float ar = img.albedo[i3  ];
+      float ag = img.albedo[i3+1];
+      float ab = img.albedo[i3+2];
 
-  if (!stbi_write_png(path, width, height, 4, rgba.data(), width * 4)) {
-    fprintf(stderr, "Error: failed to write PNG '%s'\n", path);
-    return false;
-  }
+      float nx = img.normal[pi * 2    ];  // [0,1]
+      float ny = img.normal[pi * 2 + 1];  // [0,1]
 
-  return true;
-}
+      float d = img.depth[pi];
 
-// Create horizontal grayscale composite of layer outputs
-// Each layer is already 4x wide (showing 4 channels), stack them vertically
-static bool save_layer_composite(const char* dir, int width, int height,
-                                 int num_layers) {
-  // Each layer PNG is already 4x wide with 4 channels side-by-side
-  int layer_width = width * 4;
+      // Central finite difference depth gradient
+      int xm = (x > 0)   ? x-1 : 0;
+      int xp = (x < W-1) ? x+1 : W-1;
+      int ym = (y > 0)   ? y-1 : 0;
+      int yp = (y < H-1) ? y+1 : H-1;
+      float dzdx = (img.depth[y   * W + xp] - img.depth[y   * W + xm]) * 0.5f;
+      float dzdy = (img.depth[yp  * W + x ] - img.depth[ym  * W + x ]) * 0.5f;
 
-  // Load all layer images (they're already grayscale)
-  std::vector<std::vector<uint8_t>> layers(num_layers);
-  for (int i = 0; i < num_layers; ++i) {
-    char path[512];
-    snprintf(path, sizeof(path), "%s/layer_%d.png", dir, i);
+      float mat  = img.matid[pi];
+      float shad = img.shadow[pi];
+      float trp  = img.transp[pi];
 
-    int w, h, channels;
-    uint8_t* data = stbi_load(path, &w, &h, &channels, 1); // Load as grayscale
-    if (!data || w != layer_width || h != height) {
-      if (data)
-        stbi_image_free(data);
-      fprintf(stderr,
-              "Warning: failed to load layer %d for composite (expected %dx%d, "
-              "got %dx%d)\n",
-              i, layer_width, height, w, h);
-      return false;
-    }
+      // Diffuse = max(0, dot(oct_decode(normal), KEY_LIGHT)) * shadow
+      float n3x, n3y, n3z;
+      oct_decode_01(nx, ny, &n3x, &n3y, &n3z);
+      float dif = fmaxf(0.0f, n3x*KEY_X + n3y*KEY_Y + n3z*KEY_Z) * shad;
 
-    layers[i].assign(data, data + (layer_width * height));
-    stbi_image_free(data);
-  }
+      float m1r = mip1[i3  ], m1g = mip1[i3+1], m1b = mip1[i3+2];
+      float m2r = mip2[i3  ], m2g = mip2[i3+1], m2b = mip2[i3+2];
 
-  // Stack layers vertically
-  int composite_height = height * num_layers;
-  std::vector<uint8_t> composite(layer_width * composite_height);
+      // prev.rgb = 0 (no temporal history)
+      feat0[i4  ] = pack2x16f(ar,  ag);
+      feat0[i4+1] = pack2x16f(ab,  nx);
+      feat0[i4+2] = pack2x16f(ny,  d );
+      feat0[i4+3] = pack2x16f(dzdx, dzdy);
 
-  for (int layer = 0; layer < num_layers; ++layer) {
-    for (int y = 0; y < height; ++y) {
-      int src_row_offset = y * layer_width;
-      int dst_row_offset = (layer * height + y) * layer_width;
-      memcpy(&composite[dst_row_offset], &layers[layer][src_row_offset],
-             layer_width);
+      feat1[i4  ] = pack4x8u(mat, 0.0f, 0.0f, 0.0f);  // mat_id, prev.rgb=0
+      feat1[i4+1] = pack4x8u(m1r, m1g, m1b, m2r);
+      feat1[i4+2] = pack4x8u(m2g, m2b, dif, trp);
+      feat1[i4+3] = 0u;
     }
   }
-
-  // Save as grayscale PNG (stacked vertically)
-  char composite_path[512];
-  snprintf(composite_path, sizeof(composite_path), "%s/layers_composite.png",
-           dir);
-  if (!stbi_write_png(composite_path, layer_width, composite_height, 1,
-                      composite.data(), layer_width)) {
-    fprintf(stderr, "Error: failed to write composite PNG\n");
-    return false;
-  }
-
-  printf("Saved layer composite to '%s' (%dx%d, 4 layers stacked vertically)\n",
-         composite_path, layer_width, composite_height);
-  return true;
 }
 
-// Save PPM output (fallback)
-static bool save_ppm(const char* path, const std::vector<uint8_t>& pixels,
-                     int width, int height) {
-  FILE* f = fopen(path, "wb");
-  if (!f) {
-    fprintf(stderr, "Error: failed to open '%s' for writing\n", path);
-    return false;
-  }
-
-  fprintf(f, "P6\n%d %d\n255\n", width, height);
-  for (int i = 0; i < width * height; ++i) {
-    const uint8_t rgb[3] = {pixels[i * 4 + 2],  // R
-                            pixels[i * 4 + 1],  // G
-                            pixels[i * 4 + 0]}; // B
-    fwrite(rgb, 1, 3, f);
-  }
+// ---------------------------------------------------------------------------
+// GPU texture helpers
+// ---------------------------------------------------------------------------
 
-  fclose(f);
-  return true;
+static WGPUTexture make_feat_tex(WGPUDevice dev, int W, int H) {
+  // Single-mip, single-sample 2D RGBA32Uint texture: copy target + texture binding.
+  WGPUTextureDescriptor desc = {};
+  desc.dimension = WGPUTextureDimension_2D;
+  desc.size = {static_cast<uint32_t>(W), static_cast<uint32_t>(H), 1};
+  desc.format = WGPUTextureFormat_RGBA32Uint;
+  desc.usage = WGPUTextureUsage_CopyDst | WGPUTextureUsage_TextureBinding;
+  desc.mipLevelCount = desc.sampleCount = 1;
+  return wgpuDeviceCreateTexture(dev, &desc);
 }
 
-// CNN v2 structures (matching CNNv2Effect)
-struct CNNv2LayerInfo {
-  uint32_t kernel_size;
-  uint32_t in_channels;
-  uint32_t out_channels;
-  uint32_t weight_offset;
-  uint32_t weight_count;
-};
-
-struct CNNv2LayerParams {
-  uint32_t kernel_size;
-  uint32_t in_channels;
-  uint32_t out_channels;
-  uint32_t weight_offset;
-  uint32_t is_output_layer;
-  float blend_amount;
-  uint32_t is_layer_0;
-};
-
-struct CNNv2StaticFeatureParams {
-  uint32_t mip_level;
-  uint32_t padding[3];
-};
-
-// Convert RGBA32Uint (packed f16) texture to BGRA8Unorm
-static std::vector<uint8_t>
-readback_rgba32uint_to_bgra8(WGPUDevice device, WGPUQueue queue,
-                             WGPUTexture texture, int width, int height) {
-  // Create staging buffer
-  const uint32_t bytes_per_row = width * 16; // 4×u32 per pixel
-  const uint32_t padded_bytes_per_row = (bytes_per_row + 255) & ~255;
-  const size_t buffer_size = padded_bytes_per_row * height;
-
-  WGPUBufferDescriptor buffer_desc = {};
-  buffer_desc.size = buffer_size;
-  buffer_desc.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead;
-  buffer_desc.mappedAtCreation = false;
+static WGPUTexture make_output_tex(WGPUDevice dev, int W, int H) {
+  // Single-mip, single-sample 2D RGBA16Float texture: storage binding + copy source.
+  WGPUTextureDescriptor desc = {};
+  desc.dimension = WGPUTextureDimension_2D;
+  desc.size = {static_cast<uint32_t>(W), static_cast<uint32_t>(H), 1};
+  desc.format = WGPUTextureFormat_RGBA16Float;
+  desc.usage = WGPUTextureUsage_CopySrc | WGPUTextureUsage_StorageBinding;
+  desc.mipLevelCount = desc.sampleCount = 1;
+  return wgpuDeviceCreateTexture(dev, &desc);
+}
 
-  WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &buffer_desc);
+static WGPUTextureView make_view(WGPUTexture tex, WGPUTextureFormat fmt) {
+  // 2D view of `tex` in format `fmt`, one mip level and one array layer.
+  WGPUTextureViewDescriptor view_desc = {};
+  view_desc.dimension = WGPUTextureViewDimension_2D;
+  view_desc.format = fmt;
+  view_desc.mipLevelCount = view_desc.arrayLayerCount = 1;
+  return wgpuTextureCreateView(tex, &view_desc);
+}
 
-  // Copy texture to buffer
-  WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(device, nullptr);
+static void upload_tex(WGPUQueue queue, WGPUTexture tex,
+                        const uint32_t* data, int W, int H) {
+  WGPUTexelCopyBufferLayout layout = {};  // tightly packed rows, 16 bytes/texel
+  layout.rowsPerImage = (uint32_t)H;
+  layout.bytesPerRow  = (uint32_t)(W * 16);
+  WGPUTexelCopyTextureInfo target = {};  // all other fields left zeroed
+  target.texture = tex;
+  const WGPUExtent3D extent = {(uint32_t)W, (uint32_t)H, 1};
+  wgpuQueueWriteTexture(queue, &target, data, (size_t)(W * H * 16), &layout, &extent);
+}
 
-  WGPUTexelCopyTextureInfo src = {};
-  src.texture = texture;
-  src.mipLevel = 0;
+// ---------------------------------------------------------------------------
+// RGBA16Float readback
+// ---------------------------------------------------------------------------
 
-  WGPUTexelCopyBufferInfo dst = {};
-  dst.buffer = staging;
-  dst.layout.bytesPerRow = padded_bytes_per_row;
-  dst.layout.rowsPerImage = height;
+static uint16_t fp16_bits_to_f16(float f) { return f32_to_f16(f); }  // forwards to f32_to_f16; input is a float despite the "bits" name
+static float fp16_bits_to_f32(uint16_t h) {  // IEEE 754 binary16 bits -> float
+  const uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
+  const uint32_t exp  = (h & 0x7C00u) >> 10;
+  const uint32_t mant = h & 0x03FFu;
+  // exp == 0 holds +/-0 and subnormals, both equal to mant * 2^-24 (exact in float).
+  if (exp == 0) { float r = (float)mant * (1.0f / 16777216.0f); return sign ? -r : r; }
+  // exp == 31 is Inf/NaN (float exponent 255); otherwise rebias 15 -> 127 (+112).
+  const uint32_t b = sign | ((exp == 31 ? 255u : exp + 112u) << 23) | (mant << 13);
+  float r; memcpy(&r, &b, 4); return r;
+}
 
-  WGPUExtent3D copy_size = {(uint32_t)(width), (uint32_t)(height), 1};
+struct MapState { bool done = false; WGPUMapAsyncStatus status = {}; };  // set by the map callback in readback_rgba16f
 
-  wgpuCommandEncoderCopyTextureToBuffer(encoder, &src, &dst, &copy_size);
+static std::vector<float> readback_rgba16f(WGPUDevice device, WGPUQueue queue,
+                                            WGPUTexture tex, int W, int H) {
+  const uint32_t bytes_per_px  = 8;
+  const uint32_t raw_bpr       = (uint32_t)(W * bytes_per_px);
+  const uint32_t aligned_bpr   = ((raw_bpr + 255u) / 256u) * 256u;
+  const size_t   buf_size      = (size_t)aligned_bpr * (size_t)H;
 
-  WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr);
-  wgpuQueueSubmit(queue, 1, &commands);
-  wgpuCommandBufferRelease(commands);
-  wgpuCommandEncoderRelease(encoder);
+  WGPUBufferDescriptor bd = {};
+  bd.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead;
+  bd.size  = buf_size;
+  WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &bd);
 
-  // Wait for copy to complete
+  WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(device, nullptr);
+  WGPUTexelCopyTextureInfo src = {}; src.texture = tex;
+  WGPUTexelCopyBufferInfo  dst = {};
+  dst.buffer              = staging;
+  dst.layout.bytesPerRow  = aligned_bpr;
+  dst.layout.rowsPerImage = (uint32_t)H;
+  WGPUExtent3D ext = {(uint32_t)W, (uint32_t)H, 1};
+  wgpuCommandEncoderCopyTextureToBuffer(enc, &src, &dst, &ext);
+  WGPUCommandBuffer cmds = wgpuCommandEncoderFinish(enc, nullptr);
+  wgpuQueueSubmit(queue, 1, &cmds);
+  wgpuCommandBufferRelease(cmds);
+  wgpuCommandEncoderRelease(enc);
   wgpuDevicePoll(device, true, nullptr);
 
-  // Map and read buffer
-  struct MapState {
-    bool done = false;
+  MapState ms = {};
+  WGPUBufferMapCallbackInfo mi = {};
+  mi.mode     = WGPUCallbackMode_AllowProcessEvents;
+  mi.callback = [](WGPUMapAsyncStatus s, WGPUStringView, void* u, void*) {
+    auto* st = (MapState*)u; st->status = s; st->done = true;
   };
-  MapState map_state;
-
-  auto map_cb = [](WGPUMapAsyncStatus status, WGPUStringView message,
-                   void* userdata1, void* userdata2) {
-    (void)message;
-    (void)userdata2;
-    MapState* state = (MapState*)userdata1;
-    state->done = (status == WGPUMapAsyncStatus_Success);
-  };
-
-  WGPUBufferMapCallbackInfo map_info = {};
-  map_info.mode = WGPUCallbackMode_AllowProcessEvents;
-  map_info.callback = map_cb;
-  map_info.userdata1 = &map_state;
-
-  wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buffer_size, map_info);
-
-  // Wait for mapping to complete
-  for (int i = 0; i < 100 && !map_state.done; ++i) {
+  mi.userdata1 = &ms;
+  wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buf_size, mi);
+  for (int i = 0; i < 200 && !ms.done; ++i)
     wgpuDevicePoll(device, true, nullptr);
-  }
-
-  if (!map_state.done) {
-    fprintf(stderr, "Error: Buffer mapping timed out\n");
-    wgpuBufferRelease(staging);
-    return std::vector<uint8_t>();
-  }
-
-  const uint32_t* mapped =
-      (const uint32_t*)wgpuBufferGetConstMappedRange(staging, 0, buffer_size);
-
-  std::vector<uint8_t> result(width * height * 4);
 
-  // Unpack f16 to u8 (BGRA)
-  for (int y = 0; y < height; ++y) {
-    const uint32_t* row =
-        (const uint32_t*)((const uint8_t*)mapped + y * padded_bytes_per_row);
-    for (int x = 0; x < width; ++x) {
-      // Read 4×u32 (8×f16)
-      uint32_t data[4];
-      data[0] = row[x * 4 + 0];
-      data[1] = row[x * 4 + 1];
-      data[2] = row[x * 4 + 2];
-      data[3] = row[x * 4 + 3];
-
-      // Extract RGBA channels (first 4 f16 values)
-      uint16_t r16 = data[0] & 0xFFFF;
-      uint16_t g16 = (data[0] >> 16) & 0xFFFF;
-      uint16_t b16 = data[1] & 0xFFFF;
-      uint16_t a16 = (data[1] >> 16) & 0xFFFF;
-
-      // Convert f16 to f32 (simple decode)
-      auto f16_to_f32 = [](uint16_t h) -> float {
-        uint32_t sign = (h >> 15) & 1;
-        uint32_t exp = (h >> 10) & 0x1F;
-        uint32_t frac = h & 0x3FF;
-
-        if (exp == 0) {
-          if (frac == 0)
-            return sign ? -0.0f : 0.0f;
-          // Denormal
-          float val = frac / 1024.0f / 16384.0f;
-          return sign ? -val : val;
+  std::vector<float> pixels(W * H * 4, 0.0f);
+  if (ms.done && ms.status == WGPUMapAsyncStatus_Success) {
+    const uint8_t* mapped = (const uint8_t*)
+        wgpuBufferGetConstMappedRange(staging, 0, buf_size);
+    if (mapped) {
+      for (int y = 0; y < H; ++y) {
+        const uint16_t* row = (const uint16_t*)(mapped + (size_t)y * aligned_bpr);
+        for (int x = 0; x < W; ++x) {
+          for (int c = 0; c < 4; ++c)
+            pixels[(y * W + x) * 4 + c] = fp16_bits_to_f32(row[x * 4 + c]);
         }
-        if (exp == 31) {
-          return frac ? NAN : (sign ? -INFINITY : INFINITY);
-        }
-
-        int32_t e = exp - 15;
-        float val = (1.0f + frac / 1024.0f) * powf(2.0f, e);
-        return sign ? -val : val;
-      };
-
-      float r = f16_to_f32(r16);
-      float g = f16_to_f32(g16);
-      float b = f16_to_f32(b16);
-      float a = f16_to_f32(a16);
-
-      // Clamp to [0,1] and convert to u8
-      auto clamp_u8 = [](float v) -> uint8_t {
-        if (v <= 0.0f)
-          return 0;
-        if (v >= 1.0f)
-          return 255;
-        return (uint8_t)(v * 255.0f + 0.5f);
-      };
-
-      result[(y * width + x) * 4 + 0] = clamp_u8(b);
-      result[(y * width + x) * 4 + 1] = clamp_u8(g);
-      result[(y * width + x) * 4 + 2] = clamp_u8(r);
-      result[(y * width + x) * 4 + 3] = clamp_u8(a);
+      }
     }
   }
-
   wgpuBufferUnmap(staging);
   wgpuBufferRelease(staging);
-
-  return result;
+  return pixels;
 }
 
-// Read RGBA32Uint and create 4x wide grayscale composite (each channel
-// side-by-side)
-static std::vector<uint8_t>
-readback_rgba32uint_to_composite(WGPUDevice device, WGPUQueue queue,
-                                 WGPUTexture texture, int width, int height) {
-  // First get BGRA8 data
-  std::vector<uint8_t> bgra =
-      readback_rgba32uint_to_bgra8(device, queue, texture, width, height);
-  if (bgra.empty())
-    return {};
-
-  // Create 4x wide grayscale image (one channel per horizontal strip)
-  int composite_width = width * 4;
-  std::vector<uint8_t> composite(composite_width * height);
-
-  for (int y = 0; y < height; ++y) {
-    for (int x = 0; x < width; ++x) {
-      int src_idx = (y * width + x) * 4;
-      uint8_t b = bgra[src_idx + 0];
-      uint8_t g = bgra[src_idx + 1];
-      uint8_t r = bgra[src_idx + 2];
-      uint8_t a = bgra[src_idx + 3];
+// ---------------------------------------------------------------------------
+// Image I/O helpers
+// ---------------------------------------------------------------------------
 
-      // Convert each channel to grayscale luminance
-      auto to_gray = [](uint8_t val) -> uint8_t { return val; };
-
-      // Place each channel in its horizontal strip
-      composite[y * composite_width + (0 * width + x)] =
-          to_gray(r); // Channel 0
-      composite[y * composite_width + (1 * width + x)] =
-          to_gray(g); // Channel 1
-      composite[y * composite_width + (2 * width + x)] =
-          to_gray(b); // Channel 2
-      composite[y * composite_width + (3 * width + x)] =
-          to_gray(a); // Channel 3
-    }
+static std::vector<float> load_png_rgb(const char* path, int* out_w, int* out_h) {
+  int width, height, channels;  // decode forced to 3 channels; empty vector on failure
+  uint8_t* texels = stbi_load(path, &width, &height, &channels, 3);
+  if (texels == nullptr) {
+    fprintf(stderr, "Error: cannot load '%s'\n", path);
+    return {};
   }
-
-  return composite;
+  *out_w = width;
+  *out_h = height;
+  std::vector<float> rgb(texels, texels + width * height * 3);  // exact u8 -> float
+  for (float& v : rgb) v /= 255.0f;
+  stbi_image_free(texels);
+  return rgb;
 }
 
-// Process image with CNN v2
-static bool process_cnn_v2(WGPUDevice device, WGPUQueue queue,
-                           WGPUInstance instance, WGPUTexture input_texture,
-                           int width, int height, const Args& args) {
-  printf("Using CNN v2 (storage buffer architecture)\n");
-
-  // Load weights (from file or asset system)
-  size_t weights_size = 0;
-  const uint8_t* weights_data = nullptr;
-  std::vector<uint8_t> file_weights; // For file-based loading
-
-  if (args.weights_path) {
-    // Load from file
-    printf("Loading weights from '%s'...\n", args.weights_path);
-    FILE* f = fopen(args.weights_path, "rb");
-    if (!f) {
-      fprintf(stderr, "Error: failed to open weights file '%s'\n",
-              args.weights_path);
-      return false;
-    }
-
-    fseek(f, 0, SEEK_END);
-    weights_size = ftell(f);
-    fseek(f, 0, SEEK_SET);
-
-    file_weights.resize(weights_size);
-    size_t read = fread(file_weights.data(), 1, weights_size, f);
-    fclose(f);
-
-    if (read != weights_size) {
-      fprintf(stderr, "Error: failed to read weights file\n");
-      return false;
-    }
-
-    weights_data = file_weights.data();
-  } else {
-    // Load from asset system
-    weights_data =
-        (const uint8_t*)GetAsset(AssetId::ASSET_WEIGHTS_CNN_V2, &weights_size);
+// Reads an RGB PNG and keeps only R and G, scaled to [0,1] (2 floats per pixel).
+static std::vector<float> load_png_rg(const char* path, int ew, int eh) {
+  int width, height, channels;
+  uint8_t* texels = stbi_load(path, &width, &height, &channels, 3);
+  if (texels == nullptr || width != ew || height != eh) {
+    if (texels != nullptr) stbi_image_free(texels);
+    fprintf(stderr, "Warning: cannot load normal '%s' — using (0.5,0.5)\n", path);
+    // Fallback when the file is missing or not ew x eh: every component is 0.5.
+    return std::vector<float>(ew * eh * 2, 0.5f);
   }
-
-  if (!weights_data || weights_size < 20) {
-    fprintf(stderr, "Error: CNN v2 weights not available\n");
-    return false;
+  std::vector<float> rg(width * height * 2);
+  for (int px = 0, n = width * height; px < n; ++px) {
+    rg[2 * px]     = texels[3 * px]     / 255.0f;
+    rg[2 * px + 1] = texels[3 * px + 1] / 255.0f;
   }
+  stbi_image_free(texels);
+  return rg;
+}
 
-  // Parse header
-  const uint32_t* header = (const uint32_t*)weights_data;
-  uint32_t magic = header[0];
-  uint32_t version = header[1];
-  uint32_t num_layers = header[2];
-  uint32_t total_weights = header[3];
-
-  if (magic != 0x324e4e43) { // 'CNN2'
-    fprintf(stderr, "Error: Invalid CNN v2 weights magic\n");
-    return false;
+// Decode a 16-bit single-channel PNG into floats in [0,1]; zeros on any failure.
+static std::vector<float> load_png_depth16(const char* path, int ew, int eh) {
+  int width, height, channels;
+  uint16_t* texels = stbi_load_16(path, &width, &height, &channels, 1);
+  if (texels == nullptr || width != ew || height != eh) {
+    if (texels != nullptr) stbi_image_free(texels);
+    fprintf(stderr, "Warning: cannot load depth '%s' — using 0\n", path);
+    return std::vector<float>(ew * eh, 0.0f);
   }
+  const int count = width * height;
+  std::vector<float> depth(texels, texels + count);  // exact u16 -> float
+  for (float& v : depth) v /= 65535.0f;
+  stbi_image_free(texels);
+  return depth;
+}
 
-  uint32_t mip_level = 0;
-  if (version == 2) {
-    mip_level = header[4];
+// Decode an 8-bit single-channel PNG into [0,1]; ew*eh of default_val on failure.
+static std::vector<float> load_png_gray(const char* path, int ew, int eh,
+                                         float default_val = 0.0f) {
+  int width, height, channels;
+  uint8_t* texels = stbi_load(path, &width, &height, &channels, 1);
+  if (texels == nullptr || width != ew || height != eh) {
+    if (texels != nullptr) stbi_image_free(texels);
+    return std::vector<float>(ew * eh, default_val);
   }
+  // Range-construct (exact u8 -> float), then scale in place.
+  std::vector<float> gray(texels, texels + width * height);
+  for (float& v : gray) v /= 255.0f;
+  stbi_image_free(texels);
+  return gray;
+}
 
-  printf("Loaded CNN v2 weights: %u layers, %u weights, version %u\n",
-         num_layers, total_weights, version);
-
-  // Parse layer info
-  const uint32_t header_u32_count = (version == 1) ? 4 : 5;
-  const uint32_t* layer_data = header + header_u32_count;
-  std::vector<CNNv2LayerInfo> layer_info;
-
-  for (uint32_t i = 0; i < num_layers; ++i) {
-    CNNv2LayerInfo info;
-    info.kernel_size = layer_data[i * 5 + 0];
-    info.in_channels = layer_data[i * 5 + 1];
-    info.out_channels = layer_data[i * 5 + 2];
-    info.weight_offset = layer_data[i * 5 + 3];
-    info.weight_count = layer_data[i * 5 + 4];
-    layer_info.push_back(info);
-
-    printf("  Layer %u: %ux%u conv, %u→%u channels, %u weights\n", i,
-           info.kernel_size, info.kernel_size, info.in_channels,
-           info.out_channels, info.weight_count);
+static bool save_png(const char* path, const std::vector<float>& rgba_f32,
+                     int w, int h) {  // writes w*h RGBA8 PNG; false on write failure
+  std::vector<uint8_t> rgba8(w * h * 4);
+  for (int i = 0; i < w * h * 4; ++i) {  // [0,1] -> [0,255], rounded and clamped
+    const float f = rgba_f32[i];  // clamp as float: (int) of NaN/Inf is UB; NaN -> 0
+    rgba8[i] = (uint8_t)(f >= 1.0f ? 255 : f > 0.0f ? (int)(f * 255.0f + 0.5f) : 0);
   }
-
-  // Create weights storage buffer (skip header + layer info, upload only
-  // weights)
-  size_t header_size = 20;                         // 5 u32
-  size_t layer_info_size = 20 * layer_info.size(); // 5 u32 per layer
-  size_t weights_offset = header_size + layer_info_size;
-  size_t weights_only_size = weights_size - weights_offset;
-
-  WGPUBufferDescriptor weights_buffer_desc = {};
-  weights_buffer_desc.size = weights_only_size;
-  weights_buffer_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst;
-  weights_buffer_desc.mappedAtCreation = false;
-
-  WGPUBuffer weights_buffer =
-      wgpuDeviceCreateBuffer(device, &weights_buffer_desc);
-  wgpuQueueWriteBuffer(queue, weights_buffer, 0, weights_data + weights_offset,
-                       weights_only_size);
-
-  // Create input view
-  WGPUTextureView input_view =
-      gpu_create_texture_view_2d(input_texture, WGPUTextureFormat_BGRA8Unorm);
-
-  // Create static features texture (RGBA32Uint)
-  const WGPUTextureDescriptor static_desc = {
-      .usage = WGPUTextureUsage_StorageBinding |
-               WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopySrc,
-      .dimension = WGPUTextureDimension_2D,
-      .size = {(uint32_t)(width), (uint32_t)(height), 1},
-      .format = WGPUTextureFormat_RGBA32Uint,
-      .mipLevelCount = 1,
-      .sampleCount = 1,
-  };
-  WGPUTexture static_features_tex =
-      wgpuDeviceCreateTexture(device, &static_desc);
-  WGPUTextureView static_features_view =
-      wgpuTextureCreateView(static_features_tex, nullptr);
-
-  // Load depth from input alpha channel (or 1.0 if no alpha)
-  WGPUTexture depth_texture =
-      load_depth_from_alpha(device, queue, args.input_path, width, height);
-  if (!depth_texture) {
-    wgpuTextureViewRelease(static_features_view);
-    wgpuTextureRelease(static_features_tex);
-    wgpuBufferRelease(weights_buffer);
-    wgpuTextureViewRelease(input_view);
+  if (!stbi_write_png(path, w, h, 4, rgba8.data(), w * 4)) {
+    fprintf(stderr, "Error: failed to write '%s'\n", path);
     return false;
   }
-  WGPUTextureView depth_view = wgpuTextureCreateView(depth_texture, nullptr);
-
-  // Create layer textures (ping-pong)
-  WGPUTexture layer_textures[2] = {
-      wgpuDeviceCreateTexture(device, &static_desc),
-      wgpuDeviceCreateTexture(device, &static_desc),
-  };
-  WGPUTextureView layer_views[2] = {
-      wgpuTextureCreateView(layer_textures[0], nullptr),
-      wgpuTextureCreateView(layer_textures[1], nullptr),
-  };
+  return true;
+}
 
-  // Load shaders
-  const char* static_shader = SafeGetAsset(AssetId::ASSET_SHADER_CNN_V2_STATIC);
-  const char* layer_shader = SafeGetAsset(AssetId::ASSET_SHADER_CNN_V2_COMPUTE);
+// ---------------------------------------------------------------------------
+// Weight loading
+// ---------------------------------------------------------------------------
 
-  if (!static_shader[0] || !layer_shader[0]) {
-    fprintf(stderr, "Error: CNN v2 shaders not available\n");
-    wgpuTextureViewRelease(static_features_view);
-    wgpuTextureRelease(static_features_tex);
-    wgpuTextureViewRelease(depth_view);
-    wgpuTextureRelease(depth_texture);
-    wgpuTextureViewRelease(layer_views[0]);
-    wgpuTextureViewRelease(layer_views[1]);
-    wgpuTextureRelease(layer_textures[0]);
-    wgpuTextureRelease(layer_textures[1]);
-    wgpuBufferRelease(weights_buffer);
-    wgpuTextureViewRelease(input_view);
+static bool load_weights_bin(const char* path, std::vector<uint32_t>& out) {
+  FILE* f = fopen(path, "rb");
+  if (!f) {
+    fprintf(stderr, "Error: cannot open weights '%s'\n", path);
     return false;
   }
-
-  // Create static feature params buffer
-  WGPUBufferDescriptor static_params_desc = {};
-  static_params_desc.size = sizeof(CNNv2StaticFeatureParams);
-  static_params_desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst;
-  static_params_desc.mappedAtCreation = false;
-
-  WGPUBuffer static_params_buffer =
-      wgpuDeviceCreateBuffer(device, &static_params_desc);
-
-  CNNv2StaticFeatureParams static_params;
-  static_params.mip_level = mip_level;
-  static_params.padding[0] = 0;
-  static_params.padding[1] = 0;
-  static_params.padding[2] = 0;
-  wgpuQueueWriteBuffer(queue, static_params_buffer, 0, &static_params,
-                       sizeof(static_params));
-
-  // Create linear sampler for bilinear interpolation
-  WGPUSamplerDescriptor linear_sampler_desc = {};
-  linear_sampler_desc.addressModeU = WGPUAddressMode_ClampToEdge;
-  linear_sampler_desc.addressModeV = WGPUAddressMode_ClampToEdge;
-  linear_sampler_desc.addressModeW = WGPUAddressMode_ClampToEdge;
-  linear_sampler_desc.magFilter = WGPUFilterMode_Linear;
-  linear_sampler_desc.minFilter = WGPUFilterMode_Linear;
-  linear_sampler_desc.mipmapFilter = WGPUMipmapFilterMode_Linear;
-  linear_sampler_desc.lodMinClamp = 0.0f;
-  linear_sampler_desc.lodMaxClamp = 32.0f;
-  linear_sampler_desc.maxAnisotropy = 1;
-
-  WGPUSampler linear_sampler =
-      wgpuDeviceCreateSampler(device, &linear_sampler_desc);
-
-  // Create static features compute pipeline
-  WGPUShaderSourceWGSL static_wgsl = {};
-  static_wgsl.chain.sType = WGPUSType_ShaderSourceWGSL;
-  static_wgsl.code = str_view(static_shader);
-
-  WGPUShaderModuleDescriptor static_module_desc = {};
-  static_module_desc.nextInChain = &static_wgsl.chain;
-
-  WGPUShaderModule static_module =
-      wgpuDeviceCreateShaderModule(device, &static_module_desc);
-
-  // Bind group layout: 0=input, 1=input_mip1, 2=input_mip2, 3=depth, 4=output,
-  // 5=params, 6=linear_sampler
-  WGPUBindGroupLayoutEntry static_bgl_entries[7] = {};
-  static_bgl_entries[0].binding = 0;
-  static_bgl_entries[0].visibility = WGPUShaderStage_Compute;
-  static_bgl_entries[0].texture.sampleType = WGPUTextureSampleType_Float;
-  static_bgl_entries[0].texture.viewDimension = WGPUTextureViewDimension_2D;
-
-  static_bgl_entries[1].binding = 1;
-  static_bgl_entries[1].visibility = WGPUShaderStage_Compute;
-  static_bgl_entries[1].texture.sampleType = WGPUTextureSampleType_Float;
-  static_bgl_entries[1].texture.viewDimension = WGPUTextureViewDimension_2D;
-
-  static_bgl_entries[2].binding = 2;
-  static_bgl_entries[2].visibility = WGPUShaderStage_Compute;
-  static_bgl_entries[2].texture.sampleType = WGPUTextureSampleType_Float;
-  static_bgl_entries[2].texture.viewDimension = WGPUTextureViewDimension_2D;
-
-  static_bgl_entries[3].binding = 3;
-  static_bgl_entries[3].visibility = WGPUShaderStage_Compute;
-  static_bgl_entries[3].texture.sampleType =
-      WGPUTextureSampleType_UnfilterableFloat;
-  static_bgl_entries[3].texture.viewDimension = WGPUTextureViewDimension_2D;
-
-  static_bgl_entries[4].binding = 4;
-  static_bgl_entries[4].visibility = WGPUShaderStage_Compute;
-  static_bgl_entries[4].storageTexture.access =
-      WGPUStorageTextureAccess_WriteOnly;
-  static_bgl_entries[4].storageTexture.format = WGPUTextureFormat_RGBA32Uint;
-  static_bgl_entries[4].storageTexture.viewDimension =
-      WGPUTextureViewDimension_2D;
-
-  static_bgl_entries[5].binding = 5;
-  static_bgl_entries[5].visibility = WGPUShaderStage_Compute;
-  static_bgl_entries[5].buffer.type = WGPUBufferBindingType_Uniform;
-  static_bgl_entries[5].buffer.minBindingSize =
-      sizeof(CNNv2StaticFeatureParams);
-
-  static_bgl_entries[6].binding = 6;
-  static_bgl_entries[6].visibility = WGPUShaderStage_Compute;
-  static_bgl_entries[6].sampler.type = WGPUSamplerBindingType_Filtering;
-
-  WGPUBindGroupLayoutDescriptor static_bgl_desc = {};
-  static_bgl_desc.entryCount = 7;
-  static_bgl_desc.entries = static_bgl_entries;
-
-  WGPUBindGroupLayout static_bgl =
-      wgpuDeviceCreateBindGroupLayout(device, &static_bgl_desc);
-
-  WGPUPipelineLayoutDescriptor static_pl_desc = {};
-  static_pl_desc.bindGroupLayoutCount = 1;
-  static_pl_desc.bindGroupLayouts = &static_bgl;
-
-  WGPUPipelineLayout static_pl =
-      wgpuDeviceCreatePipelineLayout(device, &static_pl_desc);
-
-  WGPUComputePipelineDescriptor static_pipeline_desc = {};
-  static_pipeline_desc.compute.module = static_module;
-  static_pipeline_desc.compute.entryPoint = str_view("main");
-  static_pipeline_desc.layout = static_pl;
-
-  WGPUComputePipeline static_pipeline =
-      wgpuDeviceCreateComputePipeline(device, &static_pipeline_desc);
-
-  wgpuShaderModuleRelease(static_module);
-  wgpuPipelineLayoutRelease(static_pl);
-
-  // Create static bind group (use input as all mips for simplicity)
-  WGPUBindGroupEntry static_bg_entries[7] = {};
-  static_bg_entries[0].binding = 0;
-  static_bg_entries[0].textureView = input_view;
-  static_bg_entries[1].binding = 1;
-  static_bg_entries[1].textureView = input_view;
-  static_bg_entries[2].binding = 2;
-  static_bg_entries[2].textureView = input_view;
-  static_bg_entries[3].binding = 3;
-  static_bg_entries[3].textureView =
-      depth_view; // Depth from alpha channel (matches training)
-  static_bg_entries[4].binding = 4;
-  static_bg_entries[4].textureView = static_features_view;
-  static_bg_entries[5].binding = 5;
-  static_bg_entries[5].buffer = static_params_buffer;
-  static_bg_entries[5].size = sizeof(CNNv2StaticFeatureParams);
-  static_bg_entries[6].binding = 6;
-  static_bg_entries[6].sampler = linear_sampler;
-
-  WGPUBindGroupDescriptor static_bg_desc = {};
-  static_bg_desc.layout = static_bgl;
-  static_bg_desc.entryCount = 7;
-  static_bg_desc.entries = static_bg_entries;
-
-  WGPUBindGroup static_bg = wgpuDeviceCreateBindGroup(device, &static_bg_desc);
-
-  wgpuBindGroupLayoutRelease(static_bgl);
-
-  // Create layer compute pipeline
-  WGPUShaderSourceWGSL layer_wgsl = {};
-  layer_wgsl.chain.sType = WGPUSType_ShaderSourceWGSL;
-  layer_wgsl.code = str_view(layer_shader);
-
-  WGPUShaderModuleDescriptor layer_module_desc = {};
-  layer_module_desc.nextInChain = &layer_wgsl.chain;
-
-  WGPUShaderModule layer_module =
-      wgpuDeviceCreateShaderModule(device, &layer_module_desc);
-
-  // Layer bind group layout:
-  // 0=static_features, 1=layer_input, 2=output, 3=weights, 4=params,
-  // 5=original
-  WGPUBindGroupLayoutEntry layer_bgl_entries[6] = {};
-  layer_bgl_entries[0].binding = 0;
-  layer_bgl_entries[0].visibility = WGPUShaderStage_Compute;
-  layer_bgl_entries[0].texture.sampleType = WGPUTextureSampleType_Uint;
-  layer_bgl_entries[0].texture.viewDimension = WGPUTextureViewDimension_2D;
-
-  layer_bgl_entries[1].binding = 1;
-  layer_bgl_entries[1].visibility = WGPUShaderStage_Compute;
-  layer_bgl_entries[1].texture.sampleType = WGPUTextureSampleType_Uint;
-  layer_bgl_entries[1].texture.viewDimension = WGPUTextureViewDimension_2D;
-
-  layer_bgl_entries[2].binding = 2;
-  layer_bgl_entries[2].visibility = WGPUShaderStage_Compute;
-  layer_bgl_entries[2].storageTexture.access =
-      WGPUStorageTextureAccess_WriteOnly;
-  layer_bgl_entries[2].storageTexture.format = WGPUTextureFormat_RGBA32Uint;
-  layer_bgl_entries[2].storageTexture.viewDimension =
-      WGPUTextureViewDimension_2D;
-
-  layer_bgl_entries[3].binding = 3;
-  layer_bgl_entries[3].visibility = WGPUShaderStage_Compute;
-  layer_bgl_entries[3].buffer.type = WGPUBufferBindingType_ReadOnlyStorage;
-
-  layer_bgl_entries[4].binding = 4;
-  layer_bgl_entries[4].visibility = WGPUShaderStage_Compute;
-  layer_bgl_entries[4].buffer.type = WGPUBufferBindingType_Uniform;
-  layer_bgl_entries[4].buffer.minBindingSize = sizeof(CNNv2LayerParams);
-
-  layer_bgl_entries[5].binding = 5;
-  layer_bgl_entries[5].visibility = WGPUShaderStage_Compute;
-  layer_bgl_entries[5].texture.sampleType = WGPUTextureSampleType_Float;
-  layer_bgl_entries[5].texture.viewDimension = WGPUTextureViewDimension_2D;
-
-  WGPUBindGroupLayoutDescriptor layer_bgl_desc = {};
-  layer_bgl_desc.entryCount = 6;
-  layer_bgl_desc.entries = layer_bgl_entries;
-
-  WGPUBindGroupLayout layer_bgl =
-      wgpuDeviceCreateBindGroupLayout(device, &layer_bgl_desc);
-
-  WGPUPipelineLayoutDescriptor layer_pl_desc = {};
-  layer_pl_desc.bindGroupLayoutCount = 1;
-  layer_pl_desc.bindGroupLayouts = &layer_bgl;
-
-  WGPUPipelineLayout layer_pl =
-      wgpuDeviceCreatePipelineLayout(device, &layer_pl_desc);
-
-  WGPUComputePipelineDescriptor layer_pipeline_desc = {};
-  layer_pipeline_desc.compute.module = layer_module;
-  layer_pipeline_desc.compute.entryPoint = str_view("main");
-  layer_pipeline_desc.layout = layer_pl;
-
-  WGPUComputePipeline layer_pipeline =
-      wgpuDeviceCreateComputePipeline(device, &layer_pipeline_desc);
-
-  wgpuShaderModuleRelease(layer_module);
-  wgpuPipelineLayoutRelease(layer_pl);
-
-  // Create layer params buffers
-  std::vector<WGPUBuffer> layer_params_buffers;
-  for (size_t i = 0; i < layer_info.size(); ++i) {
-    WGPUBufferDescriptor params_desc = {};
-    params_desc.size = sizeof(CNNv2LayerParams);
-    params_desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst;
-    params_desc.mappedAtCreation = false;
-
-    WGPUBuffer buf = wgpuDeviceCreateBuffer(device, &params_desc);
-    layer_params_buffers.push_back(buf);
-  }
-
-  // Execute compute passes
-  WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(device, nullptr);
-
-  // Pass 1: Static features
-  printf("Computing static features...\n");
-  WGPUComputePassEncoder static_pass =
-      wgpuCommandEncoderBeginComputePass(encoder, nullptr);
-  wgpuComputePassEncoderSetPipeline(static_pass, static_pipeline);
-  wgpuComputePassEncoderSetBindGroup(static_pass, 0, static_bg, 0, nullptr);
-
-  uint32_t workgroups_x = (width + 7) / 8;
-  uint32_t workgroups_y = (height + 7) / 8;
-  wgpuComputePassEncoderDispatchWorkgroups(static_pass, workgroups_x,
-                                           workgroups_y, 1);
-
-  wgpuComputePassEncoderEnd(static_pass);
-  wgpuComputePassEncoderRelease(static_pass);
-
-  // Save static features if requested
-  if (args.save_intermediates) {
-    // Submit and wait for static features to complete
-    WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, nullptr);
-    wgpuQueueSubmit(queue, 1, &cmd);
-    wgpuCommandBufferRelease(cmd);
-    wgpuDevicePoll(device, true, nullptr);
-
-    // Create new encoder for layers
-    encoder = wgpuDeviceCreateCommandEncoder(device, nullptr);
-
-    char layer_path[512];
-    snprintf(layer_path, sizeof(layer_path), "%s/static_features.png",
-             args.save_intermediates);
-    printf("Saving static features to '%s'...\n", layer_path);
-
-    // Read back RGBA32Uint and create 8-channel grayscale composite
-    // Static features has 8 channels (packed as 4×u32), create 8x wide
-    // composite
-    std::vector<uint8_t> bgra = readback_rgba32uint_to_bgra8(
-        device, queue, static_features_tex, width, height);
-
-    if (!bgra.empty()) {
-      // Static features: 8 f16 values packed in 4×u32
-      // For now, just show first 4 channels (like layers)
-      // TODO: Show all 8 channels in 8x wide composite
-      std::vector<uint8_t> composite = readback_rgba32uint_to_composite(
-          device, queue, static_features_tex, width, height);
-      if (!composite.empty()) {
-        int composite_width = width * 4;
-        if (!stbi_write_png(layer_path, composite_width, height, 1,
-                            composite.data(), composite_width)) {
-          fprintf(stderr, "Error: failed to write static features PNG\n");
-        }
-      }
-    }
+  fseek(f, 0, SEEK_END);
+  long sz = ftell(f);
+  rewind(f);
+  if (sz <= 0 || sz % 4 != 0) {
+    fprintf(stderr, "Error: bad weights file size %ld\n", sz);
+    fclose(f);
+    return false;
   }
-
-  // Pass 2-N: CNN layers
-  for (size_t i = 0; i < layer_info.size(); ++i) {
-    const CNNv2LayerInfo& info = layer_info[i];
-
-    printf("Processing layer %zu/%zu (%ux%u, %u→%u channels)...\n", i + 1,
-           layer_info.size(), info.kernel_size, info.kernel_size,
-           info.in_channels, info.out_channels);
-
-    // Update layer params
-    CNNv2LayerParams params;
-    params.kernel_size = info.kernel_size;
-    params.in_channels = info.in_channels;
-    params.out_channels = info.out_channels;
-    params.weight_offset = info.weight_offset;
-    params.is_output_layer = (i == layer_info.size() - 1) ? 1 : 0;
-    params.blend_amount = args.blend;
-    params.is_layer_0 = (i == 0) ? 1 : 0;
-
-    wgpuQueueWriteBuffer(queue, layer_params_buffers[i], 0, &params,
-                         sizeof(params));
-
-    // Create bind group for this layer
-    WGPUBindGroupEntry layer_bg_entries[6] = {};
-    layer_bg_entries[0].binding = 0;
-    layer_bg_entries[0].textureView = static_features_view;
-
-    layer_bg_entries[1].binding = 1;
-    layer_bg_entries[1].textureView =
-        (i == 0) ? static_features_view : layer_views[i % 2];
-
-    layer_bg_entries[2].binding = 2;
-    layer_bg_entries[2].textureView = layer_views[(i + 1) % 2];
-
-    layer_bg_entries[3].binding = 3;
-    layer_bg_entries[3].buffer = weights_buffer;
-    layer_bg_entries[3].size = weights_only_size;
-
-    layer_bg_entries[4].binding = 4;
-    layer_bg_entries[4].buffer = layer_params_buffers[i];
-    layer_bg_entries[4].size = sizeof(CNNv2LayerParams);
-
-    layer_bg_entries[5].binding = 5;
-    layer_bg_entries[5].textureView = input_view;
-
-    WGPUBindGroupDescriptor layer_bg_desc = {};
-    layer_bg_desc.layout = layer_bgl;
-    layer_bg_desc.entryCount = 6;
-    layer_bg_desc.entries = layer_bg_entries;
-
-    WGPUBindGroup layer_bg = wgpuDeviceCreateBindGroup(device, &layer_bg_desc);
-
-    WGPUComputePassEncoder layer_pass =
-        wgpuCommandEncoderBeginComputePass(encoder, nullptr);
-    wgpuComputePassEncoderSetPipeline(layer_pass, layer_pipeline);
-    wgpuComputePassEncoderSetBindGroup(layer_pass, 0, layer_bg, 0, nullptr);
-
-    wgpuComputePassEncoderDispatchWorkgroups(layer_pass, workgroups_x,
-                                             workgroups_y, 1);
-
-    wgpuComputePassEncoderEnd(layer_pass);
-    wgpuComputePassEncoderRelease(layer_pass);
-    wgpuBindGroupRelease(layer_bg);
-
-    // Save intermediate layer if requested
-    if (args.save_intermediates) {
-      // Submit and wait for layer to complete
-      WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, nullptr);
-      wgpuQueueSubmit(queue, 1, &cmd);
-      wgpuCommandBufferRelease(cmd);
-      wgpuDevicePoll(device, true, nullptr);
-
-      // Create new encoder for next layer
-      encoder = wgpuDeviceCreateCommandEncoder(device, nullptr);
-
-      char layer_path[512];
-      snprintf(layer_path, sizeof(layer_path), "%s/layer_%zu.png",
-               args.save_intermediates, i);
-      printf("Saving intermediate layer %zu to '%s'...\n", i, layer_path);
-
-      // Read back RGBA32Uint and create 4-channel grayscale composite
-      WGPUTexture output_tex = layer_textures[(i + 1) % 2];
-      std::vector<uint8_t> composite = readback_rgba32uint_to_composite(
-          device, queue, output_tex, width, height);
-
-      if (!composite.empty()) {
-        int composite_width = width * 4;
-        if (!stbi_write_png(layer_path, composite_width, height, 1,
-                            composite.data(), composite_width)) {
-          fprintf(stderr, "Error: failed to write layer PNG\n");
-        }
-      }
-    }
+  out.resize((size_t)sz / 4);
+  if ((long)fread(out.data(), 4, out.size(), f) != sz / 4) {
+    fprintf(stderr, "Error: read failed for '%s'\n", path);
+    fclose(f);
+    return false;
   }
+  fclose(f);
+  return true;
+}
 
-  WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr);
-  wgpuQueueSubmit(queue, 1, &commands);
-  wgpuCommandBufferRelease(commands);
-  wgpuCommandEncoderRelease(encoder);
-
-  wgpuDevicePoll(device, true, nullptr);
-
-  // Create layer composite if intermediates were saved
-  if (args.save_intermediates) {
-    save_layer_composite(args.save_intermediates, width, height,
-                         layer_info.size());
-  }
+// ---------------------------------------------------------------------------
+// Args
+// ---------------------------------------------------------------------------
 
-  // Readback final result (from last layer's output texture)
-  printf("Reading pixels from GPU...\n");
-  size_t final_layer_idx = (layer_info.size()) % 2;
-  std::vector<uint8_t> pixels = readback_rgba32uint_to_bgra8(
-      device, queue, layer_textures[final_layer_idx], width, height);
+struct Args {  // Command-line options filled in by parse_args().
+  const char* input_path   = nullptr;  // argv[1]: input PNG
+  const char* output_path  = nullptr;  // argv[2]: output PNG
+  const char* sample_dir   = nullptr;  // --sample-dir DIR; nullptr selects simple mode
+  const char* weights_path = nullptr;  // --weights FILE; nullptr keeps the default weights
+  bool        debug_hex    = false;    // --debug-hex: dump the first 8 output pixels
+};
 
-  if (pixels.empty()) {
-    fprintf(stderr, "Error: GPU readback failed\n");
-    for (auto buf : layer_params_buffers)
-      wgpuBufferRelease(buf);
-    wgpuComputePipelineRelease(layer_pipeline);
-    wgpuBindGroupLayoutRelease(layer_bgl);
-    wgpuBindGroupRelease(static_bg);
-    wgpuComputePipelineRelease(static_pipeline);
-    wgpuBufferRelease(static_params_buffer);
-    wgpuTextureViewRelease(static_features_view);
-    wgpuTextureRelease(static_features_tex);
-    wgpuTextureViewRelease(depth_view);
-    wgpuTextureRelease(depth_texture);
-    wgpuTextureViewRelease(layer_views[0]);
-    wgpuTextureViewRelease(layer_views[1]);
-    wgpuTextureRelease(layer_textures[0]);
-    wgpuTextureRelease(layer_textures[1]);
-    wgpuBufferRelease(weights_buffer);
-    wgpuTextureViewRelease(input_view);
-    return false;
-  }
+static void print_usage(const char* prog) {  // Writes the command-line help text to stderr.
+  fprintf(stderr, "Usage: %s input.png output.png [OPTIONS]\n", prog);
+  fputs("\nOPTIONS:\n", stderr);
+  fputs("  --sample-dir DIR   Full sample dir with albedo/normal/depth/matid/shadow/transp\n", stderr);
+  fputs("  --weights FILE     Load weights from cnn_v3_weights.bin\n", stderr);
+  fputs("  --debug-hex        Print first 8 output pixels as hex\n", stderr);
+  fputs("  --help             Show this help\n", stderr);
+  fputs("\nSimple mode (single PNG): geometry channels zeroed, normal=(0.5,0.5).\n", stderr);
+  fputs("FiLM is always identity (gamma=1, beta=0).\n", stderr);
+  fputs("\nNote: feature packing uses [0,1] oct-normals (training format) to match\n", stderr);
+  fputs("      infer_cnn_v3.py for direct Python/WGSL comparison.\n", stderr);
+}
 
-  // Debug hex dump
-  if (args.debug_hex) {
-    printf("First 8 pixels (BGRA hex):\n");
-    for (int i = 0; i < 8 && i < width * height; ++i) {
-      const uint8_t b = pixels[i * 4 + 0];
-      const uint8_t g = pixels[i * 4 + 1];
-      const uint8_t r = pixels[i * 4 + 2];
-      const uint8_t a = pixels[i * 4 + 3];
-      printf("  [%d] 0x%02X%02X%02X%02X (RGBA)\n", i, r, g, b, a);
+static bool parse_args(int argc, char** argv, Args* args) {  // false => caller prints usage and exits 1
+  if (argc < 3) return false;  // both positional paths are mandatory
+  args->input_path  = argv[1];
+  args->output_path = argv[2];
+  for (int i = 3; i < argc; ++i) {  // everything after the two paths is an option
+    if (strcmp(argv[i], "--sample-dir") == 0 && i + 1 < argc) {
+      args->sample_dir = argv[++i];  // consume the value that follows the flag
+    } else if (strcmp(argv[i], "--weights") == 0 && i + 1 < argc) {
+      args->weights_path = argv[++i];
+    } else if (strcmp(argv[i], "--debug-hex") == 0) {
+      args->debug_hex = true;
+    } else if (strcmp(argv[i], "--help") == 0) {
+      return false;  // help is reported through the usage path
+    } else {  // also reached when --sample-dir/--weights is the last argument (no value left)
+      fprintf(stderr, "Error: unknown option or missing value: '%s'\n", argv[i]);
+      return false;
     }
   }
+  return true;
+}
 
-  // Save output
-  bool success;
-  if (args.output_png) {
-    printf("Saving PNG to '%s'...\n", args.output_path);
-    success = save_png(args.output_path, pixels, width, height);
-  } else {
-    printf("Saving PPM to '%s'...\n", args.output_path);
-    success = save_ppm(args.output_path, pixels, width, height);
-  }
-
-  if (success) {
-    printf("Done! Output saved to '%s'\n", args.output_path);
-  }
+// ---------------------------------------------------------------------------
+// Main
+// ---------------------------------------------------------------------------
 
-  // Cleanup
-  for (auto buf : layer_params_buffers)
-    wgpuBufferRelease(buf);
-  wgpuComputePipelineRelease(layer_pipeline);
-  wgpuBindGroupLayoutRelease(layer_bgl);
-  wgpuBindGroupRelease(static_bg);
-  wgpuComputePipelineRelease(static_pipeline);
-  wgpuBufferRelease(static_params_buffer);
-  wgpuTextureViewRelease(static_features_view);
-  wgpuTextureRelease(static_features_tex);
-  wgpuTextureViewRelease(layer_views[0]);
-  wgpuTextureViewRelease(layer_views[1]);
-  wgpuTextureRelease(layer_textures[0]);
-  wgpuTextureRelease(layer_textures[1]);
-  wgpuBufferRelease(weights_buffer);
-  wgpuTextureViewRelease(input_view);
-
-  return success;
-}
+extern void InitShaderComposer();
 
 int main(int argc, char** argv) {
-  // Parse arguments
   Args args;
   if (!parse_args(argc, argv, &args)) {
     print_usage(argv[0]);
     return 1;
   }
 
-  // Initialize shader composer (required for #include resolution)
-  InitShaderComposer();
-
-  // Initialize WebGPU
+  // Init GPU. Overall flow: load PNG -> pad -> pack features -> run CNNv3Effect -> read back -> save.
   WebGPUTestFixture fixture;
   if (!fixture.init()) {
-    fprintf(stderr, "Error: GPU unavailable\n");
+    fprintf(stderr, "Error: WebGPU device unavailable\n");
     return 1;
   }
+  InitShaderComposer();  // needed for shader #include resolution
 
   GpuContext ctx = fixture.ctx();
-  WGPUDevice device = ctx.device;
-  WGPUQueue queue = ctx.queue;
-  WGPUInstance instance = fixture.instance();
 
-  // Load input texture
-  int width, height;
-  WGPUTexture input_texture =
-      load_texture(device, queue, args.input_path, &width, &height);
-  if (!input_texture) {
-    SamplerCache::Get().clear();
-    fixture.shutdown();
-    return 1;
-  }
-
-  printf("Loaded %dx%d image from '%s'\n", width, height, args.input_path);
+  // --- Load input image ---
+  int W, H;
+  std::vector<float> albedo = load_png_rgb(args.input_path, &W, &H);
+  if (albedo.empty()) return 1;  // nothing loaded -> give up
 
-  // Branch based on CNN version
-  if (args.cnn_version == 2) {
-    bool success = process_cnn_v2(device, queue, instance, input_texture, width,
-                                  height, args);
-    wgpuTextureRelease(input_texture);
-    SamplerCache::Get().clear();
-    fixture.shutdown();
-    return success ? 0 : 1;
+  // Pad to multiples of 4 (U-Net requires 2 pooling levels)
+  const int W4 = (W + 3) & ~3;
+  const int H4 = (H + 3) & ~3;
+  if (W4 != W || H4 != H) {
+    printf("Padding %dx%d → %dx%d\n", W, H, W4, H4);
+    std::vector<float> padded(W4 * H4 * 3, 0.0f);  // padding texels stay 0
+    for (int y = 0; y < H; ++y)
+      for (int x = 0; x < W; ++x)
+        for (int c = 0; c < 3; ++c)
+          padded[(y * W4 + x) * 3 + c] = albedo[(y * W + x) * 3 + c];
+    albedo = std::move(padded);
+    W = W4; H = H4;  // W/H denote the padded size from here on
   }
 
-  // CNN v1 processing below
-  printf("Using CNN v1 (render pipeline architecture)\n");
+  printf("Input: %s  (%dx%d)\n", args.input_path, W, H);
 
-  // Create input texture view
-  WGPUTextureView input_view =
-      gpu_create_texture_view_2d(input_texture, WGPUTextureFormat_BGRA8Unorm);
-  WGPUTextureView original_view = input_view; // Keep reference to original
+  // --- Build FeatureImages ---
+  FeatureImages img;
+  img.w = W; img.h = H;
+  img.albedo = std::move(albedo);  // albedo is not read again below, so skip the deep copy
 
-  // Create CNN pipelines (different formats for intermediate vs final)
-  WGPURenderPipeline pipeline_intermediate =
-      create_cnn_pipeline(device, WGPUTextureFormat_RGBA16Float, false);
-  WGPURenderPipeline pipeline_final =
-      create_cnn_pipeline(device, WGPUTextureFormat_BGRA8Unorm, true);
-
-  if (!pipeline_intermediate || !pipeline_final) {
-    fprintf(stderr, "Error: failed to create CNN pipelines\n");
-    if (pipeline_intermediate)
-      wgpuRenderPipelineRelease(pipeline_intermediate);
-    if (pipeline_final)
-      wgpuRenderPipelineRelease(pipeline_final);
-    wgpuTextureViewRelease(input_view);
-    wgpuTextureRelease(input_texture);
-    SamplerCache::Get().clear();
-    fixture.shutdown();
-    return 1;
-  }
-
-  // Get bind group layout from intermediate pipeline (same for both)
-  WGPUBindGroupLayout bgl =
-      wgpuRenderPipelineGetBindGroupLayout(pipeline_intermediate, 0);
-
-  // Create uniform buffers
-  const WGPUBufferDescriptor common_uniform_desc = {
-      .usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst,
-      .size = sizeof(UniformsSequenceParams),
-  };
-  WGPUBuffer common_uniform_buffer =
-      wgpuDeviceCreateBuffer(device, &common_uniform_desc);
-
-  const WGPUBufferDescriptor layer_params_desc = {
-      .usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst,
-      .size = sizeof(CNNv1LayerParams),
-  };
-  WGPUBuffer layer_params_buffer =
-      wgpuDeviceCreateBuffer(device, &layer_params_desc);
-
-  // Create intermediate textures for ping-pong (2 textures)
-  // Use RGBA16Float to preserve [-1,1] range from tanh activation
-  const WGPUTextureDescriptor intermediate_desc = {
-      .usage = WGPUTextureUsage_TextureBinding |
-               WGPUTextureUsage_RenderAttachment | WGPUTextureUsage_CopySrc,
-      .dimension = WGPUTextureDimension_2D,
-      .size = {(uint32_t)(width), (uint32_t)(height), 1},
-      .format = WGPUTextureFormat_RGBA16Float,
-      .mipLevelCount = 1,
-      .sampleCount = 1,
-  };
-
-  WGPUTexture intermediate_textures[2] = {
-      wgpuDeviceCreateTexture(device, &intermediate_desc),
-      wgpuDeviceCreateTexture(device, &intermediate_desc),
-  };
-
-  // Create views for intermediate textures (RGBA16Float)
-  WGPUTextureView intermediate_views[2] = {
-      gpu_create_texture_view_2d(intermediate_textures[0],
-                                 WGPUTextureFormat_RGBA16Float),
-      gpu_create_texture_view_2d(intermediate_textures[1],
-                                 WGPUTextureFormat_RGBA16Float),
-  };
-
-  // Get sampler
-  WGPUSampler sampler =
-      SamplerCache::Get().get_or_create(device, SamplerCache::clamp());
-
-  // Multi-layer processing
-  const int NUM_LAYERS = args.num_layers;
-  int dst_idx = 0; // Index of texture to render to
-
-  // First layer reads from input, subsequent layers read from previous output
-  WGPUTextureView current_input = input_view;
-
-  for (int layer = 0; layer < NUM_LAYERS; ++layer) {
-    printf("Processing layer %d/%d...\n", layer + 1, NUM_LAYERS);
-
-    // Update uniforms
-    UniformsSequenceParams common_u = {
-        .resolution = {(float)(width), (float)(height)},
-        .aspect_ratio = (float)(width) / (float)(height),
-        .time = 0.0f,
-        .beat_time = 0.0f,
-        .beat_phase = 0.0f,
-        .audio_intensity = 0.0f,
-        .noise = 0.0f,
-    };
-    wgpuQueueWriteBuffer(queue, common_uniform_buffer, 0, &common_u,
-                         sizeof(common_u));
-
-    CNNv1LayerParams layer_params = {
-        .layer_index = layer,
-        .blend_amount =
-            (layer == NUM_LAYERS - 1) ? args.blend : 1.0f, // Only final layer
-        ._pad = {0.0f, 0.0f},
+  if (args.sample_dir) {
+    printf("Mode: full  (%s)\n", args.sample_dir);
+    auto path = [&](const char* name) -> std::string {
+      return std::string(args.sample_dir) + "/" + name;
     };
-    wgpuQueueWriteBuffer(queue, layer_params_buffer, 0, &layer_params,
-                         sizeof(layer_params));
-
-    // Build bind group
-    WGPUBindGroup bind_group =
-        BindGroupBuilder()
-            .sampler(0, sampler)
-            .texture(1, current_input)
-            .buffer(2, common_uniform_buffer, sizeof(UniformsSequenceParams))
-            .buffer(3, layer_params_buffer, sizeof(CNNv1LayerParams))
-            .texture(4, original_view)
-            .build(device, bgl);
-
-    // Render to appropriate output texture with correct pipeline
-    bool is_final = (layer == NUM_LAYERS - 1);
-
-    if (is_final) {
-      // Final layer: use OffscreenRenderTarget (known working readback)
-      OffscreenRenderTarget rt(instance, device, width, height);
-      WGPUCommandEncoder encoder =
-          wgpuDeviceCreateCommandEncoder(device, nullptr);
-      WGPURenderPassEncoder pass = begin_render_pass(encoder, rt.view());
-      wgpuRenderPassEncoderSetPipeline(pass, pipeline_final);
-      wgpuRenderPassEncoderSetBindGroup(pass, 0, bind_group, 0, nullptr);
-      wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0);
-      wgpuRenderPassEncoderEnd(pass);
-      WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr);
-      wgpuQueueSubmit(queue, 1, &commands);
-      wgpuDevicePoll(device, true, nullptr);
-
-      wgpuCommandBufferRelease(commands);
-      wgpuRenderPassEncoderRelease(pass);
-      wgpuCommandEncoderRelease(encoder);
-      wgpuBindGroupRelease(bind_group);
+    img.normal = load_png_rg(path("normal.png").c_str(), W, H);  // NOTE(review): W/H may be the padded size here — confirm the loaders cope
+    img.depth  = load_png_depth16(path("depth.png").c_str(), W, H);
+    img.matid  = load_png_gray(path("matid.png").c_str(),  W, H, 0.0f);
+    img.shadow = load_png_gray(path("shadow.png").c_str(), W, H, 1.0f);
+    img.transp = load_png_gray(path("transp.png").c_str(), W, H, 0.0f);
+  } else {
+    printf("Mode: simple (geometry zeroed, normal=(0.5,0.5))\n");
+    img.normal.assign(W * H * 2, 0.5f);
+    img.depth.assign(W * H, 0.0f);
+    img.matid.assign(W * H, 0.0f);
+    img.shadow.assign(W * H, 1.0f);
+    img.transp.assign(W * H, 0.0f);
+  }
 
-      // Read pixels immediately
-      printf("Reading pixels from GPU...\n");
-      std::vector<uint8_t> pixels = rt.read_pixels();
+  // --- Pack features ---
+  std::vector<uint32_t> feat0, feat1;
+  pack_features(img, feat0, feat1);
 
-      // Debug: print first 8 pixels as hex
-      if (args.debug_hex && !pixels.empty()) {
-        printf("First 8 pixels (BGRA hex):\n");
-        for (int i = 0; i < 8 && i < width * height; ++i) {
-          const uint8_t b = pixels[i * 4 + 0];
-          const uint8_t g = pixels[i * 4 + 1];
-          const uint8_t r = pixels[i * 4 + 2];
-          const uint8_t a = pixels[i * 4 + 3];
-          printf("  [%d] 0x%02X%02X%02X%02X (RGBA)\n", i, r, g, b, a);
-        }
-      }
+  // --- Create GPU textures ---
+  WGPUTexture feat0_tex = make_feat_tex(ctx.device, W, H);
+  WGPUTexture feat1_tex = make_feat_tex(ctx.device, W, H);
+  WGPUTexture out_tex   = make_output_tex(ctx.device, W, H);
 
-      if (pixels.empty()) {
-        fprintf(stderr, "Error: GPU readback failed\n");
-        wgpuTextureViewRelease(intermediate_views[0]);
-        wgpuTextureViewRelease(intermediate_views[1]);
-        wgpuTextureRelease(intermediate_textures[0]);
-        wgpuTextureRelease(intermediate_textures[1]);
-        wgpuTextureViewRelease(input_view);
-        wgpuTextureRelease(input_texture);
-        wgpuBufferRelease(layer_params_buffer);
-        wgpuBufferRelease(common_uniform_buffer);
-        wgpuBindGroupLayoutRelease(bgl);
-        wgpuRenderPipelineRelease(pipeline_final);
-        wgpuRenderPipelineRelease(pipeline_intermediate);
-        SamplerCache::Get().clear();
-        fixture.shutdown();
-        return 1;
-      }
+  WGPUTextureView feat0_view = make_view(feat0_tex, WGPUTextureFormat_RGBA32Uint);
+  WGPUTextureView feat1_view = make_view(feat1_tex, WGPUTextureFormat_RGBA32Uint);
+  WGPUTextureView out_view   = make_view(out_tex,   WGPUTextureFormat_RGBA16Float);
 
-      // Save output
-      bool success;
-      if (args.output_png) {
-        printf("Saving PNG to '%s'...\n", args.output_path);
-        success = save_png(args.output_path, pixels, width, height);
-      } else {
-        printf("Saving PPM to '%s'...\n", args.output_path);
-        success = save_ppm(args.output_path, pixels, width, height);
-      }
+  upload_tex(ctx.queue, feat0_tex, feat0.data(), W, H);
+  upload_tex(ctx.queue, feat1_tex, feat1.data(), W, H);
 
-      if (!success) {
-        wgpuTextureViewRelease(intermediate_views[0]);
-        wgpuTextureViewRelease(intermediate_views[1]);
-        wgpuTextureRelease(intermediate_textures[0]);
-        wgpuTextureRelease(intermediate_textures[1]);
-        wgpuTextureViewRelease(input_view);
-        wgpuTextureRelease(input_texture);
-        wgpuBufferRelease(layer_params_buffer);
-        wgpuBufferRelease(common_uniform_buffer);
-        wgpuBindGroupLayoutRelease(bgl);
-        wgpuRenderPipelineRelease(pipeline_final);
-        wgpuRenderPipelineRelease(pipeline_intermediate);
-        SamplerCache::Get().clear();
-        fixture.shutdown();
-        return 1;
-      }
+  // --- Wire CNNv3Effect ---
+  NodeRegistry registry(ctx.device, W, H);
+  registry.set_external_view("feat0", feat0_view);
+  registry.set_external_view("feat1", feat1_view);
+  registry.set_external_view("cnn_out", out_view);
 
-      printf("Done! Output saved to '%s'\n", args.output_path);
-      break; // Exit loop after final layer
-    } else {
-      // Intermediate layers: render to ping-pong textures
-      WGPUTextureView output_view = intermediate_views[dst_idx];
-      WGPUCommandEncoder encoder =
-          wgpuDeviceCreateCommandEncoder(device, nullptr);
-      WGPURenderPassEncoder pass = begin_render_pass(encoder, output_view);
-      wgpuRenderPassEncoderSetPipeline(pass, pipeline_intermediate);
-      wgpuRenderPassEncoderSetBindGroup(pass, 0, bind_group, 0, nullptr);
-      wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0);
-      wgpuRenderPassEncoderEnd(pass);
-      WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr);
-      wgpuQueueSubmit(queue, 1, &commands);
-      wgpuDevicePoll(device, true, nullptr);
+  CNNv3Effect effect(ctx, {"feat0", "feat1"}, {"cnn_out"}, 0.0f, 1000.0f);
+  effect.declare_nodes(registry);
 
-      wgpuCommandBufferRelease(commands);
-      wgpuRenderPassEncoderRelease(pass);
-      wgpuCommandEncoderRelease(encoder);
-      wgpuBindGroupRelease(bind_group);
+  // --- Load weights ---
+  if (args.weights_path) {
+    std::vector<uint32_t> wdata;
+    if (!load_weights_bin(args.weights_path, wdata)) return 1;  // NOTE(review): exits without releasing the textures/views created above
+    effect.upload_weights(ctx.queue, wdata.data(),
+                          (uint32_t)(wdata.size() * 4));
+    printf("Weights: %s  (%zu bytes)\n", args.weights_path, wdata.size() * 4);
+  } else {
+    printf("Weights: default (from assets, zero if absent)\n");
+  }
 
-      // Save intermediate layer if requested
-      if (args.save_intermediates) {
-        char layer_path[512];
-        snprintf(layer_path, sizeof(layer_path), "%s/layer_%d.png",
-                 args.save_intermediates, layer);
-        printf("Saving intermediate layer %d to '%s'...\n", layer, layer_path);
+  // --- Run 5 compute passes ---
+  WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr);
+  UniformsSequenceParams params = {};
+  params.resolution   = {(float)W, (float)H};
+  params.aspect_ratio = (float)W / (float)H;
+  effect.render(enc, params, registry);
 
-        // Readback RGBA16Float texture
-        std::vector<uint8_t> pixels = texture_readback_fp16_to_u8(
-            device, queue, intermediate_textures[dst_idx], width, height);
+  WGPUCommandBuffer cmds = wgpuCommandEncoderFinish(enc, nullptr);
+  wgpuQueueSubmit(ctx.queue, 1, &cmds);
+  wgpuCommandBufferRelease(cmds);
+  wgpuCommandEncoderRelease(enc);
+  wgpuDevicePoll(ctx.device, true, nullptr);  // block until the submitted work is done
 
-        // Debug: print first 8 pixels as hex
-        if (args.debug_hex && !pixels.empty()) {
-          printf("Layer %d first 8 pixels (BGRA hex):\n", layer);
-          for (int i = 0; i < 8 && i < width * height; ++i) {
-            const uint8_t b = pixels[i * 4 + 0];
-            const uint8_t g = pixels[i * 4 + 1];
-            const uint8_t r = pixels[i * 4 + 2];
-            const uint8_t a = pixels[i * 4 + 3];
-            printf("  [%d] 0x%02X%02X%02X%02X (RGBA)\n", i, r, g, b, a);
-          }
-        }
+  // --- Readback + save (4 floats per pixel are indexed below, so validate the size first) ---
+  std::vector<float> pixels = readback_rgba16f(ctx.device, ctx.queue, out_tex, W, H);
 
-        if (!pixels.empty()) {
-          save_png(layer_path, pixels, width, height);
-        } else {
-          fprintf(stderr, "Warning: failed to read intermediate layer %d\n",
-                  layer);
-        }
-      }
-    }
+  if (pixels.size() < (size_t)W * H * 4) { fprintf(stderr, "Error: GPU readback failed\n"); return 1; }
+  if (!save_png(args.output_path, pixels, W, H)) return 1;  // written at the padded WxH; no crop back to the input size
+  printf("Saved: %s\n", args.output_path);
 
-    // Update for next layer: output becomes input
-    if (layer < NUM_LAYERS - 1) {
-      // Use this layer's output as next layer's input
-      current_input = intermediate_views[dst_idx];
-      dst_idx = 1 - dst_idx; // Flip ping-pong for next render
+  if (args.debug_hex) {
+    printf("First 8 output pixels (RGBA f32 → hex):\n");
+    for (int i = 0; i < 8 && i < W * H; ++i) {
+      float r = pixels[i*4  ], g = pixels[i*4+1];
+      float b = pixels[i*4+2], a = pixels[i*4+3];
+      int ri = (int)(r*255+.5f), gi = (int)(g*255+.5f);  // scale to 0..255 with rounding
+      int bi = (int)(b*255+.5f), ai = (int)(a*255+.5f);
+      ri = ri<0?0:ri>255?255:ri; gi = gi<0?0:gi>255?255:gi;  // clamp to one byte
+      bi = bi<0?0:bi>255?255:bi; ai = ai<0?0:ai>255?255:ai;
+      printf("  [%d] 0x%02X%02X%02X%02X  (%.4f %.4f %.4f %.4f)\n",
+             i, ri, gi, bi, ai, r, g, b, a);
     }
   }
 
-  // Wait for all GPU work to complete before cleanup
-  wgpuDevicePoll(device, true, nullptr);
-
   // Cleanup
-  wgpuTextureViewRelease(intermediate_views[0]);
-  wgpuTextureViewRelease(intermediate_views[1]);
-  wgpuTextureRelease(intermediate_textures[0]);
-  wgpuTextureRelease(intermediate_textures[1]);
-  wgpuBufferRelease(layer_params_buffer);
-  wgpuBufferRelease(common_uniform_buffer);
-  wgpuBindGroupLayoutRelease(bgl);
-  wgpuRenderPipelineRelease(pipeline_intermediate);
-  wgpuRenderPipelineRelease(pipeline_final);
-  wgpuTextureViewRelease(input_view);
-  wgpuTextureRelease(input_texture);
-  SamplerCache::Get().clear();
-  fixture.shutdown();
+  wgpuTextureViewRelease(feat0_view);
+  wgpuTextureViewRelease(feat1_view);
+  wgpuTextureViewRelease(out_view);
+  wgpuTextureRelease(feat0_tex);
+  wgpuTextureRelease(feat1_tex);
+  wgpuTextureRelease(out_tex);
 
   return 0;
 }
diff --git a/workspaces/main/weights/cnn_v3_film_mlp.bin b/workspaces/main/weights/cnn_v3_film_mlp.bin
index 53fce42..288a9a8 100644
--- a/workspaces/main/weights/cnn_v3_film_mlp.bin
+++ b/workspaces/main/weights/cnn_v3_film_mlp.bin
diff --git a/workspaces/main/weights/cnn_v3_weights.bin b/workspaces/main/weights/cnn_v3_weights.bin
index a2f7480..f249d27 100644
--- a/workspaces/main/weights/cnn_v3_weights.bin
+++ b/workspaces/main/weights/cnn_v3_weights.bin