Diffstat (limited to 'cnn_v3/docs')
 -rw-r--r--  cnn_v3/docs/CNN_V3.md               |  38
 -rw-r--r--  cnn_v3/docs/HOWTO.md                | 170
 -rw-r--r--  cnn_v3/docs/HOW_TO_CNN.md           | 173
 -rw-r--r--  cnn_v3/docs/cnn_v3_architecture.png | bin 0 -> 254783 bytes
 -rw-r--r--  cnn_v3/docs/gen_architecture_png.py | 238
 5 files changed, 503 insertions(+), 116 deletions(-)
diff --git a/cnn_v3/docs/CNN_V3.md b/cnn_v3/docs/CNN_V3.md index 4d58811..d775e2b 100644 --- a/cnn_v3/docs/CNN_V3.md +++ b/cnn_v3/docs/CNN_V3.md @@ -27,33 +27,7 @@ CNN v3 is a next-generation post-processing effect using: ### Pipeline Overview -``` -G-Buffer (albedo, normal, depth, matID, UV) - │ - ▼ - FiLM Conditioning - (beat_time, audio_intensity, style_params) - │ → γ[], β[] per channel - ▼ - U-Net - ┌─────────────────────────────────────────┐ - │ Encoder │ - │ enc0 (H×W, 4ch) ────────────skip──────┤ - │ ↓ down (avg pool 2×2) │ - │ enc1 (H/2×W/2, 8ch) ────────skip──────┤ - │ ↓ down │ - │ bottleneck (H/4×W/4, 8ch) │ - │ │ - │ Decoder │ - │ ↑ up (nearest ×2) + skip enc1 │ - │ dec1 (H/2×W/2, 4ch) │ - │ ↑ up + skip enc0 │ - │ dec0 (H×W, 4ch) │ - └─────────────────────────────────────────┘ - │ - ▼ - output RGBA (H×W) -``` + FiLM is applied **inside each encoder/decoder block**, after each convolution. @@ -352,11 +326,11 @@ All f16, little-endian, same packing as v2 (`pack2x16float`). |-----------|---------|------|-----------| | enc0: Conv(20→4, 3×3) | 20×4×9=720 | +4 | 724 | | enc1: Conv(4→8, 3×3) | 4×8×9=288 | +8 | 296 | -| bottleneck: Conv(8→8, 1×1) | 8×8×1=64 | +8 | 72 | +| bottleneck: Conv(8→8, 3×3, dil=2) | 8×8×9=576 | +8 | 584 | | dec1: Conv(16→4, 3×3) | 16×4×9=576 | +4 | 580 | | dec0: Conv(8→4, 3×3) | 8×4×9=288 | +4 | 292 | | FiLM MLP (5→16→40) | 5×16+16×40=720 | +16+40 | 776 | -| **Total** | | | **~3.9 KB f16** | +| **Total conv** | | | **~4.84 KB f16** | Skip connections: dec1 input = 8ch (bottleneck) + 8ch (enc1 skip) = 16ch. dec0 input = 4ch (dec1) + 4ch (enc0 skip) = 8ch. 
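The updated weight table above can be sanity-checked with a few lines of arithmetic. A throwaway sketch (helper names hypothetical, not from the repo) reproducing the per-layer f16 counts, the new conv total, and the flat-buffer offsets those sizes imply:

```python
def conv_params(cin, cout, k):
    """f16 count for Conv(cin→cout, k×k) plus per-channel bias.
    Dilation does not add weights, so the new 3×3 dil=2 bottleneck
    costs the same as a plain 3×3."""
    return cin * cout * k * k + cout

sizes = {
    "enc0": conv_params(20, 4, 3),       # 724
    "enc1": conv_params(4, 8, 3),        # 296
    "bottleneck": conv_params(8, 8, 3),  # 584 (was 72 with the old 1×1)
    "dec1": conv_params(16, 4, 3),       # 580
    "dec0": conv_params(8, 4, 3),        # 292
}
total = sum(sizes.values())              # 2476 f16 → 4952 bytes ≈ 4.84 KB

# Running prefix sums give each layer's offset into the flat f16 buffer.
offsets, acc = {}, 0
for name, n in sizes.items():
    offsets[name] = acc
    acc += n
print(total, offsets)
# 2476 {'enc0': 0, 'enc1': 724, 'bottleneck': 1020, 'dec1': 1604, 'dec0': 2184}

# FiLM MLP (5→16→40), stored separately as f32:
film_mlp = 5 * 16 + 16 + 16 * 40 + 40   # 776 values → 3104 bytes
```

The same numbers reappear later in the export section (4952-byte `.bin`) and in the HTML tool's weight-offset constants, so this one computation cross-checks all three tables.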
@@ -541,7 +515,7 @@ class CNNv3(nn.Module): nn.Conv2d(enc_channels[0], enc_channels[1], 3, padding=1), ]) # Bottleneck - self.bottleneck = nn.Conv2d(enc_channels[1], enc_channels[1], 1) + self.bottleneck = nn.Conv2d(enc_channels[1], enc_channels[1], 3, padding=2, dilation=2) # Decoder (skip connections: concat → double channels) self.dec = nn.ModuleList([ nn.Conv2d(enc_channels[1]*2, enc_channels[0], 3, padding=1), @@ -709,7 +683,7 @@ Parity results: Pass 0: pack_gbuffer.wgsl — assemble G-buffer channels into storage texture Pass 1: cnn_v3_enc0.wgsl — encoder level 0 (20→4ch, 3×3) Pass 2: cnn_v3_enc1.wgsl — encoder level 1 (4→8ch, 3×3) + downsample -Pass 3: cnn_v3_bottleneck.wgsl — bottleneck (8→8, 1×1) +Pass 3: cnn_v3_bottleneck.wgsl — bottleneck (8→8, 3×3, dilation=2) Pass 4: cnn_v3_dec1.wgsl — decoder level 1: upsample + skip + (16→4, 3×3) Pass 5: cnn_v3_dec0.wgsl — decoder level 0: upsample + skip + (8→4, 3×3) Pass 6: cnn_v3_output.wgsl — sigmoid + composite to framebuffer @@ -816,7 +790,7 @@ Status bar shows which channels are loaded. | `PACK_SHADER` | `STATIC_SHADER` | 20ch into feat_tex0 + feat_tex1 (rgba32uint each) | | `ENC0_SHADER` | part of `CNN_SHADER` | Conv(20→4, 3×3) + FiLM + ReLU; writes enc0_tex | | `ENC1_SHADER` | | Conv(4→8, 3×3) + FiLM + ReLU + avg_pool2×2; writes enc1_tex (half-res) | -| `BOTTLENECK_SHADER` | | Conv(8→8, 1×1) + FiLM + ReLU; writes bn_tex | +| `BOTTLENECK_SHADER` | | Conv(8→8, 3×3, dilation=2) + ReLU; writes bn_tex | | `DEC1_SHADER` | | nearest upsample×2 + concat(bn, enc1_skip) + Conv(16→4, 3×3) + FiLM + ReLU | | `DEC0_SHADER` | | nearest upsample×2 + concat(dec1, enc0_skip) + Conv(8→4, 3×3) + FiLM + ReLU | | `OUTPUT_SHADER` | | Conv(4→4, 1×1) + sigmoid → composites to canvas | diff --git a/cnn_v3/docs/HOWTO.md b/cnn_v3/docs/HOWTO.md index 5cfc371..9a3efdf 100644 --- a/cnn_v3/docs/HOWTO.md +++ b/cnn_v3/docs/HOWTO.md @@ -233,12 +233,13 @@ channel-dropout training. 
```bash python3 cnn_v3/training/pack_photo_sample.py \ - --photo cnn_v3/training/input/photo1.jpg \ + --photo input/photo1.jpg \ + --target target/photo1_styled.png \ --output dataset/photos/sample_001/ ``` -The output `target.png` defaults to the input photo (no style). Copy in -your stylized version as `target.png` before training. +`--target` is required and must be a stylized ground-truth image at the same +resolution as the photo. The script writes it as `target.png` in the sample dir. ### Dataset layout @@ -285,10 +286,31 @@ python3 train_cnn_v3.py \ --patch-size 32 --detector random ``` +### Single-sample training + +Use `--single-sample <dir>` to train on one specific sample directory. +Implies `--full-image` and `--batch-size 1` automatically. + +```bash +# Pack input/target pair into a sample directory first +python3 pack_photo_sample.py \ + --photo input/photo1.png \ + --target target/photo1_styled.png \ + --output dataset/simple/sample_001/ + +# Train on that sample only +python3 train_cnn_v3.py \ + --single-sample dataset/simple/sample_001/ \ + --epochs 500 +``` + +All other flags (`--epochs`, `--lr`, `--checkpoint-dir`, `--enc-channels`, etc.) work normally. 
+ ### Key flags | Flag | Default | Notes | |------|---------|-------| +| `--single-sample DIR` | — | Train on one sample dir; implies `--full-image`, `--batch-size 1` | | `--input DIR` | `training/dataset` | Root with `full/` or `simple/` subdirs | | `--input-mode` | `simple` | `simple`=photos, `full`=Blender G-buffer | | `--patch-size N` | `64` | Patch crop size | @@ -417,10 +439,10 @@ FiLM γ/β are computed CPU-side by the FiLM MLP (Phase 4) and uploaded each fra |-------|---------|------|-----------| | enc0 | 20×4×9=720 | +4 | 724 | | enc1 | 4×8×9=288 | +8 | 296 | -| bottleneck | 8×8×1=64 | +8 | 72 | +| bottleneck | 8×8×9=576 | +8 | 584 | | dec1 | 16×4×9=576 | +4 | 580 | | dec0 | 8×4×9=288 | +4 | 292 | -| **Total** | | | **2064 f16 = ~4 KB** | +| **Total** | | | **2476 f16 = ~4.84 KB** | **Asset IDs** (registered in `workspaces/main/assets.txt` + `src/effects/shaders.cc`): `SHADER_CNN_V3_COMMON`, `SHADER_CNN_V3_ENC0`, `SHADER_CNN_V3_ENC1`, @@ -587,9 +609,145 @@ Visualization panel still works. --- -## 10. See Also +## 10. Python / WGSL Parity Check (infer_cnn_v3 + cnn_test) + +Two complementary tools for comparing PyTorch inference against the live WGSL +compute shaders on the same input image. + +### 10a. infer_cnn_v3.py — PyTorch reference inference + +**Location:** `cnn_v3/training/infer_cnn_v3.py` + +Runs the trained `CNNv3` model in Python and saves the RGBA output as PNG. 
+ +**Simple mode** (single PNG, geometry zeroed): +```bash +cd cnn_v3/training +python3 infer_cnn_v3.py photo.png out_python.png \ + --checkpoint checkpoints/checkpoint_epoch_200.pth +``` + +**Full mode** (sample directory with all G-buffer files): +```bash +python3 infer_cnn_v3.py dataset/simple/sample_000/ out_python.png \ + --checkpoint checkpoints/checkpoint_epoch_200.pth +``` + +**Identity FiLM** — bypass MLP, use γ=1 β=0 (matches C++ `cnn_test` default): +```bash +python3 infer_cnn_v3.py photo.png out_python.png \ + --checkpoint checkpoints/checkpoint_epoch_200.pth \ + --identity-film +``` + +**Options:** + +| Flag | Default | Description | +|------|---------|-------------| +| `--checkpoint CKPT` | auto-find latest | Path to `.pth` checkpoint | +| `--enc-channels C` | from checkpoint | `4,8` — must match training config | +| `--cond F F F F F` | `0 0 0 0 0` | FiLM conditioning (beat_phase, beat_norm, audio, style0, style1) | +| `--identity-film` | off | Bypass FiLM MLP, use γ=1 β=0 | +| `--blend F` | `1.0` | Blend with albedo: 0=input, 1=CNN | +| `--debug-hex` | off | Print first 8 output pixels as hex | + +In **simple mode**, geometry channels are zeroed: `normal=(0.5,0.5)` (oct-encodes +to ≈(0,0,1)), `depth=0`, `matid=0`, `shadow=1`, `transp=0`. + +The checkpoint `config` dict (saved by `train_cnn_v3.py`) sets `enc_channels` +and `film_cond_dim` automatically; `--enc-channels` is only needed if the +checkpoint lacks a config key. + +--- + +### 10b. cnn_test — WGSL / GPU reference inference + +**Location:** `tools/cnn_test.cc` **Binary:** `build/cnn_test` + +Packs the same 20-channel feature tensor as `infer_cnn_v3.py`, uploads it to +GPU, runs the five `CNNv3Effect` compute passes, and saves the RGBA16Float +output as PNG. 
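Both tools lean on the identity-FiLM convention, so it is worth spelling out what FiLM does per channel and why γ=1, β=0 is an exact no-op. A minimal pure-Python sketch (function name hypothetical):

```python
def film(x, gamma, beta):
    """FiLM conditioning: scale-and-shift each channel,
    y[c] = gamma[c] * x[c] + beta[c]."""
    return [[g * v + b for v in ch] for ch, g, b in zip(x, gamma, beta)]

x = [[0.2, 0.8], [0.5, 0.1]]               # 2 channels × 2 pixels
styled = film(x, [1.5, 0.5], [0.1, 0.0])   # non-trivial conditioning
identity = film(x, [1.0, 1.0], [0.0, 0.0])
assert identity == x  # γ=1, β=0 leaves activations bit-identical
```

This is why `--identity-film` in Python and `cnn_test` (always identity) produce directly comparable outputs: with γ=1, β=0 the FiLM layers drop out of the math entirely, and any remaining delta must come from the conv weights or the shader arithmetic.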
+ +**Build** (requires `DEMO_BUILD_TESTS=ON` or `DEMO_WORKSPACE=main`): +```bash +cmake -B build -DDEMO_BUILD_TESTS=ON && cmake --build build -j4 --target cnn_test +``` + +**Simple mode:** +```bash +./build/cnn_test photo.png out_gpu.png --weights workspaces/main/weights/cnn_v3_weights.bin +``` + +**Full mode** (sample directory): +```bash +./build/cnn_test dataset/simple/sample_000/albedo.png out_gpu.png \ + --sample-dir dataset/simple/sample_000/ \ + --weights workspaces/main/weights/cnn_v3_weights.bin +``` + +**Options:** + +| Flag | Description | +|------|-------------| +| `--sample-dir DIR` | Load all G-buffer files (albedo/normal/depth/matid/shadow/transp) | +| `--weights FILE` | `cnn_v3_weights.bin` (uses asset-embedded weights if omitted) | +| `--debug-hex` | Print first 8 output pixels as hex | +| `--help` | Show usage | + +FiLM is always **identity** (γ=1, β=0) — matching the C++ `CNNv3Effect` default +until GPU-side FiLM MLP evaluation is added. + +--- + +### 10c. Side-by-side comparison + +For a pixel-accurate comparison, use `--identity-film` in Python and `--debug-hex` +in both tools: + +```bash +cd cnn_v3/training + +# 1. Python inference (identity FiLM) +python3 infer_cnn_v3.py photo.png out_python.png \ + --checkpoint checkpoints/checkpoint_epoch_200.pth \ + --identity-film --debug-hex + +# 2. GPU inference (always identity FiLM) +./build/cnn_test photo.png out_gpu.png \ + --weights workspaces/main/weights/cnn_v3_weights.bin \ + --debug-hex +``` + +Both tools print first 8 pixels in the same format: +``` + [0] 0x7F804000 (0.4980 0.5020 0.2510 0.0000) +``` + +**Expected delta:** ≤ 1/255 (≈ 4e-3) per channel, matching the parity test +(`test_cnn_v3_parity`). Larger deltas indicate a weight mismatch — re-export +with `export_cnn_v3_weights.py` and verify the `.bin` size is 4952 bytes. + +--- + +### 10d. 
Feature format note + +Both tools pack features in **training format** ([0,1] oct-encoded normals), +not the runtime `gbuf_pack.wgsl` format (which remaps normals to [-1,1]). +This makes `infer_cnn_v3.py` ↔ `cnn_test` directly comparable. + +The live pipeline (`GBufferEffect → gbuf_pack.wgsl → CNNv3Effect`) uses [-1,1] +normals — that is the intended inference distribution after a full training run +with `--input-mode full` (Blender renders). For training on photos +(`--input-mode simple`), [0,1] normals are correct since channel dropout +teaches the network to handle absent geometry. + +--- + +## 11. See Also - `cnn_v3/docs/CNN_V3.md` — Full architecture design (U-Net, FiLM, feature layout) - `doc/EFFECT_WORKFLOW.md` — General effect integration guide - `cnn_v2/docs/CNN_V2.md` — Reference implementation (simpler, operational) - `src/tests/gpu/test_demo_effects.cc` — GBufferEffect + GBufViewEffect tests +- `src/tests/gpu/test_cnn_v3_parity.cc` — Zero/random weight parity tests +- `cnn_v3/training/export_cnn_v3_weights.py` — Export trained checkpoint → `.bin` diff --git a/cnn_v3/docs/HOW_TO_CNN.md b/cnn_v3/docs/HOW_TO_CNN.md index 4966a61..09db97c 100644 --- a/cnn_v3/docs/HOW_TO_CNN.md +++ b/cnn_v3/docs/HOW_TO_CNN.md @@ -28,26 +28,13 @@ CNN v3 is a 2-level U-Net with FiLM conditioning, designed to run in real-time a **Architecture:** -``` -Input: 20-channel G-buffer feature textures (rgba32uint) - │ - enc0 ──── Conv(20→4, 3×3) + FiLM + ReLU ┐ full res - │ ↘ skip │ - enc1 ──── AvgPool2×2 + Conv(4→8, 3×3) + FiLM ┐ ½ res - │ ↘ skip │ - bottleneck AvgPool2×2 + Conv(8→8, 1×1) + ReLU ¼ res (no FiLM) - │ │ - dec1 ←── upsample×2 + cat(enc1 skip) + Conv(16→4, 3×3) + FiLM - │ │ ½ res - dec0 ←── upsample×2 + cat(enc0 skip) + Conv(8→4, 3×3) + FiLM + sigmoid - full res → RGBA output -``` + **FiLM MLP:** `Linear(5→16) → ReLU → Linear(16→40)` trained jointly with U-Net. 
- Input: `[beat_phase, beat_norm, audio_intensity, style_p0, style_p1]` - Output: 40 γ/β values controlling style across all 4 FiLM layers -**Weight budget:** ~3.9 KB f16 (fits ≤6 KB target) +**Weight budget:** ~4.84 KB f16 conv (fits ≤6 KB target) **Two data paths:** - **Simple mode** — real photos with zeroed geometric channels (normal, depth, matid) @@ -58,13 +45,13 @@ Input: 20-channel G-buffer feature textures (rgba32uint) ``` photos/Blender → pack → dataset/ → train_cnn_v3.py → checkpoint.pth │ - export_cnn_v3_weights.py - ┌─────────┴──────────┐ - cnn_v3_weights.bin cnn_v3_film_mlp.bin - │ - CNNv3Effect::upload_weights() - │ - demo / HTML tool + export_cnn_v3_weights.py [--html] + ┌──────────┴────────────┬──────────────┐ + cnn_v3_weights.bin cnn_v3_film_mlp.bin weights.js + │ (HTML tool + CNNv3Effect::upload_weights() defaults) + │ + demo ``` --- @@ -107,15 +94,6 @@ The network learns the mapping `albedo → target`. If you pass the same image a input and target, the network learns identity (useful as sanity check, not for real training). Confirm `target.png` looks correct before running training. -**Alternative — pack without a target yet:** -```bash -python3 pack_photo_sample.py \ - --photo /path/to/photo.png \ - --output dataset/simple/sample_001/ -# target.png defaults to a copy of the input; replace it before training: -cp my_stylized_version.png dataset/simple/sample_001/target.png -``` - **Batch packing:** ```bash for f in photos/*.png; do @@ -284,55 +262,78 @@ The U-Net conv weights and FiLM MLP train **jointly** in a single run. No separa ### Prerequisites +`train_cnn_v3.py` and `export_cnn_v3_weights.py` carry inline `uv` dependency metadata +(`# /// script`). 
Use `uv run` — no manual `pip install` needed: + ```bash -pip install torch torchvision pillow numpy opencv-python cd cnn_v3/training +uv run train_cnn_v3.py --input dataset/ --epochs 1 --patch-size 32 --detector random ``` -**With `uv` (no pip needed):** dependencies are declared inline in `train_cnn_v3.py` -and installed automatically on first run: +**Without `uv` (manual pip):** ```bash +pip install torch torchvision pillow numpy opencv-python cd cnn_v3/training -uv run train_cnn_v3.py --input dataset/ --epochs 1 --patch-size 32 --detector random +python3 train_cnn_v3.py ... ``` +The pack scripts (`pack_photo_sample.py`, `pack_blender_sample.py`) and +`gen_test_vectors.py` do **not** have uv metadata — run them with `python3` directly +(they only need `numpy`, `pillow`, and optionally `openexr`). + ### Quick-start commands **Smoke test — 1 epoch, validates end-to-end without GPU:** ```bash -python3 train_cnn_v3.py --input dataset/ --epochs 1 \ +uv run train_cnn_v3.py --input dataset/ --epochs 1 \ --patch-size 32 --detector random ``` **Standard photo training (patch-based):** ```bash -python3 train_cnn_v3.py \ +uv run train_cnn_v3.py \ --input dataset/ \ --input-mode simple \ - --epochs 200 + --epochs 200 \ + --edge-loss-weight 0.1 \ + --film-warmup-epochs 50 ``` **Blender G-buffer training:** ```bash -python3 train_cnn_v3.py \ +uv run train_cnn_v3.py \ --input dataset/ \ --input-mode full \ - --epochs 200 + --epochs 200 \ + --edge-loss-weight 0.1 \ + --film-warmup-epochs 50 ``` **Full-image mode (better global coherence, slower):** ```bash -python3 train_cnn_v3.py \ +uv run train_cnn_v3.py \ --input dataset/ \ --input-mode full \ --full-image --image-size 256 \ --epochs 500 ``` +**Single-sample training (overfit on one input/target pair):** +```bash +# Pack first +./gen_sample.sh input/photo.png target/photo_styled.png dataset/simple/sample_001/ + +# Train — --full-image and --batch-size 1 are implied +uv run train_cnn_v3.py \ + --single-sample 
dataset/simple/sample_001/ \ + --epochs 500 +``` + ### Flag reference | Flag | Default | Notes | |------|---------|-------| +| `--single-sample DIR` | — | Train on one sample dir; implies `--full-image`, `--batch-size 1` | | `--input DIR` | `training/dataset` | Dataset root; always set explicitly | | `--input-mode` | `simple` | `simple`=photos, `full`=Blender G-buffer | | `--epochs N` | 200 | 500 recommended for full-image mode | @@ -340,7 +341,8 @@ python3 train_cnn_v3.py \ | `--lr F` | 1e-3 | Reduce to 1e-4 if loss oscillates or NaN | | `--patch-size N` | 64 | Smaller = faster epoch, less spatial context | | `--patches-per-image N` | 256 | Reduce for small datasets | -| `--detector` | `harris` | `random` for smoke tests; `shi-tomasi` as alternative | +| `--detector` | `harris` | `random` for smoke tests; also `shi-tomasi`, `fast`, `gradient` | +| `--patch-search-window N` | 0 | Search ±N px in target to find best alignment (grayscale MSE) per patch; 0=disabled. Use when source and target are not perfectly co-registered (e.g. photo + hand-painted target). Offsets cached at dataset init. 
| | `--channel-dropout-p F` | 0.3 | Lower if all samples have geometry (Blender only) | | `--full-image` | off | Resize full image instead of patch crops | | `--image-size N` | 256 | Resize target; only used with `--full-image` | @@ -348,12 +350,15 @@ python3 train_cnn_v3.py \ | `--film-cond-dim N` | 5 | Must match `CNNv3FiLMParams` field count in C++ | | `--checkpoint-dir DIR` | `checkpoints/` | Set per-experiment | | `--checkpoint-every N` | 50 | 0 to disable intermediate checkpoints | +| `--resume [CKPT]` | — | Resume from checkpoint path; if path missing, uses latest in `--checkpoint-dir` | +| `--edge-loss-weight F` | 0.1 | Sobel gradient loss weight alongside MSE; improves style/edge capture; 0=MSE only | +| `--film-warmup-epochs N` | 50 | Freeze FiLM MLP for first N epochs (phase-1), then unfreeze at lr×0.1; 0=joint training | ### Architecture at startup The model prints its parameter count: ``` -Model: enc=[4, 8] film_cond_dim=5 params=2740 (~5.4 KB f16) +Model: enc=[4, 8] film_cond_dim=5 params=3252 (~6.4 KB f16) ``` If `params` is much higher, `--enc-channels` was changed; update C++ constants accordingly. @@ -454,17 +459,30 @@ The final checkpoint is always written even if `--checkpoint-every 0`. ## 3. Exporting Weights -Converts a trained `.pth` checkpoint to two raw binary files for the C++ runtime. +Converts a trained `.pth` checkpoint to two raw binary files for the C++ runtime, +and optionally updates the HTML tool's embedded defaults. 
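Per the architecture doc, the conv weights are f16, little-endian, packed two per u32 in the same `pack2x16float` layout as v2. A minimal sketch of that bit layout (pure stdlib; the exporter's actual iteration order is defined by the script itself — this only illustrates the packing):

```python
import struct

def pack2x16float(a, b):
    """Two f16 values in one u32, little-endian, first value in the
    low 16 bits — mirroring WGSL pack2x16float/unpack2x16float."""
    lo = struct.unpack('<H', struct.pack('<e', a))[0]  # '<e' = half float
    hi = struct.unpack('<H', struct.pack('<e', b))[0]
    return lo | (hi << 16)

# f16(1.0) = 0x3C00, f16(-2.0) = 0xC000 → packed word 0xC0003C00
assert pack2x16float(1.0, -2.0) == 0xC0003C00
```

On the WGSL side, `unpack2x16float(w).x` recovers the first value and `.y` the second, which is why 2476 f16 values land in 1238 u32 words (4952 bytes).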
+**Standard export (C++ runtime only):** ```bash cd cnn_v3/training -python3 export_cnn_v3_weights.py checkpoints/checkpoint_epoch_200.pth \ +uv run export_cnn_v3_weights.py checkpoints/checkpoint_epoch_200.pth \ --output ../../workspaces/main/weights/ ``` +**Export + update HTML tool defaults (`cnn_v3/tools/weights.js`):** +```bash +uv run export_cnn_v3_weights.py checkpoints/checkpoint_epoch_200.pth \ + --output ../../workspaces/main/weights/ \ + --html +``` + +`--html` base64-encodes both `.bin` files and rewrites `cnn_v3/tools/weights.js` +so the HTML tool loads the new weights as its embedded defaults at startup. +Use `--html-output PATH` to write to a different `weights.js` location. + Output files are registered in `workspaces/main/assets.txt` as: ``` -WEIGHTS_CNN_V3, BINARY, weights/cnn_v3_weights.bin, "CNN v3 conv weights (f16, 3928 bytes)" +WEIGHTS_CNN_V3, BINARY, weights/cnn_v3_weights.bin, "CNN v3 conv weights (f16, 4952 bytes)" WEIGHTS_CNN_V3_FILM_MLP, BINARY, weights/cnn_v3_film_mlp.bin, "CNN v3 FiLM MLP weights (f32, 3104 bytes)" ``` @@ -476,10 +494,10 @@ WEIGHTS_CNN_V3_FILM_MLP, BINARY, weights/cnn_v3_film_mlp.bin, "CNN v3 FiLM MLP w |-------|-----------|-------| | enc0 Conv(20→4,3×3)+bias | 724 | — | | enc1 Conv(4→8,3×3)+bias | 296 | — | -| bottleneck Conv(8→8,1×1)+bias | 72 | — | +| bottleneck Conv(8→8,3×3,dil=2)+bias | 584 | — | | dec1 Conv(16→4,3×3)+bias | 580 | — | | dec0 Conv(8→4,3×3)+bias | 292 | — | -| **Total** | **1964 f16** | **3928 bytes** | +| **Total** | **2476 f16** | **4952 bytes** | **`cnn_v3_film_mlp.bin`** — FiLM MLP weights as raw f32, row-major: @@ -509,8 +527,8 @@ Checkpoint: epoch=200 loss=0.012345 enc_channels=[4, 8] film_cond_dim=5 cnn_v3_weights.bin - 1964 f16 values → 982 u32 → 3928 bytes - Upload via CNNv3Effect::upload_weights(queue, data, 3928) + 2476 f16 values → 1238 u32 → 4952 bytes + Upload via CNNv3Effect::upload_weights(queue, data, 4952) cnn_v3_film_mlp.bin L0: weight (16, 5) + bias (16,) @@ -543,10 +561,12 @@ It 
owns: ``` SEQUENCE 0 0 "Scene with CNN v3" - EFFECT + GBufferEffect prev_cnn -> gbuf_feat0 gbuf_feat1 0 60 - EFFECT + CNNv3Effect gbuf_feat0 gbuf_feat1 -> sink 0 60 + EFFECT + GBufferEffect source -> gbuf_feat0 gbuf_feat1 0 60 + EFFECT + CNNv3Effect gbuf_feat0 gbuf_feat1 -> sink 0 60 ``` +Temporal feedback (`prev_cnn`) is wired automatically by `wire_dag()` — no explicit input needed in the `.seq` file. + Or direct C++: ```cpp #include "cnn_v3/src/cnn_v3_effect.h" @@ -636,8 +656,8 @@ Do not reference them from outside the effect unless debugging. ```bash cmake -B build -DCMAKE_BUILD_TYPE=Release -cmake --build build -j$(nproc) -./build/demo +cmake --build build -j4 +./build/demo64k ``` ### Expected visual output @@ -733,13 +753,14 @@ If results drift after shader edits, verify these invariants match the Python re ## 7. HTML WebGPU Tool -**Location:** `cnn_v3/tools/` — three files, no build step. +**Location:** `cnn_v3/tools/` — four files, no build step. | File | Lines | Contents | |------|-------|----------| -| `index.html` | 147 | HTML + CSS | -| `shaders.js` | 252 | WGSL shader constants, weight-offset constants | -| `tester.js` | 540 | `CNNv3Tester` class, event wiring | +| `index.html` | 168 | HTML + CSS | +| `shaders.js` | 312 | WGSL shader constants, weight-offset constants | +| `tester.js` | 913 | `CNNv3Tester` class, inference pipeline, layer viz | +| `weights.js` | 7 | Embedded default weights (base64); auto-generated by `--html` | ### Usage @@ -750,32 +771,27 @@ python3 -m http.server 8080 # Open: http://localhost:8080/cnn_v3/tools/ ``` -Or on macOS with Chrome: +Weights are **loaded automatically at startup** from `weights.js` (embedded base64). +If the tool is served from the repo root, it also tries to fetch the latest +`workspaces/main/weights/*.bin` over HTTP and uses those if available. +Use the **↺ Reload** button to re-fetch after updating weights on disk. 
+ +To update the embedded defaults after a training run, use `--html` (§3): ```bash -open -a "Google Chrome" --args --allow-file-access-from-files -open cnn_v3/tools/index.html +uv run export_cnn_v3_weights.py checkpoints/checkpoint.pth \ + --output ../../workspaces/main/weights/ --html ``` ### Workflow -1. **Drop `cnn_v3_weights.bin`** onto the left "weights" drop zone. -2. **Drop a PNG or video** onto the centre canvas → CNN runs immediately. -3. _(Optional)_ **Drop `cnn_v3_film_mlp.bin`** → FiLM sliders become active. -4. Adjust **beat_phase / beat_norm / audio_int / style_p0 / style_p1** sliders → reruns on change. -5. Click layer buttons (**Feat · Enc0 · Enc1 · BN · Dec1 · Output**) in the right panel to inspect activations. -6. **Save PNG** to export the current output. +1. **Drop a PNG or video** onto the canvas → CNN runs immediately (weights pre-loaded). +2. Adjust **beat_phase / beat_norm / audio_int / style_p0 / style_p1** sliders. +3. Click layer buttons (**Feat · Enc0 · Enc1 · BN · Dec1 · Output**) to inspect activations. +4. **Save PNG** to export the current output. +5. _(Optional)_ Drop updated `.bin` files onto the left panel to override embedded weights. Keyboard: `[SPACE]` toggle original · `[D]` diff×10. -### Input files - -| File | Format | Notes | -|------|--------|-------| -| `cnn_v3_weights.bin` | raw u32 (no header) | 982 u32 = 1964 f16 = ~3.9 KB | -| `cnn_v3_film_mlp.bin` | raw f32 | 776 f32 = 3.1 KB; optional — identity FiLM used if absent | - -Both produced by `export_cnn_v3_weights.py` (§3). - ### Texture chain | Texture | Format | Size | @@ -801,7 +817,7 @@ all geometric channels (normal, depth, depth_grad, mat_id, prev) = 0. ### Pitfalls - `rgba32uint` and `rgba16float` textures both need `STORAGE_BINDING | TEXTURE_BINDING` usage. -- Weight offsets are **f16 indices** (enc0=0, enc1=724, bn=1020, dec1=1092, dec0=1672). +- Weight offsets are **f16 indices** (enc0=0, enc1=724, bn=1020, dec1=1604, dec0=2184). 
- Uniform buffer layouts must match WGSL `Params` structs exactly (padding included). --- @@ -816,7 +832,7 @@ all geometric channels (normal, depth, depth_grad, mat_id, prev) = 0. | `cnn_v3/training/pack_photo_sample.py` | Photo → zeroed-geometry sample directory | | `cnn_v3/training/cnn_v3_utils.py` | Dataset class, feature assembly, channel dropout, salient-point detection | | `cnn_v3/training/train_cnn_v3.py` | CNNv3 model definition, training loop, CLI | -| `cnn_v3/training/export_cnn_v3_weights.py` | Checkpoint → `cnn_v3_weights.bin` + `cnn_v3_film_mlp.bin` | +| `cnn_v3/training/export_cnn_v3_weights.py` | Checkpoint → `cnn_v3_weights.bin` + `cnn_v3_film_mlp.bin`; `--html` rewrites `weights.js` | | `cnn_v3/training/gen_test_vectors.py` | NumPy reference forward pass + C header generator | | `cnn_v3/test_vectors.h` | Compiled-in test vectors (auto-generated, do not edit) | | `cnn_v3/src/cnn_v3_effect.h` | C++ class, Params structs, `CNNv3FiLMParams` API | @@ -827,6 +843,7 @@ all geometric channels (normal, depth, depth_grad, mat_id, prev) = 0. 
| `cnn_v3/tools/index.html` | HTML tool — UI shell + CSS | | `cnn_v3/tools/shaders.js` | HTML tool — inline WGSL shaders + weight-offset constants | | `cnn_v3/tools/tester.js` | HTML tool — CNNv3Tester class, inference pipeline, layer viz | +| `cnn_v3/tools/weights.js` | HTML tool — embedded default weights (base64, auto-generated) | | `cnn_v2/tools/cnn_v2_test/index.html` | HTML tool reference pattern (v2) | --- diff --git a/cnn_v3/docs/cnn_v3_architecture.png b/cnn_v3/docs/cnn_v3_architecture.png Binary files differnew file mode 100644 index 0000000..2116c2b --- /dev/null +++ b/cnn_v3/docs/cnn_v3_architecture.png diff --git a/cnn_v3/docs/gen_architecture_png.py b/cnn_v3/docs/gen_architecture_png.py new file mode 100644 index 0000000..bd60a97 --- /dev/null +++ b/cnn_v3/docs/gen_architecture_png.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python3 +# /// script +# requires-python = ">=3.10" +# dependencies = ["matplotlib"] +# /// +"""Generate CNN v3 U-Net + FiLM architecture diagram → cnn_v3_architecture.png""" + +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import matplotlib.patches as mpatches +from matplotlib.patches import FancyBboxPatch +from matplotlib.path import Path +import matplotlib.patheffects as pe + +# --------------------------------------------------------------------------- +# Canvas +# --------------------------------------------------------------------------- +BG = '#0F172A' +fig = plt.figure(figsize=(17, 10), facecolor=BG) +ax = fig.add_axes([0, 0, 1, 1], facecolor=BG) +ax.set_xlim(0, 17) +ax.set_ylim(0, 10) +ax.axis('off') + +# --------------------------------------------------------------------------- +# Palette +# --------------------------------------------------------------------------- +C_ENC = '#3B82F6' # encoder — blue +C_BN = '#8B5CF6' # bottleneck — violet +C_DEC = '#10B981' # decoder — emerald +C_MLP = '#EC4899' # FiLM MLP — pink +C_FILM = '#F59E0B' # FiLM γ/β arrows — amber +C_IO = '#475569' # input/output — 
slate +C_SKP = '#F97316' # skip connections — orange +C_ARR = '#94A3B8' # main flow arrows — cool-grey +C_TXT = '#F1F5F9' # text — near-white +C_DIM = '#64748B' # dim labels — slate + +# --------------------------------------------------------------------------- +# Geometry — two-column U layout +# --------------------------------------------------------------------------- +EX, DX = 3.8, 13.2 # encoder / decoder centre-x +BX = 8.5 # bottleneck centre-x + +BW = 4.6 # block width (enc / dec) +BH = 0.95 # block height (enc / dec) +BW_BN = 5.4 # bottleneck wider +BH_BN = 0.95 +BH_IO = 0.72 + +# y positions (top = high number) +Y_IN = 8.90 +Y_E0 = 7.50 # enc0 full res +Y_E1 = 5.80 # enc1 ½ res +Y_BN = 3.20 # bottleneck ¼ res +Y_D1 = 5.80 # dec1 ½ res +Y_D0 = 7.50 # dec0 full res +Y_OUT = 8.90 + +Y_MLP = 1.25 # FiLM MLP + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def box(cx, cy, w, h, color, line1, line2='', lfs=9.5, sfs=8.0, alpha=0.92): + r = FancyBboxPatch((cx - w/2, cy - h/2), w, h, + boxstyle='round,pad=0.10', + fc=color, ec='white', lw=1.3, alpha=alpha, zorder=3) + ax.add_patch(r) + dy = 0.18 if line2 else 0 + ax.text(cx, cy + dy, line1, ha='center', va='center', + fontsize=lfs, fontweight='bold', color='white', zorder=4, + fontfamily='DejaVu Sans Mono') + if line2: + ax.text(cx, cy - 0.18, line2, ha='center', va='center', + fontsize=sfs, color='white', alpha=0.80, zorder=4) + + +def arrow(x0, y0, x1, y1, color=C_ARR, lw=1.8, dashed=False, + rad=0.0, label='', lx=None, ly=None): + ls = (0, (5, 3)) if dashed else 'solid' + cs = f'arc3,rad={rad}' if rad else 'arc3,rad=0' + ax.annotate('', xy=(x1, y1), xytext=(x0, y0), + arrowprops=dict(arrowstyle='->', color=color, lw=lw, + linestyle=ls, mutation_scale=13, + connectionstyle=cs), + zorder=2) + if label: + ax.text(lx if lx else (x0+x1)/2, + ly if ly else (y0+y1)/2, + label, ha='center', 
va='center', fontsize=7.5, + color=color, zorder=5, + bbox=dict(fc=BG, ec='none', alpha=0.75, + boxstyle='round,pad=0.15')) + + +def dim_label(x, y, txt): + ax.text(x, y, txt, ha='center', va='center', + fontsize=8.5, color=C_DIM, style='italic') + + +# --------------------------------------------------------------------------- +# Blocks +# --------------------------------------------------------------------------- + +box(EX, Y_IN, BW, BH_IO, C_IO, 'G-Buffer Features', + '20 channels · full res') + +box(EX, Y_E0, BW, BH, C_ENC, 'enc0 Conv(20→4, 3×3) + FiLM + ReLU', + 'full res · 4 ch') + +box(EX, Y_E1, BW, BH, C_ENC, 'enc1 Conv(4→8, 3×3) + FiLM + ReLU', + '½ res · 8 ch · (AvgPool↓ on input)') + +box(BX, Y_BN, BW_BN, BH_BN, C_BN, + 'bottleneck Conv(8→8, 3×3, dilation=2) + ReLU', + '¼ res · 8 ch · no FiLM · effective RF ≈ 10 px @ ½res') + +box(DX, Y_D1, BW, BH, C_DEC, 'dec1 Conv(16→4, 3×3) + FiLM + ReLU', + '½ res · 4 ch · (upsample↑ + cat enc1 skip)') + +box(DX, Y_D0, BW, BH, C_DEC, 'dec0 Conv(8→4, 3×3) + FiLM + sigmoid', + 'full res · 4 ch · (upsample↑ + cat enc0 skip)') + +box(DX, Y_OUT, BW, BH_IO, C_IO, 'RGBA Output', + '4 channels · full res') + +box(BX, Y_MLP, 9.2, 1.10, C_MLP, + 'FiLM MLP Linear(5→16) → ReLU → Linear(16→40)', + 'in: beat_phase · beat_norm · audio_intensity · style_p0 · style_p1' + ' → γ/β (×2) for enc0(4) enc1(8) dec1(4) dec0(4) = 40 values', + sfs=7.5) + +# --------------------------------------------------------------------------- +# Main-flow arrows +# --------------------------------------------------------------------------- + +# Input → enc0 +arrow(EX, Y_IN - BH_IO/2 - .04, EX, Y_E0 + BH/2 + .04) + +# enc0 → enc1 (AvgPool label beside) +arrow(EX, Y_E0 - BH/2 - .04, EX, Y_E1 + BH/2 + .04, + label='AvgPool\n 2×2', lx=EX + 0.72, ly=(Y_E0 + Y_E1)/2) + +# enc1 → bottleneck (curve down-right) +arrow(EX, Y_E1 - BH/2 - .04, + BX - BW_BN/2 - .04, Y_BN, + rad=-0.28, + label='AvgPool\n 2×2', lx=(EX + BX)/2 - 0.5, ly=Y_BN + 0.90) + +# bottleneck → 
dec1 (curve right-up) +arrow(BX + BW_BN/2 + .04, Y_BN, + DX, Y_D1 - BH/2 - .04, + rad=-0.28, + label='upsample\n 2×', lx=(BX + DX)/2 + 0.5, ly=Y_D1 - 0.90) + +# dec1 → dec0 +arrow(DX, Y_D1 + BH/2 + .04, DX, Y_D0 - BH/2 - .04, + label='upsample\n 2×', lx=DX - 0.72, ly=(Y_D1 + Y_D0)/2) + +# dec0 → output +arrow(DX, Y_D0 + BH/2 + .04, DX, Y_OUT - BH_IO/2 - .04) + +# --------------------------------------------------------------------------- +# Skip connections +# --------------------------------------------------------------------------- + +# enc0 skip → dec0 +arrow(EX + BW/2 + .04, Y_E0, + DX - BW/2 - .04, Y_D0, + color=C_SKP, lw=1.6, dashed=True, + label='skip enc0 (4 ch)', ly=Y_E0 + 0.40) + +# enc1 skip → dec1 +arrow(EX + BW/2 + .04, Y_E1, + DX - BW/2 - .04, Y_D1, + color=C_SKP, lw=1.6, dashed=True, + label='skip enc1 (8 ch)', ly=Y_E1 + 0.40) + +# --------------------------------------------------------------------------- +# FiLM γ/β arrows (MLP → each FiLM layer) +# --------------------------------------------------------------------------- +film_targets = [ + (EX, Y_E0 - BH/2 - .04), # enc0 bottom + (EX, Y_E1 - BH/2 - .04), # enc1 bottom + (DX, Y_D1 - BH/2 - .04), # dec1 bottom + (DX, Y_D0 - BH/2 - .04), # dec0 bottom +] +for tx, ty in film_targets: + ax.annotate('', xy=(tx, ty), + xytext=(BX + (tx - BX) * 0.05, Y_MLP + 0.55 + .04), + arrowprops=dict(arrowstyle='->', color=C_FILM, lw=1.2, + linestyle=(0, (3, 3)), mutation_scale=10, + connectionstyle='arc3,rad=0.18'), + zorder=2) + +ax.text(8.5, 4.30, 'γ / β', ha='center', va='center', + fontsize=9, color=C_FILM, alpha=0.85, style='italic', zorder=5) + +# --------------------------------------------------------------------------- +# Resolution markers (left margin) +# --------------------------------------------------------------------------- +for y, lbl in [(Y_E0, 'full res'), (Y_E1, '½ res'), (Y_BN, '¼ res')]: + dim_label(0.62, y, lbl) + ax.plot([0.95, 1.10], [y, y], color=C_DIM, lw=0.8, zorder=1) + +# 
--------------------------------------------------------------------------- +# Legend +# --------------------------------------------------------------------------- +legend_items = [ + mpatches.Patch(fc=C_ENC, ec='white', lw=0.8, label='Encoder'), + mpatches.Patch(fc=C_BN, ec='white', lw=0.8, label='Bottleneck'), + mpatches.Patch(fc=C_DEC, ec='white', lw=0.8, label='Decoder'), + mpatches.Patch(fc=C_MLP, ec='white', lw=0.8, label='FiLM MLP'), + mpatches.Patch(fc=C_IO, ec='white', lw=0.8, label='I/O'), + plt.Line2D([0], [0], color=C_SKP, lw=1.6, ls='--', label='Skip connection'), + plt.Line2D([0], [0], color=C_FILM, lw=1.2, ls=(0, (3,3)), label='FiLM γ/β'), +] +leg = ax.legend(handles=legend_items, loc='lower right', + bbox_to_anchor=(0.99, 0.01), + framealpha=0.15, facecolor=BG, edgecolor=C_DIM, + fontsize=8, labelcolor=C_TXT, ncol=1) + +# --------------------------------------------------------------------------- +# Title +# --------------------------------------------------------------------------- +ax.text(8.5, 9.68, 'CNN v3 — U-Net + FiLM Architecture', + ha='center', va='center', fontsize=14, fontweight='bold', color=C_TXT) + +# --------------------------------------------------------------------------- +# Save +# --------------------------------------------------------------------------- +import pathlib +out = pathlib.Path(__file__).parent / 'cnn_v3_architecture.png' +fig.savefig(out, dpi=180, bbox_inches='tight', facecolor=BG, edgecolor='none') +print(f'Saved → {out} ({out.stat().st_size // 1024} KB)') |
