summary refs log tree commit diff
path: root/cnn_v3
diff options
context:
space:
mode:
Diffstat (limited to 'cnn_v3')
-rw-r--r--  cnn_v3/docs/CNN_V3.md             |  20
-rw-r--r--  cnn_v3/docs/GBUF_DIF_MIGRATION.md | 136
-rw-r--r--  cnn_v3/docs/HOWTO.md              |  61
-rw-r--r--  cnn_v3/docs/HOW_TO_CNN.md         |  14
-rw-r--r--  cnn_v3/shaders/gbuf_deferred.wgsl |  48
-rw-r--r--  cnn_v3/shaders/gbuf_pack.wgsl     |  21
-rw-r--r--  cnn_v3/shaders/gbuf_raster.wgsl   |  54
-rw-r--r--  cnn_v3/shaders/gbuf_shadow.wgsl   |  55
-rw-r--r--  cnn_v3/shaders/gbuf_view.wgsl     |  34
-rw-r--r--  cnn_v3/src/gbuf_deferred_effect.cc | 140
-rw-r--r--  cnn_v3/src/gbuf_deferred_effect.h |  23
-rw-r--r--  cnn_v3/src/gbuf_view_effect.cc    |   4
-rw-r--r--  cnn_v3/src/gbuffer_effect.cc      | 227
-rw-r--r--  cnn_v3/src/gbuffer_effect.h       |  21
-rw-r--r--  cnn_v3/tools/shaders.js           |   7
-rw-r--r--  cnn_v3/training/cnn_v3_utils.py   |  54
16 files changed, 703 insertions, 216 deletions
diff --git a/cnn_v3/docs/CNN_V3.md b/cnn_v3/docs/CNN_V3.md
index 3f8f7db..4d58811 100644
--- a/cnn_v3/docs/CNN_V3.md
+++ b/cnn_v3/docs/CNN_V3.md
@@ -156,7 +156,7 @@ Depth gradient captures surface discontinuities and orientation cues for the CNN
|-----|--------|--------|--------|--------|
| [0] | mat_id | prev.r | prev.g | prev.b |
| [1] | mip1.r | mip1.g | mip1.b | mip2.r |
-| [2] | mip2.g | mip2.b | shadow | transp. |
+| [2] | mip2.g | mip2.b | dif | transp. |
| [3] | — spare — | | | |
All packed via `pack4x8unorm`. Channels:
@@ -164,7 +164,7 @@ All packed via `pack4x8unorm`. Channels:
- **prev.rgb**: previous CNN output (temporal feedback, recurrent)
- **mip1.rgb**: albedo at MIP 1 (½ resolution) — medium-frequency color context
- **mip2.rgb**: albedo at MIP 2 (¼ resolution) — low-frequency color context
-- **shadow**: shadow intensity [0=fully shadowed, 1=fully lit] from shadow pass
+- **dif**: pre-multiplied occluded diffuse = `max(0, dot(normal, KEY_LIGHT)) * shadow` [0=dark, 1=fully lit]
- **transp.**: volumetric transparency [0=opaque, 1=transparent] for fog/smoke/volumetric light
**Texture 1 is fully packed. u32[3] is reserved for future use.**
@@ -188,6 +188,8 @@ fn pack_features(@builtin(global_invocation_id) id: vec3u) {
let transp = textureLoad(gbuf_transp, coord, 0).r;
let mat_id = unpack_mat_id(nm); // u8 from rg16float packing
let normal = unpack_oct_normal(nm.rg); // vec2f
+ let nor3 = oct_decode(normal); // vec3f unit normal
+ let dif = max(0.0, dot(nor3, KEY_LIGHT)) * shadow; // ch18
let mip1 = textureSampleLevel(gbuf_albedo, smplr, uv, 1.0).rgb;
let mip2 = textureSampleLevel(gbuf_albedo, smplr, uv, 2.0).rgb;
@@ -202,7 +204,7 @@ fn pack_features(@builtin(global_invocation_id) id: vec3u) {
textureStore(feat_tex1, coord, vec4u(
pack4x8unorm(vec4(mat_id, prev.r, prev.g, prev.b)),
pack4x8unorm(vec4(mip1.r, mip1.g, mip1.b, mip2.r)),
- pack4x8unorm(vec4(mip2.g, mip2.b, shadow, transp)),
+ pack4x8unorm(vec4(mip2.g, mip2.b, dif, transp)),
0u,
));
}
@@ -232,7 +234,7 @@ fn pack_features(@builtin(global_invocation_id) id: vec3u) {
| 15 | mip2.r | u8 | Albedo MIP 2 (¼ res) |
| 16 | mip2.g | u8 | |
| 17 | mip2.b | u8 | |
-| 18 | shadow | u8 | Shadow intensity [0=dark, 1=lit] |
+| 18 | dif | u8 | max(0,dot(normal,KEY_LIGHT))×shadow [0=dark, 1=lit] |
| 19 | transp. | u8 | Volumetric transparency [0=opaque, 1=clear] |
UV computed in-shader. Bias = 1.0 implicit (standard NN, not stored).
@@ -244,7 +246,7 @@ Plus prev_cnn texture (RGBA8): **8 MB**.
### 16-byte fallback (budget-constrained)
-Drop temporal, MIPs, shadow, transparency. Geometric data only:
+Drop temporal, MIPs, dif, transparency. Geometric data only:
| u32 | channels |
|-----|----------|
@@ -436,7 +438,7 @@ Missing channels are **zero-filled** — the network degrades gracefully due to
| prev.rgb | **0, 0, 0** (no history) |
| mip1.rgb | Computed from photo (pyrDown ×1) |
| mip2.rgb | Computed from photo (pyrDown ×2) |
-| shadow | **1.0** (assume fully lit) |
+| dif | **1.0** (assume fully lit; no normal/shadow data) |
| transp. | **1 − alpha** (from photo alpha channel, or 0 if no alpha) |
mip1/mip2 are still meaningful (they come from albedo, which we have).
@@ -464,7 +466,7 @@ Applied per-sample during dataloader `__getitem__`:
```python
GEOMETRIC_CHANNELS = [3, 4, 5, 6, 7] # normal.xy, depth, depth_grad.xy
-CONTEXT_CHANNELS = [8, 18, 19] # mat_id, shadow, transp
+CONTEXT_CHANNELS = [8, 18, 19] # mat_id, dif, transp
TEMPORAL_CHANNELS = [9, 10, 11] # prev.rgb
def apply_channel_dropout(feat, p_geom=0.3, p_context=0.2, p_temporal=0.5):
@@ -834,7 +836,7 @@ FiLM γ/β computed JS-side from sliders (tiny MLP forward pass in JS), uploaded
| `bn_tex` | W/2×H/2 | rgba32uint | 8 channels f16 (bottleneck output) |
| `dec1_tex` | W×H | rgba32uint | 4 channels f16 (dec1 output) |
| `dec0_tex` | W×H | rgba32uint | 4 channels f16 (dec0 output) |
-| `prev_tex` | W×H | rgba8unorm | previous CNN output (temporal) |
+| `prev_tex` | W×H | rgba16float | previous CNN output (temporal, `F16X8`) |
Skip connections: enc0_tex and enc1_tex are **kept alive** across the full forward pass
(not ping-ponged away). DEC1 and DEC0 read them directly.
@@ -977,7 +979,7 @@ Reuse from existing shaders:
- [ ] `cmake/DemoSourceLists.cmake` — add `cnn_v3_effect.cc` to COMMON_GPU_EFFECTS
- [ ] `src/gpu/demo_effects.h` — add `#include "effects/cnn_v3_effect.h"`
-- [ ] `workspaces/main/timeline.seq` — add `EFFECT + CNNv3Effect`
+- [x] `workspaces/main/timeline.seq` — add `EFFECT + CNNv3Effect` (done: cnn_v3_debug sequence)
---
diff --git a/cnn_v3/docs/GBUF_DIF_MIGRATION.md b/cnn_v3/docs/GBUF_DIF_MIGRATION.md
new file mode 100644
index 0000000..37dde0f
--- /dev/null
+++ b/cnn_v3/docs/GBUF_DIF_MIGRATION.md
@@ -0,0 +1,136 @@
+// cnn_v3/docs/GBUF_DIF_MIGRATION.md
+// Plan: replace G-buffer shadow channel with dif (diffuse × shadow)
+// Status: IN PROGRESS — Step 1 (WGSL) complete; Steps 2–5 pending
+
+# G-Buffer `shadow` → `dif` Migration Plan
+
+## Motivation
+
+The raw `shadow` channel (ch18) is less informative than `dif = max(0, dot(normal, light_dir)) * shadow`
+because `shadow` alone ignores the diffuse Lambert term. The CNN learns better when it receives
+the pre-multiplied occluded diffuse signal directly. `albedo` is already in ch0–2, so the CNN
+can reconstruct the full shaded color as `albedo * (ambient + dif)`.
+
+## Design Decision
+
+**Replace ch18 `shadow` with `dif` in-place. Channel count stays 20.**
+
+- `dif` is a scalar: `max(0, dot(normal, KEY_LIGHT)) * shadow`
+- KEY_LIGHT = normalize(1, 2, 1) = (0.408, 0.816, 0.408) — matches `gbuf_deferred.wgsl`
+- Stored at the same position (t1.z byte 2) → no weight shape change
+- `transp` stays at ch19 (t1.z byte 3)
+- t1.w reverts to 0 (spare)
+
+### Feature layout (20 channels, unchanged count)
+
+| ch | name | type | range | source |
+|----|----------|--------|----------|----------------|
+| 0 | alb.r | f16 | [0,1] | feat_tex0.x lo |
+| 1 | alb.g | f16 | [0,1] | feat_tex0.x hi |
+| 2 | alb.b | f16 | [0,1] | feat_tex0.y lo |
+| 3 | nrm.x | f16 | [-1,1] | feat_tex0.y hi |
+| 4 | nrm.y | f16 | [-1,1] | feat_tex0.z lo |
+| 5 | depth | f16 | [0,1] | feat_tex0.z hi |
+| 6 | dzdx | f16 | (signed) | feat_tex0.w lo |
+| 7 | dzdy | f16 | (signed) | feat_tex0.w hi |
+| 8 | mat_id | u8 | [0,1] | feat_tex1.x[0] |
+| 9 | prev.r | u8 | [0,1] | feat_tex1.x[1] |
+| 10 | prev.g | u8 | [0,1] | feat_tex1.x[2] |
+| 11 | prev.b | u8 | [0,1] | feat_tex1.x[3] |
+| 12 | mip1.r | u8 | [0,1] | feat_tex1.y[0] |
+| 13 | mip1.g | u8 | [0,1] | feat_tex1.y[1] |
+| 14 | mip1.b | u8 | [0,1] | feat_tex1.y[2] |
+| 15 | mip2.r | u8 | [0,1] | feat_tex1.y[3] |
+| 16 | mip2.g | u8 | [0,1] | feat_tex1.z[0] |
+| 17 | mip2.b | u8 | [0,1] | feat_tex1.z[1] |
+| 18 | **dif** | u8 | [0,1] | feat_tex1.z[2] ← was shadow |
+| 19 | transp | u8 | [0,1] | feat_tex1.z[3] |
+
+---
+
+## Current State (intermediate — needs fixing)
+
+The commit tagged `wip(cnn_v3): shadow→dif intermediate` contains partial work.
+The WGSL changes are **incorrect** — `dif` is redundantly stored in t1.w (3×) and
+`shadow` was dropped from t1.z without putting `dif` in its place.
+
+### What is wrong
+
+| File | Problem |
+|---|---|
+| `gbuf_pack.wgsl` | t1.z = `mip2.g\|mip2.b\|transp\|spare` (shadow removed, dif not put there); t1.w = `dif\|dif\|dif\|spare` (redundant) |
+| `gbuf_deferred.wgsl` | reads `dif` from `t1.w.x` — should be `t1.z.z` |
+| `gbuf_view.wgsl` | expanded to 4×6 grid with ch20–22 as dif.rgb — should stay 4×5, ch18=dif |
+
+---
+
+## Implementation Checklist
+
+### Step 1 — Fix WGSL (correct the in-place swap) ✅
+
+- [x] `cnn_v3/shaders/gbuf_pack.wgsl`
+ - t1.z: `pack4x8unorm(vec4f(mip2.g, mip2.b, dif, transp))` ← dif at byte 2
+ - t1.w: `0u` ← revert to spare
+ - Remove comment line about t1.w dif
+
+- [x] `cnn_v3/shaders/gbuf_deferred.wgsl`
+ - Read: `let dif = unpack4x8unorm(t1.z).z;` ← from t1.z byte 2
+
+- [x] `cnn_v3/shaders/gbuf_view.wgsl`
+ - Revert to 4×5 grid (ROWS = 5.0)
+ - Guard: `ch >= 20u`
+ - ch18 label: `dif` (4 chars: 0x64696600)
+ - ch19 label: `trns` (unchanged)
+ - Remove row-5 cases (20u, 21u, default→dif.b)
+ - Revert `else if (comp_idx == 2u)` → `else` (drop t1.w branch)
+ - Update header comment
+
+- [x] `cnn_v3/shaders/cnn_v3_enc0.wgsl`
+ - Verify `load_feat()`: g = unpack4x8unorm(t1.z) → g.z = ch18 = dif ✓ (no change needed)
+
+### Step 2 — Python training ✅
+
+- [x] `cnn_v3/training/cnn_v3_utils.py`
+ - Added `oct_decode()` helper and `_KEY_LIGHT` constant
+ - `assemble_features()`: ch18 = `dif` computed on-the-fly
+ - Replace `shadow[..., None]` with `dif[..., None]` at index 18
+ - `CONTEXT_CHANNELS = [8, 18, 19]` — same indices, updated comment
+
+- [ ] `cnn_v3/training/pack_blender_sample.py`
+ - Optional: save `dif.png` (precomputed) alongside existing passes
+ - Not strictly required if utils.py computes on-the-fly
+
+### Step 3 — Web tool ✅
+
+- [x] `cnn_v3/tools/shaders.js` (FULL_PACK_SHADER)
+ - Add `oct_decode` inline (or inline the math)
+ - Compute `let dif = max(0., dot(oct_decode(nrm), vec3f(0.408, 0.816, 0.408))) * shd`
+ - Pack: t1.z = `pack4x8unorm(vec4f(m2.g, m2.b, dif, trp))`
+ - t1.w = `0u`
+
+### Step 4 — Test vectors
+
+- [ ] Re-run `cnn_v3/training/gen_test_vectors.py` to regenerate `test_vectors.h`
+ - ch18 value changes (dif ≠ shadow in general); old vectors are invalid
+ - Parity threshold (4.88e-4) should be unchanged
+
+### Step 5 — Docs ✅
+
+- [x] `cnn_v3/docs/CNN_V3.md` — feature table, pack pseudo-code, simple-mode defaults, CONTEXT_CHANNELS comment
+- [x] `cnn_v3/docs/HOWTO.md` — outputs description, channel table, dropout comment, FULL_PACK_SHADER description
+- [x] This file: all steps marked complete
+
+---
+
+## Architecture Impact
+
+| Dimension | Before | After |
+|---|---|---|
+| Channel count | 20 | 20 ✅ |
+| Weight shapes | Conv(20→4, ...) | Conv(20→4, ...) ✅ |
+| Total f16 weights | 1964 | 1964 ✅ |
+| Training data regen | — | Not required ✅ |
+| Parity test vectors | Valid | Must regenerate ❌ |
+| Existing trained weights | Valid | Invalidated (ch18 distribution changes) ❌ |
+
+No real training pass has occurred yet, so weight invalidation is not a concern.
diff --git a/cnn_v3/docs/HOWTO.md b/cnn_v3/docs/HOWTO.md
index 5c5cc2a..5cfc371 100644
--- a/cnn_v3/docs/HOWTO.md
+++ b/cnn_v3/docs/HOWTO.md
@@ -79,7 +79,7 @@ Each frame, `GBufferEffect::render()` executes:
3. **Pass 3 — Transparency** — TODO (deferred; transp=0 for opaque scenes)
4. **Pass 4 — Pack compute** (`gbuf_pack.wgsl`) ✅
- - Reads all G-buffer textures + `prev_cnn` input
+ - Reads all G-buffer textures + persistent `prev_cnn` texture
- Writes `feat_tex0` + `feat_tex1` (rgba32uint, 20 channels, 32 bytes/pixel)
- Shadow / transp nodes cleared to 1.0 / 0.0 via zero-draw render passes
until Pass 2/3 are implemented.
@@ -90,9 +90,38 @@ Outputs are named from the `outputs` vector passed to the constructor:
```
outputs[0] → feat_tex0 (rgba32uint: albedo.rgb, normal.xy, depth, depth_grad.xy)
-outputs[1] → feat_tex1 (rgba32uint: mat_id, prev.rgb, mip1.rgb, mip2.rgb, shadow, transp)
+outputs[1] → feat_tex1 (rgba32uint: mat_id, prev.rgb, mip1.rgb, mip2.rgb, dif, transp)
```
+### Temporal feedback (prev.rgb)
+
+`GBufferEffect` owns a persistent internal node `<prefix>_prev` (`F16X8` = Rgba16Float,
+`CopySrc|CopyDst`). Each frame it is GPU-copied from the CNN effect's output after all
+effects render (`post_render`), then bound as `prev_cnn` in the pack shader (binding 6).
+
+**Wiring is automatic** via `wire_dag()`, called by `Sequence::init_effect_nodes()`.
+`GBufferEffect` scans the DAG for the first downstream consumer of its output nodes and
+uses that effect's output as `cnn_output_node_`. No manual call needed.
+
+**Requirement**: the sequence must include `CNNv3Effect` downstream of `GBufferEffect`.
+In `timeline.seq`, declare a `gbuf_albedo` output node and add the effect:
+
+```seq
+NODE cnn_out gbuf_albedo
+EFFECT + GBufferEffect source -> gbuf_feat0 gbuf_feat1 0 60
+EFFECT + CNNv3Effect gbuf_feat0 gbuf_feat1 -> cnn_out 0 60
+```
+
+If no CNN effect follows, `cnn_output_node_` stays empty and `post_render` is a no-op
+(prev.rgb will be zero — correct for static/debug-only sequences).
+
+Frame 0 behaviour: `_prev` is zeroed on allocation → `prev.rgb = 0`, matching the training
+convention (static frames use zero history).
+
+The copy uses `wgpuCommandEncoderCopyTextureToTexture` (no extra render pass overhead).
+`node_prev_tex_` is `F16X8` (Rgba16Float) to match the `GBUF_ALBEDO` format of CNNv3Effect's
+output — `CopyTextureToTexture` requires identical formats.
+
---
## 1b. GBufferEffect — Implementation Plan (Pass 2: SDF Shadow)
@@ -285,7 +314,7 @@ python3 train_cnn_v3.py \
Applied per-sample in `cnn_v3_utils.apply_channel_dropout()`:
- Geometric channels (normal, depth, depth_grad) zeroed with `p=channel_dropout_p`
-- Context channels (mat_id, shadow, transp) with `p≈0.2`
+- Context channels (mat_id, dif, transp) with `p≈0.2`
- Temporal channels (prev.rgb) with `p=0.5`
This ensures the network works for both full G-buffer and photo-only inputs.
@@ -299,10 +328,12 @@ This ensures the network works for both full G-buffer and photo-only inputs.
```seq
# BPM 120
SEQUENCE 0 0 "Scene with CNN v3"
- EFFECT + GBufferEffect prev_cnn -> gbuf_feat0 gbuf_feat1 0 60
- EFFECT + CNNv3Effect gbuf_feat0 gbuf_feat1 -> sink 0 60
+ EFFECT + GBufferEffect source -> gbuf_feat0 gbuf_feat1 0 60
+ EFFECT + CNNv3Effect gbuf_feat0 gbuf_feat1 -> sink 0 60
```
+Temporal feedback is wired automatically by `wire_dag()` — no manual call needed.
+
FiLM parameters uploaded each frame:
```cpp
cnn_v3_effect->set_film_params(
@@ -455,15 +486,15 @@ GBufViewEffect(const GpuContext& ctx,
float start_time, float end_time)
```
-**Wiring example** (alongside GBufferEffect):
+**Wiring example** — use `timeline.seq`, temporal feedback wires automatically:
-```cpp
-auto gbuf = std::make_shared<GBufferEffect>(ctx,
- std::vector<std::string>{"prev_cnn"},
- std::vector<std::string>{"gbuf_feat0", "gbuf_feat1"}, 0.0f, 60.0f);
-auto gview = std::make_shared<GBufViewEffect>(ctx,
- std::vector<std::string>{"gbuf_feat0", "gbuf_feat1"},
- std::vector<std::string>{"gbuf_view_out"}, 0.0f, 60.0f);
+```seq
+NODE gbuf_feat0 gbuf_rgba32uint
+NODE gbuf_feat1 gbuf_rgba32uint
+NODE cnn_out gbuf_albedo
+EFFECT + GBufferEffect source -> gbuf_feat0 gbuf_feat1 0 60
+EFFECT + CNNv3Effect gbuf_feat0 gbuf_feat1 -> cnn_out 0 60
+EFFECT + GBufViewEffect gbuf_feat0 gbuf_feat1 -> sink 0 60
```
**Grid layout** (output resolution = input resolution, channel cells each 1/4 W × 1/5 H):
@@ -474,7 +505,7 @@ auto gview = std::make_shared<GBufViewEffect>(ctx,
| 1 | `nrm.y` remap→[0,1] | `depth` (inverted) | `dzdx` ×20+0.5 | `dzdy` ×20+0.5 |
| 2 | `mat_id` | `prev.r` | `prev.g` | `prev.b` |
| 3 | `mip1.r` | `mip1.g` | `mip1.b` | `mip2.r` |
-| 4 | `mip2.g` | `mip2.b` | `shadow` | `transp` |
+| 4 | `mip2.g` | `mip2.b` | `dif` | `transp` |
All channels displayed as grayscale. 1-pixel gray grid lines separate cells. Dark background for out-of-range cells.
@@ -535,7 +566,7 @@ No sampler — all reads use `textureLoad()` (integer texel coordinates).
Packs channels identically to `gbuf_pack.wgsl`:
- `feat_tex0`: `pack2x16float(alb.rg)`, `pack2x16float(alb.b, nrm.x)`, `pack2x16float(nrm.y, depth)`, `pack2x16float(dzdx, dzdy)`
-- `feat_tex1`: `pack4x8unorm(matid,0,0,0)`, `pack4x8unorm(mip1.rgb, mip2.r)`, `pack4x8unorm(mip2.gb, shadow, transp)`
+- `feat_tex1`: `pack4x8unorm(matid,0,0,0)`, `pack4x8unorm(mip1.rgb, mip2.r)`, `pack4x8unorm(mip2.gb, dif, transp)`
- Depth gradients: central differences on depth R channel
- Mip1 / Mip2: box2 (2×2) / box4 (4×4) average filter on albedo
diff --git a/cnn_v3/docs/HOW_TO_CNN.md b/cnn_v3/docs/HOW_TO_CNN.md
index 458b68f..4966a61 100644
--- a/cnn_v3/docs/HOW_TO_CNN.md
+++ b/cnn_v3/docs/HOW_TO_CNN.md
@@ -97,7 +97,7 @@ It calls `pack_photo_sample.py` with both `--photo` and `--target` in a single s
| `normal.png` | (128, 128, 0) uint8 | Neutral "no normal" → reconstructed (0,0,1) |
| `depth.png` | All zeros uint16 | No depth data |
| `matid.png` | All zeros uint8 | No material IDs |
-| `shadow.png` | 255 everywhere uint8 | Assume fully lit |
+| `shadow.png` | 255 everywhere uint8 | Assume fully lit (used to compute dif) |
| `transp.png` | 1 − alpha uint8 | 0 = opaque |
| `target.png` | Stylized target RGBA | Ground truth for training |
@@ -134,7 +134,7 @@ done
### 1b. From Blender (Full G-Buffer)
-Produces all 20 feature channels including normals, depth, mat IDs, and shadow.
+Produces all 20 feature channels including normals, depth, mat IDs, and dif (diffuse×shadow).
#### Blender requirements
@@ -420,7 +420,7 @@ Applied per-sample to make the model robust to missing channels:
| Channel group | Channels | Drop probability |
|---------------|----------|-----------------|
| Geometric | normal.xy, depth, depth_grad.xy [3,4,5,6,7] | `channel_dropout_p` (default 0.3) |
-| Context | mat_id, shadow, transp [8,18,19] | `channel_dropout_p × 0.67` (~0.2) |
+| Context | mat_id, dif, transp [8,18,19] | `channel_dropout_p × 0.67` (~0.2) |
| Temporal | prev.rgb [9,10,11] | 0.5 (always) |
This is why a model trained on Blender data also works on photos (geometry zeroed).
@@ -781,7 +781,7 @@ Both produced by `export_cnn_v3_weights.py` (§3).
| Texture | Format | Size |
|---------|--------|------|
| `feat_tex0` | rgba32uint | W × H (8 f16: albedo, normal, depth, depth_grad) |
-| `feat_tex1` | rgba32uint | W × H (12 u8: mat_id, prev, mip1, mip2, shadow, transp) |
+| `feat_tex1` | rgba32uint | W × H (12 u8: mat_id, prev, mip1, mip2, dif, transp) |
| `enc0_tex` | rgba16float | W × H |
| `enc1_tex` | rgba32uint | W/2 × H/2 (8 f16 packed) |
| `bn_tex` | rgba32uint | W/4 × H/4 |
@@ -790,7 +790,7 @@ Both produced by `export_cnn_v3_weights.py` (§3).
### Simple mode (photo input)
-Albedo = image RGB, mip1/mip2 from GPU mipmaps, shadow = 1.0, transp = 1 − alpha,
+Albedo = image RGB, mip1/mip2 from GPU mipmaps, dif = 1.0 (fully lit assumed), transp = 1 − alpha,
all geometric channels (normal, depth, depth_grad, mat_id, prev) = 0.
### Browser requirements
@@ -843,7 +843,7 @@ all geometric channels (normal, depth, depth_grad, mat_id, prev) = 0.
| 9–11 | prev.rgb | previous frame output | zero during training |
| 12–14 | mip1.rgb | pyrdown(albedo) | f32 [0,1] |
| 15–17 | mip2.rgb | pyrdown(mip1) | f32 [0,1] |
-| 18 | shadow | `shadow.png` | f32 [0,1] (1=lit) |
+| 18 | dif | computed | f32 [0,1] max(0,dot(normal,KEY_LIGHT))×shadow |
| 19 | transp | `transp.png` | f32 [0,1] (0=opaque) |
**Feature texture packing** (`feat_tex0` / `feat_tex1`, both `rgba32uint`):
@@ -858,6 +858,6 @@ feat_tex0 (4×u32 = 8 f16 channels via pack2x16float):
feat_tex1 (4×u32 = 12 u8 channels + padding via pack4x8unorm):
.x = pack4x8unorm(mat_id, prev.r, prev.g, prev.b)
.y = pack4x8unorm(mip1.r, mip1.g, mip1.b, mip2.r)
- .z = pack4x8unorm(mip2.g, mip2.b, shadow, transp)
+ .z = pack4x8unorm(mip2.g, mip2.b, dif, transp)
  .w = 0 (unused, 4 reserved u8 channels)
```
diff --git a/cnn_v3/shaders/gbuf_deferred.wgsl b/cnn_v3/shaders/gbuf_deferred.wgsl
new file mode 100644
index 0000000..7257122
--- /dev/null
+++ b/cnn_v3/shaders/gbuf_deferred.wgsl
@@ -0,0 +1,48 @@
+// G-buffer deferred render — albedo * diffuse
+// Reads feat_tex0 (rgba32uint, f16 geometric).
+// Outputs albedo * (ambient + diffuse).
+
+#include "math/normal"
+
+@group(0) @binding(0) var feat_tex0: texture_2d<u32>;
+@group(0) @binding(1) var feat_tex1: texture_2d<u32>;
+@group(0) @binding(2) var<uniform> uniforms: GBufDeferredUniforms;
+
+struct GBufDeferredUniforms {
+ resolution: vec2f,
+}
+
+const KEY_LIGHT: vec3f = vec3f(0.408, 0.816, 0.408); // normalize(1,2,1)
+const AMBIENT: f32 = 0.15;
+
+@vertex
+fn vs_main(@builtin(vertex_index) vid: u32) -> @builtin(position) vec4f {
+ let x = f32((vid & 1u) << 2u) - 1.0;
+ let y = f32((vid & 2u) << 1u) - 1.0;
+ return vec4f(x, y, 0.0, 1.0);
+}
+
+@fragment
+fn fs_main(@builtin(position) pos: vec4f) -> @location(0) vec4f {
+ let coord = vec2i(pos.xy);
+
+ // feat_tex0 layout:
+ // [0] pack2x16float(albedo.r, albedo.g)
+ // [1] pack2x16float(albedo.b, normal.x) oct [-1,1]
+ // [2] pack2x16float(normal.y, depth) oct [-1,1]
+ let t0 = textureLoad(feat_tex0, coord, 0);
+ let rg = unpack2x16float(t0.x);
+ let bx = unpack2x16float(t0.y);
+ let ny_d = unpack2x16float(t0.z);
+ let albedo = vec3f(rg.x, rg.y, bx.x);
+
+ // Decode oct-normal (stored in [-1,1] — use oct_decode directly)
+ let normal = oct_decode(vec2f(bx.y, ny_d.x));
+ let diffuse = max(0.0, dot(normal, KEY_LIGHT));
+
+ // feat_tex1[2] = pack4x8unorm(mip2.g, mip2.b, dif, transp) — dif at byte 2
+ let t1 = textureLoad(feat_tex1, coord, 0);
+ let dif = unpack4x8unorm(t1.z).z;
+
+ return vec4f(albedo * (AMBIENT + dif), 1.0);
+}
diff --git a/cnn_v3/shaders/gbuf_pack.wgsl b/cnn_v3/shaders/gbuf_pack.wgsl
index 71d8471..777b4e5 100644
--- a/cnn_v3/shaders/gbuf_pack.wgsl
+++ b/cnn_v3/shaders/gbuf_pack.wgsl
@@ -1,4 +1,5 @@
// G-buffer pack compute shader for CNN v3
+#include "math/normal"
// Pass 4: Pack all G-buffer channels into two rgba32uint feature textures (32 bytes/pixel)
// Output feat_tex0 holds 8×f16 geometric channels; feat_tex1 holds 12×u8 context channels.
@@ -44,15 +45,6 @@ fn box4(tl: vec2i) -> vec3f {
return acc * (1.0 / 16.0);
}
-// Decode oct-normal from [0,1] storage → [-1,1] encoded xy → reconstruct z.
-fn decode_oct_normal(rg: vec2f) -> vec3f {
- let f = rg * 2.0 - vec2f(1.0);
- var n = vec3f(f.x, f.y, 1.0 - abs(f.x) - abs(f.y));
- let t = max(-n.z, 0.0);
- n.x += select(t, -t, n.x >= 0.0);
- n.y += select(t, -t, n.y >= 0.0);
- return normalize(n);
-}
@compute @workgroup_size(8, 8)
fn pack_features(@builtin(global_invocation_id) id: vec3u) {
@@ -73,7 +65,7 @@ fn pack_features(@builtin(global_invocation_id) id: vec3u) {
// Normal: stored as oct-encoded [0,1] in RG; extract just the encoded xy for feat_tex0
let normal_enc = nm.rg; // already in [0,1] — decode to get the xy for CNN input
- let n3 = decode_oct_normal(normal_enc);
+ let n3 = oct_decode_unorm(normal_enc);
// Store oct-encoded in [-1,1] remapped back to what CNN expects (the [-1,1] oct xy)
let oct_xy = normal_enc * 2.0 - vec2f(1.0); // remap [0,1] → [-1,1]
@@ -94,6 +86,9 @@ fn pack_features(@builtin(global_invocation_id) id: vec3u) {
let mat_id_u8 = nm.b; // mat_id already in [0,1]
let shadow = textureLoad(gbuf_shadow, coord, 0).r;
let transp = textureLoad(gbuf_transp, coord, 0).r;
+ let nor = oct_decode_unorm(nm.rg);
+ let diffuse = max(0.0, dot(nor, vec3f(0.408, 0.816, 0.408)));
+ let dif = diffuse * shadow;
let prev = textureSampleLevel(prev_cnn, bilinear_sampler, uv, 0.0).rgb;
// MIP 1: 2×2 box filter (half resolution context)
@@ -111,12 +106,12 @@ fn pack_features(@builtin(global_invocation_id) id: vec3u) {
// Texture 1: 4 u32, each = pack4x8unorm of four u8 values
// [0] mat_id | prev.r | prev.g | prev.b
// [1] mip1.r | mip1.g | mip1.b | mip2.r
- // [2] mip2.g | mip2.b | shadow | transp
- // [3] spare (0)
+ // [2] mip2.g | mip2.b | dif | transp — ch18=dif, ch19=transp
+ // [3] spare
let t1 = vec4u(
pack4x8unorm(vec4f(mat_id_u8, prev.r, prev.g, prev.b)),
pack4x8unorm(vec4f(mip1.r, mip1.g, mip1.b, mip2.r)),
- pack4x8unorm(vec4f(mip2.g, mip2.b, shadow, transp)),
+ pack4x8unorm(vec4f(mip2.g, mip2.b, dif, transp)),
0u
);
textureStore(feat_tex1, coord, t1);
diff --git a/cnn_v3/shaders/gbuf_raster.wgsl b/cnn_v3/shaders/gbuf_raster.wgsl
index c762db2..aed0e72 100644
--- a/cnn_v3/shaders/gbuf_raster.wgsl
+++ b/cnn_v3/shaders/gbuf_raster.wgsl
@@ -1,8 +1,11 @@
// G-buffer rasterization shader for CNN v3
// Pass 1: Proxy geometry → MRT (albedo rgba16float, normal_mat rgba16float, depth32)
// Uses GlobalUniforms, ObjectData, ObjectsBuffer from common_uniforms.
+// SPHERE objects use ray-sphere impostor (correct silhouette + normal + depth).
#include "common_uniforms"
+#include "math/normal"
+#include "ray_sphere"
@group(0) @binding(0) var<uniform> globals: GlobalUniforms;
@group(0) @binding(1) var<storage, read> object_data: ObjectsBuffer;
@@ -15,21 +18,6 @@ struct VertexOutput {
@location(3) @interpolate(flat) instance_index: u32,
}
-// Octahedral encoding: maps unit normal to [-1,1]^2
-fn oct_encode(n: vec3f) -> vec2f {
- let inv_l1 = 1.0 / (abs(n.x) + abs(n.y) + abs(n.z));
- var p = n.xy * inv_l1;
- // Fold lower hemisphere
- if (n.z < 0.0) {
- let s = vec2f(
- select(-1.0, 1.0, p.x >= 0.0),
- select(-1.0, 1.0, p.y >= 0.0)
- );
- p = (1.0 - abs(p.yx)) * s;
- }
- return p; // in [-1, 1]
-}
-
@vertex
fn vs_main(
@builtin(vertex_index) vertex_index: u32,
@@ -73,8 +61,8 @@ fn vs_main(
let world_pos = obj.model * vec4f(p, 1.0);
let clip_pos = globals.view_proj * world_pos;
- // Transform normal by inverse-transpose (upper-left 3×3 of inv_model^T)
- let world_normal = normalize((obj.inv_model * vec4f(n, 0.0)).xyz);
+ // Transform normal: use model matrix (correct for uniform scale + rotation).
+ let world_normal = normalize((obj.model * vec4f(n, 0.0)).xyz);
var out: VertexOutput;
out.position = clip_pos;
@@ -86,20 +74,40 @@ fn vs_main(
}
struct GBufOutput {
- @location(0) albedo: vec4f, // rgba16float: material color
- @location(1) normal_mat: vec4f, // rgba16float: oct-normal XY in RG, mat_id/255 in B
+ @location(0) albedo: vec4f, // rgba16float: material color
+ @location(1) normal_mat: vec4f, // rgba16float: oct-normal XY in RG, mat_id/255 in B
+ @builtin(frag_depth) depth: f32, // corrected depth (sphere impostor)
}
@fragment
fn fs_main(in: VertexOutput) -> GBufOutput {
- let obj = object_data.objects[in.instance_index];
- let mat_id = f32(in.instance_index) / 255.0;
+ let obj = object_data.objects[in.instance_index];
+ let obj_type = u32(obj.params.x);
+ let mat_id = f32(in.instance_index) / 255.0;
+
+ var world_normal = normalize(in.world_normal);
+ var frag_depth = in.position.z; // default: hardware depth
+
+ // Sphere impostor: ray-sphere intersection for correct silhouette and normal.
+ if (obj_type == 1u) {
+ let sphere_center = obj.model[3].xyz;
+ let sphere_radius = length(obj.model[0].xyz); // uniform scale in col0
+ let cam_pos = globals.camera_pos_time.xyz;
+ let rd = normalize(in.world_pos - cam_pos);
+ let isect = ray_sphere_intersection(cam_pos, rd, sphere_center, sphere_radius);
+ if (!isect.hit) { discard; }
+ let hit = cam_pos + rd * isect.t;
+ world_normal = normalize(hit - sphere_center);
+ // Reproject hit point to get correct clip-space depth.
+ let clip_hit = globals.view_proj * vec4f(hit, 1.0);
+ frag_depth = clip_hit.z / clip_hit.w;
+ }
- // Oct-encode world normal, remap [-1,1] → [0,1] for storage
- let oct = oct_encode(normalize(in.world_normal)) * 0.5 + vec2f(0.5);
+ let oct = oct_encode_unorm(world_normal);
var out: GBufOutput;
out.albedo = vec4f(in.color.rgb, 1.0);
out.normal_mat = vec4f(oct.x, oct.y, mat_id, 0.0);
+ out.depth = frag_depth;
return out;
}
diff --git a/cnn_v3/shaders/gbuf_shadow.wgsl b/cnn_v3/shaders/gbuf_shadow.wgsl
index 0f5f8b4..65ae1fa 100644
--- a/cnn_v3/shaders/gbuf_shadow.wgsl
+++ b/cnn_v3/shaders/gbuf_shadow.wgsl
@@ -5,11 +5,13 @@
#include "common_uniforms"
#include "camera_common"
#include "math/sdf_shapes"
+#include "math/normal"
#include "render/raymarching_id"
@group(0) @binding(0) var<uniform> globals: GlobalUniforms;
@group(0) @binding(1) var<storage, read> object_data: ObjectsBuffer;
@group(0) @binding(2) var depth_tex: texture_depth_2d;
+@group(0) @binding(4) var normal_mat_tex: texture_2d<f32>;
struct GBufLight {
direction: vec4f, // xyz = toward light (world space, normalized)
@@ -26,7 +28,9 @@ struct GBufLightsUniforms {
// Stub required by render/raymarching (shadow() / rayMarch() call df()).
fn df(p: vec3f) -> f32 { return MAX_RAY_LENGTH; }
-// SDF of the full scene: proxy box for each object transformed to local space.
+// SDF of the full scene.
+// Sphere: direct world-space formula (exact, no matrix multiply).
+// Box/Torus/Plane: local-space transform + uniform-scale correction.
fn dfWithID(p: vec3f) -> RayMarchResult {
var res: RayMarchResult;
res.distance = MAX_RAY_LENGTH;
@@ -36,14 +40,32 @@ fn dfWithID(p: vec3f) -> RayMarchResult {
let n = u32(globals.params.x);
for (var i = 0u; i < n; i++) {
let obj = object_data.objects[i];
- let lp = (obj.inv_model * vec4f(p, 1.0)).xyz;
let obj_type = u32(obj.params.x);
var d: f32;
switch obj_type {
- case 1u: { d = sdSphere(lp, 1.0); } // SPHERE
- case 2u: { d = sdPlane(lp, vec3f(0.0, 1.0, 0.0), obj.params.y); } // PLANE
- case 3u: { d = sdTorus(lp, vec2f(0.8, 0.2)); } // TORUS
- default: { d = sdBox(lp, vec3f(1.0)); } // CUBE (0) + fallback
+ case 1u: {
+ // SPHERE: direct world-space SDF — avoids matrix multiply, exact.
+ let c = obj.model[3].xyz;
+ let r = length(obj.model[0].xyz);
+ d = length(p - c) - r;
+ }
+ case 2u: {
+ // PLANE
+ let lp = (obj.inv_model * vec4f(p, 1.0)).xyz;
+ d = sdPlane(lp, vec3f(0.0, 1.0, 0.0), obj.params.y);
+ }
+ case 3u: {
+ // TORUS
+ let lp = (obj.inv_model * vec4f(p, 1.0)).xyz;
+ let scale = length(obj.model[0].xyz);
+ d = sdTorus(lp, vec2f(0.8, 0.2)) * scale;
+ }
+ default: {
+ // CUBE (0) + fallback — uniform scale assumed.
+ let lp = (obj.inv_model * vec4f(p, 1.0)).xyz;
+ let scale = length(obj.model[0].xyz);
+ d = sdBox(lp, vec3f(1.0)) * scale;
+ }
}
if (d < res.distance) {
res.distance = d;
@@ -53,6 +75,20 @@ fn dfWithID(p: vec3f) -> RayMarchResult {
return res;
}
+// Soft shadow march (IQ formula). Returns 1=lit, 0=shadow.
+// No dmin/dmax bounds: in open space d grows large so 8*d/t >> 1, res stays 1 naturally.
+fn soft_shadow(ro: vec3f, rd: vec3f) -> f32 {
+ var t = 0.001;
+ var res = 1.0;
+ for (var i = 0; i < 64; i++) {
+ let d = dfWithID(ro + rd * t).distance;
+ if (d < 0.0005) { return 0.0; }
+ res = min(res, 8.0 * d / t);
+ t += d;
+ }
+ return clamp(res, 0.0, 1.0);
+}
+
// ---- Vertex: fullscreen triangle ----
@vertex
@@ -82,8 +118,9 @@ fn fs_main(@builtin(position) pos: vec4f) -> @location(0) vec4f {
let clip = globals.inv_view_proj * vec4f(ndc, depth, 1.0);
let world = clip.xyz / clip.w;
- // Surface normal estimated from SDF gradient.
- let nor = normalWithID(world);
+ // Use rasterized surface normal for bias — correct for sphere impostors.
+ let nm = textureLoad(normal_mat_tex, vec2i(pos.xy), 0);
+ let nor = oct_decode_unorm(nm.rg);
let bias_pos = world + nor * 0.02;
// March shadow rays toward each light; take the darkest value.
@@ -91,7 +128,7 @@ fn fs_main(@builtin(position) pos: vec4f) -> @location(0) vec4f {
let num_lights = u32(lights.params.x);
for (var i = 0u; i < num_lights; i++) {
let ld = lights.lights[i].direction.xyz;
- let s = shadowWithStoredDistance(bias_pos, ld, MAX_RAY_LENGTH);
+ let s = soft_shadow(bias_pos, ld);
shadow_val = min(shadow_val, s);
}
diff --git a/cnn_v3/shaders/gbuf_view.wgsl b/cnn_v3/shaders/gbuf_view.wgsl
index a5e6c91..6a812e6 100644
--- a/cnn_v3/shaders/gbuf_view.wgsl
+++ b/cnn_v3/shaders/gbuf_view.wgsl
@@ -7,7 +7,9 @@
// Row 1: ch4(nrm.y) ch5(depth) ch6(dzdx) ch7(dzdy)
// Row 2: ch8(matid) ch9(prv.r) ch10(prv.g) ch11(prv.b)
// Row 3: ch12(m1.r) ch13(m1.g) ch14(m1.b) ch15(m2.r)
-// Row 4: ch16(m2.g) ch17(m2.b) ch18(shdw) ch19(trns)
+// Row 4: ch16(m2.g) ch17(m2.b) ch18(dif) ch19(trns)
+
+#include "debug/debug_print"
struct GBufViewUniforms { resolution: vec2f }
@@ -93,5 +95,33 @@ fn fs_main(@builtin(position) pos: vec4f) -> @location(0) vec4f {
disp = clamp(v, 0.0, 1.0);
}
- return vec4f(disp, disp, disp, 1.0);
+ var out = vec4f(disp, disp, disp, 1.0);
+
+ // Label at top-left of each tile
+ let tile_w = u.resolution.x / 4.0;
+ let tile_h = u.resolution.y / 5.0;
+ let origin = vec2f(f32(col) * tile_w + 4.0, f32(row) * tile_h + 4.0);
+ switch ch {
+ case 0u: { out = debug_str(out, pos.xy, origin, vec4u(0x616C622Eu, 0x72000000u, 0u, 0u), 5u); } // alb.r
+ case 1u: { out = debug_str(out, pos.xy, origin, vec4u(0x616C622Eu, 0x67000000u, 0u, 0u), 5u); } // alb.g
+ case 2u: { out = debug_str(out, pos.xy, origin, vec4u(0x616C622Eu, 0x62000000u, 0u, 0u), 5u); } // alb.b
+ case 3u: { out = debug_str(out, pos.xy, origin, vec4u(0x6E726D2Eu, 0x78000000u, 0u, 0u), 5u); } // nrm.x
+ case 4u: { out = debug_str(out, pos.xy, origin, vec4u(0x6E726D2Eu, 0x79000000u, 0u, 0u), 5u); } // nrm.y
+ case 5u: { out = debug_str(out, pos.xy, origin, vec4u(0x64657074u, 0x68000000u, 0u, 0u), 5u); } // depth
+ case 6u: { out = debug_str(out, pos.xy, origin, vec4u(0x647A6478u, 0u, 0u, 0u), 4u); } // dzdx
+ case 7u: { out = debug_str(out, pos.xy, origin, vec4u(0x647A6479u, 0u, 0u, 0u), 4u); } // dzdy
+ case 8u: { out = debug_str(out, pos.xy, origin, vec4u(0x6D617469u, 0x64000000u, 0u, 0u), 5u); } // matid
+ case 9u: { out = debug_str(out, pos.xy, origin, vec4u(0x7072762Eu, 0x72000000u, 0u, 0u), 5u); } // prv.r
+ case 10u: { out = debug_str(out, pos.xy, origin, vec4u(0x7072762Eu, 0x67000000u, 0u, 0u), 5u); } // prv.g
+ case 11u: { out = debug_str(out, pos.xy, origin, vec4u(0x7072762Eu, 0x62000000u, 0u, 0u), 5u); } // prv.b
+ case 12u: { out = debug_str(out, pos.xy, origin, vec4u(0x6D312E72u, 0u, 0u, 0u), 4u); } // m1.r
+ case 13u: { out = debug_str(out, pos.xy, origin, vec4u(0x6D312E67u, 0u, 0u, 0u), 4u); } // m1.g
+ case 14u: { out = debug_str(out, pos.xy, origin, vec4u(0x6D312E62u, 0u, 0u, 0u), 4u); } // m1.b
+ case 15u: { out = debug_str(out, pos.xy, origin, vec4u(0x6D322E72u, 0u, 0u, 0u), 4u); } // m2.r
+ case 16u: { out = debug_str(out, pos.xy, origin, vec4u(0x6D322E67u, 0u, 0u, 0u), 4u); } // m2.g
+ case 17u: { out = debug_str(out, pos.xy, origin, vec4u(0x6D322E62u, 0u, 0u, 0u), 4u); } // m2.b
+ case 18u: { out = debug_str(out, pos.xy, origin, vec4u(0x64696600u, 0u, 0u, 0u), 3u); } // dif
+ default: { out = debug_str(out, pos.xy, origin, vec4u(0x74726E73u, 0u, 0u, 0u), 4u); } // trns
+ }
+ return out;
}
diff --git a/cnn_v3/src/gbuf_deferred_effect.cc b/cnn_v3/src/gbuf_deferred_effect.cc
new file mode 100644
index 0000000..de6bd29
--- /dev/null
+++ b/cnn_v3/src/gbuf_deferred_effect.cc
@@ -0,0 +1,140 @@
+// GBufDeferredEffect — simple deferred render: albedo * shadow from packed G-buffer.
+
+#include "gbuf_deferred_effect.h"
+#include "gpu/gpu.h"
+#include "gpu/shader_composer.h"
+#include "util/fatal_error.h"
+
+extern const char* gbuf_deferred_wgsl;
+
+struct GBufDeferredUniforms {
+ float resolution[2];
+};
+static_assert(sizeof(GBufDeferredUniforms) == 8, "GBufDeferredUniforms must be 8 bytes");
+
+static WGPUBindGroupLayoutEntry bgl_uint_tex(uint32_t binding) {
+ WGPUBindGroupLayoutEntry e = {};
+ e.binding = binding;
+ e.visibility = WGPUShaderStage_Fragment;
+ e.texture.sampleType = WGPUTextureSampleType_Uint;
+ e.texture.viewDimension = WGPUTextureViewDimension_2D;
+ return e;
+}
+
+static WGPUBindGroupLayoutEntry bgl_uniform(uint32_t binding, uint64_t min_size) {
+ WGPUBindGroupLayoutEntry e = {};
+ e.binding = binding;
+ e.visibility = WGPUShaderStage_Fragment;
+ e.buffer.type = WGPUBufferBindingType_Uniform;
+ e.buffer.minBindingSize = min_size;
+ return e;
+}
+
+GBufDeferredEffect::GBufDeferredEffect(const GpuContext& ctx,
+ const std::vector<std::string>& inputs,
+ const std::vector<std::string>& outputs,
+ float start_time, float end_time)
+ : Effect(ctx, inputs, outputs, start_time, end_time) {
+ HEADLESS_RETURN_IF_NULL(ctx_.device);
+
+ WGPUBindGroupLayoutEntry entries[3] = {
+ bgl_uint_tex(0),
+ bgl_uint_tex(1),
+ bgl_uniform(2, sizeof(GBufDeferredUniforms)),
+ };
+ WGPUBindGroupLayoutDescriptor bgl_desc = {};
+ bgl_desc.entryCount = 3;
+ bgl_desc.entries = entries;
+ WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(ctx_.device, &bgl_desc);
+
+ WGPUPipelineLayoutDescriptor pl_desc = {};
+ pl_desc.bindGroupLayoutCount = 1;
+ pl_desc.bindGroupLayouts = &bgl;
+ WGPUPipelineLayout pl = wgpuDeviceCreatePipelineLayout(ctx_.device, &pl_desc);
+
+ WGPUShaderSourceWGSL wgsl_src = {};
+ wgsl_src.chain.sType = WGPUSType_ShaderSourceWGSL;
+ const std::string composed = ShaderComposer::Get().Compose({}, gbuf_deferred_wgsl);
+ wgsl_src.code = str_view(composed.c_str());
+ WGPUShaderModuleDescriptor shader_desc = {};
+ shader_desc.nextInChain = &wgsl_src.chain;
+ WGPUShaderModule shader = wgpuDeviceCreateShaderModule(ctx_.device, &shader_desc);
+
+ WGPUColorTargetState target = {};
+ target.format = WGPUTextureFormat_RGBA8Unorm;
+ target.writeMask = WGPUColorWriteMask_All;
+
+ WGPUFragmentState frag = {};
+ frag.module = shader;
+ frag.entryPoint = str_view("fs_main");
+ frag.targetCount = 1;
+ frag.targets = &target;
+
+ WGPURenderPipelineDescriptor pipe_desc = {};
+ pipe_desc.layout = pl;
+ pipe_desc.vertex.module = shader;
+ pipe_desc.vertex.entryPoint = str_view("vs_main");
+ pipe_desc.fragment = &frag;
+ pipe_desc.primitive.topology = WGPUPrimitiveTopology_TriangleList;
+ pipe_desc.multisample.count = 1;
+ pipe_desc.multisample.mask = UINT32_MAX;
+
+ pipeline_.set(wgpuDeviceCreateRenderPipeline(ctx_.device, &pipe_desc));
+
+ wgpuShaderModuleRelease(shader);
+ wgpuPipelineLayoutRelease(pl);
+ wgpuBindGroupLayoutRelease(bgl);
+}
+
+void GBufDeferredEffect::render(WGPUCommandEncoder encoder,
+ const UniformsSequenceParams& params,
+ NodeRegistry& nodes) {
+ WGPUTextureView feat0_view = nodes.get_view(input_nodes_[0]);
+ WGPUTextureView feat1_view = nodes.get_view(input_nodes_[1]);
+ WGPUTextureView output_view = nodes.get_view(output_nodes_[0]);
+
+ // Upload resolution uniform into the base class uniforms buffer (first 8 bytes).
+ GBufDeferredUniforms u;
+ u.resolution[0] = params.resolution.x;
+ u.resolution[1] = params.resolution.y;
+ wgpuQueueWriteBuffer(ctx_.queue, uniforms_buffer_.get().buffer, 0,
+ &u, sizeof(u));
+
+ WGPUBindGroupLayout bgl =
+ wgpuRenderPipelineGetBindGroupLayout(pipeline_.get(), 0);
+
+ WGPUBindGroupEntry bg_entries[3] = {};
+ bg_entries[0].binding = 0;
+ bg_entries[0].textureView = feat0_view;
+ bg_entries[1].binding = 1;
+ bg_entries[1].textureView = feat1_view;
+ bg_entries[2].binding = 2;
+ bg_entries[2].buffer = uniforms_buffer_.get().buffer;
+ bg_entries[2].size = sizeof(GBufDeferredUniforms);
+
+ WGPUBindGroupDescriptor bg_desc = {};
+ bg_desc.layout = bgl;
+ bg_desc.entryCount = 3;
+ bg_desc.entries = bg_entries;
+ bind_group_.replace(wgpuDeviceCreateBindGroup(ctx_.device, &bg_desc));
+ wgpuBindGroupLayoutRelease(bgl);
+
+ WGPURenderPassColorAttachment color_att = {};
+ color_att.view = output_view;
+ color_att.loadOp = WGPULoadOp_Clear;
+ color_att.storeOp = WGPUStoreOp_Store;
+ color_att.clearValue = {0.0f, 0.0f, 0.0f, 1.0f};
+ color_att.depthSlice = WGPU_DEPTH_SLICE_UNDEFINED;
+
+ WGPURenderPassDescriptor pass_desc = {};
+ pass_desc.colorAttachmentCount = 1;
+ pass_desc.colorAttachments = &color_att;
+
+ WGPURenderPassEncoder pass =
+ wgpuCommandEncoderBeginRenderPass(encoder, &pass_desc);
+ wgpuRenderPassEncoderSetPipeline(pass, pipeline_.get());
+ wgpuRenderPassEncoderSetBindGroup(pass, 0, bind_group_.get(), 0, nullptr);
+ wgpuRenderPassEncoderDraw(pass, 3, 1, 0, 0);
+ wgpuRenderPassEncoderEnd(pass);
+ wgpuRenderPassEncoderRelease(pass);
+}
diff --git a/cnn_v3/src/gbuf_deferred_effect.h b/cnn_v3/src/gbuf_deferred_effect.h
new file mode 100644
index 0000000..4daf13d
--- /dev/null
+++ b/cnn_v3/src/gbuf_deferred_effect.h
@@ -0,0 +1,23 @@
+// GBufDeferredEffect — simple deferred render from packed G-buffer.
+// Inputs: feat_tex0, feat_tex1 (rgba32uint). Output: albedo * shadow (rgba8unorm).
+
+#pragma once
+#include "gpu/effect.h"
+#include "gpu/uniform_helper.h"
+#include "gpu/wgpu_resource.h"
+
+class GBufDeferredEffect : public Effect {
+ public:
+ GBufDeferredEffect(const GpuContext& ctx,
+ const std::vector<std::string>& inputs,
+ const std::vector<std::string>& outputs,
+ float start_time, float end_time);
+
+ void render(WGPUCommandEncoder encoder,
+ const UniformsSequenceParams& params,
+ NodeRegistry& nodes) override;
+
+ private:
+ RenderPipeline pipeline_;
+ BindGroup bind_group_;
+};
diff --git a/cnn_v3/src/gbuf_view_effect.cc b/cnn_v3/src/gbuf_view_effect.cc
index 180919d..ccf80b0 100644
--- a/cnn_v3/src/gbuf_view_effect.cc
+++ b/cnn_v3/src/gbuf_view_effect.cc
@@ -10,6 +10,7 @@
#endif
#include "gpu/gpu.h"
+#include "gpu/shader_composer.h"
#include "util/asset_manager.h"
#include "util/fatal_error.h"
@@ -63,7 +64,8 @@ GBufViewEffect::GBufViewEffect(const GpuContext& ctx,
// Shader module
WGPUShaderSourceWGSL wgsl_src = {};
wgsl_src.chain.sType = WGPUSType_ShaderSourceWGSL;
- wgsl_src.code = str_view(gbuf_view_wgsl);
+ const std::string composed = ShaderComposer::Get().Compose({}, gbuf_view_wgsl);
+ wgsl_src.code = str_view(composed.c_str());
WGPUShaderModuleDescriptor shader_desc = {};
shader_desc.nextInChain = &wgsl_src.chain;
WGPUShaderModule shader =
diff --git a/cnn_v3/src/gbuffer_effect.cc b/cnn_v3/src/gbuffer_effect.cc
index 89ed8fc..6815154 100644
--- a/cnn_v3/src/gbuffer_effect.cc
+++ b/cnn_v3/src/gbuffer_effect.cc
@@ -30,6 +30,9 @@ struct GBufObjectData {
static_assert(sizeof(GBufObjectData) == sizeof(float) * 40,
"GBufObjectData must be 160 bytes");
+// Reusable CPU staging buffer (single-threaded demo — no concurrency needed).
+static GBufObjectData s_obj_staging[kGBufMaxObjects];
+
// GlobalUniforms struct mirroring renderer.h
struct GBufGlobalUniforms {
mat4 view_proj;
@@ -42,18 +45,6 @@ struct GBufGlobalUniforms {
static_assert(sizeof(GBufGlobalUniforms) == sizeof(float) * 44,
"GBufGlobalUniforms must be 176 bytes");
-// Create bilinear sampler.
-static WGPUSampler create_bilinear_sampler(WGPUDevice device) {
- WGPUSamplerDescriptor desc = {};
- desc.addressModeU = WGPUAddressMode_ClampToEdge;
- desc.addressModeV = WGPUAddressMode_ClampToEdge;
- desc.magFilter = WGPUFilterMode_Linear;
- desc.minFilter = WGPUFilterMode_Linear;
- desc.mipmapFilter = WGPUMipmapFilterMode_Linear;
- desc.maxAnisotropy = 1;
- return wgpuDeviceCreateSampler(device, &desc);
-}
-
// ---- GBufferEffect ----
GBufferEffect::GBufferEffect(const GpuContext& ctx,
@@ -70,9 +61,7 @@ GBufferEffect::GBufferEffect(const GpuContext& ctx,
node_depth_ = prefix + "_depth";
node_shadow_ = prefix + "_shadow";
node_transp_ = prefix + "_transp";
- node_feat0_ = outputs.size() > 0 ? outputs[0] : prefix + "_feat0";
- node_feat1_ = outputs.size() > 1 ? outputs[1] : prefix + "_feat1";
-
+ node_prev_tex_ = prefix + "_prev";
// Allocate GPU buffers for scene data.
global_uniforms_buf_ =
gpu_create_buffer(ctx_.device, sizeof(GBufGlobalUniforms),
@@ -89,6 +78,8 @@ GBufferEffect::GBufferEffect(const GpuContext& ctx,
create_raster_pipeline();
create_shadow_pipeline();
create_pack_pipeline();
+
+ set_scene();
}
void GBufferEffect::declare_nodes(NodeRegistry& registry) {
@@ -99,12 +90,14 @@ void GBufferEffect::declare_nodes(NodeRegistry& registry) {
registry.declare_node(node_transp_, NodeType::GBUF_R8, -1, -1);
// feat_tex0 / feat_tex1 are the declared output_nodes_ — they get registered
// by the sequence infrastructure; declare them here as well if not already.
- if (!registry.has_node(node_feat0_)) {
- registry.declare_node(node_feat0_, NodeType::GBUF_RGBA32UINT, -1, -1);
+ if (!registry.has_node(output_nodes_[0])) {
+ registry.declare_node(output_nodes_[0], NodeType::GBUF_RGBA32UINT, -1, -1);
}
- if (!registry.has_node(node_feat1_)) {
- registry.declare_node(node_feat1_, NodeType::GBUF_RGBA32UINT, -1, -1);
+ if (!registry.has_node(output_nodes_[1])) {
+ registry.declare_node(output_nodes_[1], NodeType::GBUF_RGBA32UINT, -1, -1);
}
+ // F16X8 = Rgba16Float with CopySrc|CopyDst — matches CNNv3Effect output format.
+ registry.declare_node(node_prev_tex_, NodeType::F16X8, -1, -1);
}
void GBufferEffect::set_scene() {
@@ -122,45 +115,34 @@ void GBufferEffect::set_scene() {
};
auto rrange = [&](float lo, float hi) { return lo + rnd() * (hi - lo); };
- // 20 small cubes scattered in a [-2,2]×[-1.5,1.5]×[-1.5,1.5] volume.
- static const int kNumCubes = 20;
- for (int i = 0; i < kNumCubes; ++i) {
+ // 2 large static objects for shadow debugging:
+ // an orange cube and a blue sphere.
+ {
Object3D obj(ObjectType::CUBE);
- obj.position = vec3(rrange(-2.0f, 2.0f),
- rrange(-1.5f, 1.5f),
- rrange(-1.5f, 1.5f));
- const float s = rrange(0.10f, 0.25f);
- obj.scale = vec3(s, s, s);
- obj.color = vec4(rrange(0.4f, 1.0f),
- rrange(0.4f, 1.0f),
- rrange(0.4f, 1.0f), 1.0f);
-
- // Random rotation axis (avoid degenerate zero-length axis).
- vec3 axis = vec3(rrange(-1.0f, 1.0f),
- rrange(-1.0f, 1.0f),
- rrange(-1.0f, 1.0f));
- if (axis.len() < 0.01f) axis = vec3(0.0f, 1.0f, 0.0f);
- axis = axis.normalize();
- const float speed = rrange(0.3f, 1.5f) * (rnd() > 0.5f ? 1.0f : -1.0f);
-
+ obj.position = vec3(1.0f, 0.0f, 0.0f);
+ obj.scale = vec3(0.6f, 0.6f, 0.6f);
+ obj.color = vec4(0.9f, 0.5f, 0.3f, 1.0f);
+ scene_.add_object(obj);
+ cube_anims_.push_back({{0.0f, 1.0f, 0.0f}, 0.0f});
+ }
+ {
+ Object3D obj(ObjectType::SPHERE);
+ obj.position = vec3(-1.0f, 0.0f, 0.0f);
+ const float r = 0.9f;
+ obj.scale = vec3(r, r, r);
+ obj.color = vec4(0.3f, 0.6f, 0.9f, 1.0f);
+ const int idx = (int)scene_.objects.size();
scene_.add_object(obj);
- cube_anims_.push_back({axis, speed});
+ sphere_anims_.push_back({idx, r});
}
- // 4 pumping spheres at fixed positions; radius modulated by audio_intensity.
- static const vec3 kSpherePos[4] = {
- { 0.0f, 0.0f, 0.0f},
- { 1.5f, 0.5f, -0.5f},
- {-1.5f, -0.5f, 0.5f},
- { 0.0f, 1.0f, 1.0f},
- };
- static const float kBaseSphereRadius[4] = {0.35f, 0.28f, 0.30f, 0.25f};
- for (int i = 0; i < 4; ++i) {
+ // Second sphere: small, hovering above both objects, pulsating.
+ {
Object3D obj(ObjectType::SPHERE);
- obj.position = kSpherePos[i];
- const float r = kBaseSphereRadius[i];
- obj.scale = vec3(r, r, r);
- obj.color = vec4(0.85f, 0.60f, 0.95f, 1.0f);
+ obj.position = vec3(0.0f, 2.2f, 0.0f);
+ const float r = 0.6f;
+ obj.scale = vec3(r, r, r);
+ obj.color = vec4(0.9f, 0.8f, 0.2f, 1.0f);
const int idx = (int)scene_.objects.size();
scene_.add_object(obj);
sphere_anims_.push_back({idx, r});
@@ -178,6 +160,22 @@ void GBufferEffect::set_scene() {
scene_ready_ = true;
}
+static void clear_r8_node(WGPUCommandEncoder encoder, WGPUTextureView view,
+ float value) {
+ WGPURenderPassColorAttachment att = {};
+ att.view = view;
+ att.loadOp = WGPULoadOp_Clear;
+ att.storeOp = WGPUStoreOp_Store;
+ att.clearValue = {value, value, value, value};
+ att.depthSlice = WGPU_DEPTH_SLICE_UNDEFINED;
+ WGPURenderPassDescriptor pd = {};
+ pd.colorAttachmentCount = 1;
+ pd.colorAttachments = &att;
+ WGPURenderPassEncoder p = wgpuCommandEncoderBeginRenderPass(encoder, &pd);
+ wgpuRenderPassEncoderEnd(p);
+ wgpuRenderPassEncoderRelease(p);
+}
+
void GBufferEffect::render(WGPUCommandEncoder encoder,
const UniformsSequenceParams& params,
NodeRegistry& nodes) {
@@ -188,6 +186,13 @@ void GBufferEffect::render(WGPUCommandEncoder encoder,
// Update camera aspect ratio from current resolution.
camera_.aspect_ratio = params.aspect_ratio;
+ // Slowly orbit around the scene.
+ const float angle = params.time * 0.3f;
+ const float R = 6.0f;
+ camera_.set_look_at(vec3(R * sinf(angle), 2.5f, R * cosf(angle)),
+ vec3(0.0f, 0.0f, 0.0f),
+ vec3(0.0f, 1.0f, 0.0f));
+
// Animate cubes: axis-angle rotation driven by physical time.
for (int i = 0; i < (int)cube_anims_.size(); ++i) {
const CubeAnim& a = cube_anims_[(size_t)i];
@@ -203,13 +208,13 @@ void GBufferEffect::render(WGPUCommandEncoder encoder,
// Upload two directional lights.
{
GBufLightsUniforms lu = {};
- lu.params = vec4(2.0f, 0.0f, 0.0f, 0.0f);
+ lu.params = vec4(1.0f, 0.0f, 0.0f, 0.0f);
// Key: warm sun, upper-right-front.
lu.lights[0].direction = vec4(0.408f, 0.816f, 0.408f, 0.0f); // norm(1,2,1)
lu.lights[0].color = vec4(1.00f, 0.92f, 0.78f, 1.0f);
- // Fill: cool sky, upper-left-back.
- lu.lights[1].direction = vec4(-0.577f, 0.577f, -0.577f, 0.0f); // norm(-1,1,-1)
- lu.lights[1].color = vec4(0.40f, 0.45f, 0.80f, 0.4f);
+ // Fill: cool sky, upper-left-back. (disabled for debugging)
+ // lu.lights[1].direction = vec4(-0.577f, 0.577f, -0.577f, 0.0f);
+ // lu.lights[1].color = vec4(0.40f, 0.45f, 0.80f, 0.4f);
lights_uniform_.update(ctx_.queue, lu);
}
@@ -225,17 +230,12 @@ void GBufferEffect::render(WGPUCommandEncoder encoder,
WGPUTextureView albedo_view = nodes.get_view(node_albedo_);
WGPUTextureView normal_mat_view = nodes.get_view(node_normal_mat_);
WGPUTextureView depth_view = nodes.get_view(node_depth_);
- WGPUTextureView feat0_view = nodes.get_view(node_feat0_);
- WGPUTextureView feat1_view = nodes.get_view(node_feat1_);
+ WGPUTextureView feat0_view = nodes.get_view(output_nodes_[0]);
+ WGPUTextureView feat1_view = nodes.get_view(output_nodes_[1]);
- // prev_cnn: first input node if available, else dummy.
- WGPUTextureView prev_view = nullptr;
- if (!input_nodes_.empty()) {
- prev_view = nodes.get_view(input_nodes_[0]);
- }
- if (!prev_view) {
- prev_view = dummy_texture_view_.get();
- }
+ // node_prev_tex_ is updated by post_render() at the end of each frame.
+ // On frame 0 it is zero (NodeRegistry zeroes new textures) — correct default.
+ WGPUTextureView prev_view = nodes.get_view(node_prev_tex_);
// --- Pass 1: MRT rasterization ---
update_raster_bind_group(nodes);
@@ -291,7 +291,7 @@ void GBufferEffect::render(WGPUCommandEncoder encoder,
// --- Pass 2: SDF shadow raymarching ---
if (shadow_pipeline_.get() != nullptr) {
- WGPUBindGroupEntry shadow_entries[4] = {};
+ WGPUBindGroupEntry shadow_entries[5] = {};
shadow_entries[0].binding = 0;
shadow_entries[0].buffer = global_uniforms_buf_.buffer;
shadow_entries[0].size = sizeof(GBufGlobalUniforms);
@@ -307,12 +307,15 @@ void GBufferEffect::render(WGPUCommandEncoder encoder,
shadow_entries[3].buffer = lights_uniform_.get().buffer;
shadow_entries[3].size = sizeof(GBufLightsUniforms);
+ shadow_entries[4].binding = 4;
+ shadow_entries[4].textureView = normal_mat_view;
+
WGPUBindGroupLayout shadow_bgl =
wgpuRenderPipelineGetBindGroupLayout(shadow_pipeline_.get(), 0);
WGPUBindGroupDescriptor shadow_bg_desc = {};
shadow_bg_desc.layout = shadow_bgl;
- shadow_bg_desc.entryCount = 4;
+ shadow_bg_desc.entryCount = 5;
shadow_bg_desc.entries = shadow_entries;
WGPUBindGroup shadow_bg =
@@ -340,36 +343,11 @@ void GBufferEffect::render(WGPUCommandEncoder encoder,
wgpuBindGroupRelease(shadow_bg);
} else {
// Fallback: clear to 1.0 (fully lit) if pipeline not ready.
- WGPURenderPassColorAttachment att = {};
- att.view = nodes.get_view(node_shadow_);
- att.loadOp = WGPULoadOp_Clear;
- att.storeOp = WGPUStoreOp_Store;
- att.clearValue = {1.0f, 1.0f, 1.0f, 1.0f};
- att.depthSlice = WGPU_DEPTH_SLICE_UNDEFINED;
- WGPURenderPassDescriptor pd = {};
- pd.colorAttachmentCount = 1;
- pd.colorAttachments = &att;
- WGPURenderPassEncoder p = wgpuCommandEncoderBeginRenderPass(encoder, &pd);
- wgpuRenderPassEncoderEnd(p);
- wgpuRenderPassEncoderRelease(p);
+ clear_r8_node(encoder, nodes.get_view(node_shadow_), 1.0f);
}
// Pass 3: Transparency — TODO (deferred; opaque scenes only)
- // Clear transp node to 0.0 (fully opaque) until pass 3 is implemented.
- {
- WGPURenderPassColorAttachment att = {};
- att.view = nodes.get_view(node_transp_);
- att.loadOp = WGPULoadOp_Clear;
- att.storeOp = WGPUStoreOp_Store;
- att.clearValue = {0.0f, 0.0f, 0.0f, 0.0f};
- att.depthSlice = WGPU_DEPTH_SLICE_UNDEFINED;
- WGPURenderPassDescriptor pd = {};
- pd.colorAttachmentCount = 1;
- pd.colorAttachments = &att;
- WGPURenderPassEncoder p = wgpuCommandEncoderBeginRenderPass(encoder, &pd);
- wgpuRenderPassEncoderEnd(p);
- wgpuRenderPassEncoderRelease(p);
- }
+ clear_r8_node(encoder, nodes.get_view(node_transp_), 0.0f);
// --- Pass 4: Pack compute ---
// Rebuild pack bind group with current node views.
@@ -463,7 +441,8 @@ void GBufferEffect::upload_scene_data(const Scene& scene,
: (size_t)kGBufMaxObjects);
const mat4 view = camera.get_view_matrix();
- const mat4 proj = camera.get_projection_matrix();
+ mat4 proj = camera.get_projection_matrix();
+ proj.m[5] = -proj.m[5]; // undo post-process Y flip: G-buffer uses integer reads
const mat4 vp = proj * view;
GBufGlobalUniforms gu = {};
@@ -478,23 +457,19 @@ void GBufferEffect::upload_scene_data(const Scene& scene,
wgpuQueueWriteBuffer(ctx_.queue, global_uniforms_buf_.buffer, 0,
&gu, sizeof(GBufGlobalUniforms));
- // Upload object data.
+ // Upload object data (no per-frame heap alloc — reuse s_obj_staging).
if (num_objects > 0) {
ensure_objects_buffer(num_objects);
- std::vector<GBufObjectData> obj_data;
- obj_data.reserve((size_t)num_objects);
for (int i = 0; i < num_objects; ++i) {
const Object3D& obj = scene.objects[(size_t)i];
const mat4 m = obj.get_model_matrix();
- GBufObjectData d;
- d.model = m;
- d.inv_model = m.inverse();
- d.color = obj.color;
- d.params = vec4((float)(int)obj.type, 0.0f, 0.0f, 0.0f);
- obj_data.push_back(d);
+ s_obj_staging[i].model = m;
+ s_obj_staging[i].inv_model = m.inverse();
+ s_obj_staging[i].color = obj.color;
+ s_obj_staging[i].params = vec4((float)(int)obj.type, 0.0f, 0.0f, 0.0f);
}
wgpuQueueWriteBuffer(ctx_.queue, objects_buf_.buffer, 0,
- obj_data.data(),
+ s_obj_staging,
(size_t)num_objects * sizeof(GBufObjectData));
}
}
@@ -509,7 +484,7 @@ void GBufferEffect::create_raster_pipeline() {
}
const std::string composed =
- ShaderComposer::Get().Compose({"common_uniforms"}, src);
+ ShaderComposer::Get().Compose({}, src);
WGPUShaderSourceWGSL wgsl_src = {};
wgsl_src.chain.sType = WGPUSType_ShaderSourceWGSL;
@@ -567,8 +542,9 @@ void GBufferEffect::create_raster_pipeline() {
pipe_desc.vertex.entryPoint = str_view("vs_main");
pipe_desc.fragment = &frag;
pipe_desc.depthStencil = &ds;
- pipe_desc.primitive.topology = WGPUPrimitiveTopology_TriangleList;
- pipe_desc.primitive.cullMode = WGPUCullMode_Back;
+ pipe_desc.primitive.topology = WGPUPrimitiveTopology_TriangleList;
+ pipe_desc.primitive.cullMode = WGPUCullMode_Back;
+ pipe_desc.primitive.frontFace = WGPUFrontFace_CCW; // standard (no Y flip)
pipe_desc.multisample.count = 1;
pipe_desc.multisample.mask = 0xFFFFFFFF;
@@ -598,7 +574,7 @@ void GBufferEffect::create_shadow_pipeline() {
WGPUShaderModule shader = wgpuDeviceCreateShaderModule(ctx_.device, &shader_desc);
// BGL: B0=GlobalUniforms, B1=ObjectsBuffer, B2=texture_depth_2d, B3=GBufLightsUniforms
- WGPUBindGroupLayoutEntry bgl_entries[4] = {};
+ WGPUBindGroupLayoutEntry bgl_entries[5] = {};
bgl_entries[0].binding = 0;
bgl_entries[0].visibility =
@@ -621,8 +597,13 @@ void GBufferEffect::create_shadow_pipeline() {
bgl_entries[3].buffer.type = WGPUBufferBindingType_Uniform;
bgl_entries[3].buffer.minBindingSize = sizeof(GBufLightsUniforms);
+ bgl_entries[4].binding = 4;
+ bgl_entries[4].visibility = WGPUShaderStage_Fragment;
+ bgl_entries[4].texture.sampleType = WGPUTextureSampleType_Float;
+ bgl_entries[4].texture.viewDimension = WGPUTextureViewDimension_2D;
+
WGPUBindGroupLayoutDescriptor bgl_desc = {};
- bgl_desc.entryCount = 4;
+ bgl_desc.entryCount = 5;
bgl_desc.entries = bgl_entries;
WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(ctx_.device, &bgl_desc);
@@ -793,7 +774,23 @@ void GBufferEffect::update_raster_bind_group(NodeRegistry& nodes) {
wgpuBindGroupLayoutRelease(bgl);
}
-void GBufferEffect::update_pack_bind_group(NodeRegistry& nodes) {
- (void)nodes;
- // Pack bind group is rebuilt inline in render() to use current node views.
+void GBufferEffect::wire_dag(const std::vector<EffectDAGNode>& dag) {
+ const std::string out = find_downstream_output(dag);
+ // "sink" is an external view (no owned texture) — not a valid copy source.
+ if (out != "sink") cnn_output_node_ = out;
+}
+
+void GBufferEffect::post_render(WGPUCommandEncoder encoder, NodeRegistry& nodes) {
+ if (cnn_output_node_.empty() || !nodes.has_node(cnn_output_node_)) return;
+ WGPUTexture src_tex = nodes.get_texture(cnn_output_node_);
+ if (!src_tex) return; // external view (e.g. sink) — no owned texture to copy
+ WGPUTexelCopyTextureInfo src = {};
+ src.texture = src_tex;
+ src.mipLevel = 0;
+ WGPUTexelCopyTextureInfo dst = {};
+ dst.texture = nodes.get_texture(node_prev_tex_);
+ dst.mipLevel = 0;
+ WGPUExtent3D extent = {(uint32_t)width_, (uint32_t)height_, 1};
+ wgpuCommandEncoderCopyTextureToTexture(encoder, &src, &dst, &extent);
}
+
diff --git a/cnn_v3/src/gbuffer_effect.h b/cnn_v3/src/gbuffer_effect.h
index c39219b..76d4347 100644
--- a/cnn_v3/src/gbuffer_effect.h
+++ b/cnn_v3/src/gbuffer_effect.h
@@ -46,6 +46,13 @@ class GBufferEffect : public Effect {
void render(WGPUCommandEncoder encoder, const UniformsSequenceParams& params,
NodeRegistry& nodes) override;
+ // Copies cnn_output_node_ → node_prev_tex_ after all effects have rendered.
+ void post_render(WGPUCommandEncoder encoder, NodeRegistry& nodes) override;
+
+ // Auto-wires cnn_output_node_: finds the first downstream effect whose
+ // input_nodes intersect our output_nodes, and uses its output_nodes[0].
+ void wire_dag(const std::vector<EffectDAGNode>& dag) override;
+
// Populate the internal scene with ~20 rotating cubes and a few pumping
// spheres. Must be called once before the first render().
void set_scene();
@@ -68,8 +75,16 @@ class GBufferEffect : public Effect {
std::string node_depth_;
std::string node_shadow_;
std::string node_transp_;
- std::string node_feat0_;
- std::string node_feat1_;
+ std::string node_prev_tex_; // persistent prev-frame CNN output (rgba16float / F16X8)
+
+ // Name of the CNN effect's output node; auto-wired by wire_dag(), or set explicitly via set_cnn_output_node().
+ // When non-empty, the CNN output is copied into node_prev_tex_ each frame.
+ std::string cnn_output_node_;
+
+ public:
+ void set_cnn_output_node(const std::string& name) { cnn_output_node_ = name; }
+
+ private:
// Owned scene and camera — populated by set_scene()
Scene scene_;
@@ -88,7 +103,6 @@ class GBufferEffect : public Effect {
// Pass 4: Pack compute pipeline
ComputePipeline pack_pipeline_;
- BindGroup pack_bind_group_;
UniformBuffer<GBufResUniforms> pack_res_uniform_;
UniformBuffer<GBufLightsUniforms> lights_uniform_;
@@ -102,7 +116,6 @@ class GBufferEffect : public Effect {
void create_pack_pipeline();
void update_raster_bind_group(NodeRegistry& nodes);
- void update_pack_bind_group(NodeRegistry& nodes);
void upload_scene_data(const Scene& scene, const Camera& camera,
float time);
diff --git a/cnn_v3/tools/shaders.js b/cnn_v3/tools/shaders.js
index f178637..6c49864 100644
--- a/cnn_v3/tools/shaders.js
+++ b/cnn_v3/tools/shaders.js
@@ -272,6 +272,10 @@ const FULL_PACK_SHADER=`
@group(0) @binding(5) var transp: texture_2d<f32>;
@group(0) @binding(6) var f0: texture_storage_2d<rgba32uint,write>;
@group(0) @binding(7) var f1: texture_storage_2d<rgba32uint,write>;
+fn oct_decode(f:vec2f)->vec3f{ // NOTE(review): expects signed [-1,1] input; Python decode and oct_decode_unorm remap from [0,1] — confirm caller remaps `oct` (f*2.-1.) or dif will mismatch training
+ var n=vec3f(f.x,f.y,1.-abs(f.x)-abs(f.y));
+ if(n.z<0.){n.x=(1.-abs(f.y))*sign(f.x); n.y=(1.-abs(f.x))*sign(f.y);}
+ return normalize(n);}
fn ld(c:vec2i,d:vec2i)->f32{return textureLoad(depth,clamp(c,vec2i(0),d-vec2i(1)),0).r;}
fn b2(tl:vec2i,d:vec2i)->vec3f{
var s=vec3f(0.);
@@ -299,9 +303,10 @@ fn main(@builtin(global_invocation_id) id:vec3u){
let mid=textureLoad(matid,c,0).r;
let shd=textureLoad(shadow,c,0).r;
let trp=textureLoad(transp,c,0).r;
+ let dif=max(0.,dot(oct_decode(oct),vec3f(0.408,0.816,0.408)))*shd;
let m1=b2(c-vec2i(0),d); let m2=b4(c-vec2i(1),d);
textureStore(f1,c,vec4u(
pack4x8unorm(vec4f(mid,0.,0.,0.)),
pack4x8unorm(vec4f(m1.r,m1.g,m1.b,m2.r)),
- pack4x8unorm(vec4f(m2.g,m2.b,shd,trp)),
+ pack4x8unorm(vec4f(m2.g,m2.b,dif,trp)),
0u));}`;
diff --git a/cnn_v3/training/cnn_v3_utils.py b/cnn_v3/training/cnn_v3_utils.py
index 5a3d56c..bef4091 100644
--- a/cnn_v3/training/cnn_v3_utils.py
+++ b/cnn_v3/training/cnn_v3_utils.py
@@ -11,7 +11,7 @@ Imported by train_cnn_v3.py and export_cnn_v3_weights.py.
[9-11] prev.rgb f32 (zero during training)
[12-14] mip1.rgb pyrdown(albedo)
[15-17] mip2.rgb pyrdown(mip1)
- [18] shadow f32 [0,1]
+ [18] dif f32 [0,1] max(0,dot(normal,KEY_LIGHT))*shadow
[19] transp f32 [0,1]
Sample directory layout (per sample_xxx/):
@@ -48,10 +48,11 @@ from torch.utils.data import Dataset
N_FEATURES = 20
GEOMETRIC_CHANNELS = [3, 4, 5, 6, 7] # normal.xy, depth, depth_grad.xy
-CONTEXT_CHANNELS = [8, 18, 19] # mat_id, shadow, transp
+CONTEXT_CHANNELS = [8, 18, 19] # mat_id, dif, transp
TEMPORAL_CHANNELS = [9, 10, 11] # prev.rgb
-_LUMA = np.array([0.2126, 0.7152, 0.0722], dtype=np.float32) # BT.709
+_LUMA = np.array([0.2126, 0.7152, 0.0722], dtype=np.float32) # BT.709
+_KEY_LIGHT = np.array([0.408, 0.816, 0.408 ], dtype=np.float32) # normalize(1,2,1)
# ---------------------------------------------------------------------------
# Image I/O
@@ -102,6 +103,21 @@ def depth_gradient(depth: np.ndarray) -> np.ndarray:
return np.stack([dzdx, dzdy], axis=-1)
+def oct_decode(enc: np.ndarray) -> np.ndarray:
+ """Decode oct-encoded normals (H,W,2) in [0,1] → (H,W,3) unit normals."""
+ f = enc * 2.0 - 1.0 # [0,1] → [-1,1]
+ z = 1.0 - np.abs(f[..., :1]) - np.abs(f[..., 1:2])
+ n = np.concatenate([f, z], axis=-1)
+ neg = n[..., 2:3] < 0.0
+ n = np.concatenate([
+ np.where(neg, (1.0 - np.abs(f[..., 1:2])) * np.sign(f[..., :1]), n[..., :1]),
+ np.where(neg, (1.0 - np.abs(f[..., :1])) * np.sign(f[..., 1:2]), n[..., 1:2]),
+ n[..., 2:3],
+ ], axis=-1)
+ length = np.linalg.norm(n, axis=-1, keepdims=True)
+ return n / np.maximum(length, 1e-8)
+
+
def _upsample_nearest(a: np.ndarray, h: int, w: int) -> np.ndarray:
"""Nearest-neighbour upsample (H,W,C) f32 to (h,w,C) — pure numpy, no precision loss."""
sh, sw = a.shape[:2]
@@ -117,25 +133,29 @@ def assemble_features(albedo: np.ndarray, normal: np.ndarray,
prev set to zero (no temporal history during training).
mip1/mip2 computed from albedo. depth_grad computed via finite diff.
+ dif (ch18) = max(0, dot(oct_decode(normal), KEY_LIGHT)) * shadow.
"""
h, w = albedo.shape[:2]
- mip1 = _upsample_nearest(pyrdown(albedo), h, w)
- mip2 = _upsample_nearest(pyrdown(pyrdown(albedo)), h, w)
- dgrad = depth_gradient(depth)
- prev = np.zeros((h, w, 3), dtype=np.float32)
+ mip1 = _upsample_nearest(pyrdown(albedo), h, w)
+ mip2 = _upsample_nearest(pyrdown(pyrdown(albedo)), h, w)
+ dgrad = depth_gradient(depth)
+ prev = np.zeros((h, w, 3), dtype=np.float32)
+ nor3 = oct_decode(normal)
+ diffuse = np.maximum(0.0, (nor3 * _KEY_LIGHT).sum(-1))
+ dif = diffuse * shadow
return np.concatenate([
- albedo, # [0-2] albedo.rgb
- normal, # [3-4] normal.xy
- depth[..., None], # [5] depth
- dgrad, # [6-7] depth_grad.xy
- matid[..., None], # [8] mat_id
- prev, # [9-11] prev.rgb
- mip1, # [12-14] mip1.rgb
- mip2, # [15-17] mip2.rgb
- shadow[..., None], # [18] shadow
- transp[..., None], # [19] transp
+ albedo, # [0-2] albedo.rgb
+ normal, # [3-4] normal.xy
+ depth[..., None], # [5] depth
+ dgrad, # [6-7] depth_grad.xy
+ matid[..., None], # [8] mat_id
+ prev, # [9-11] prev.rgb
+ mip1, # [12-14] mip1.rgb
+ mip2, # [15-17] mip2.rgb
+ dif[..., None], # [18] dif = diffuse * shadow
+ transp[..., None],# [19] transp
], axis=-1).astype(np.float32)