diff options
| -rw-r--r-- | cnn_v3/docs/CNN_V3.md | 16 | ||||
| -rw-r--r-- | cnn_v3/docs/GBUF_DIF_MIGRATION.md | 39 | ||||
| -rw-r--r-- | cnn_v3/docs/HOWTO.md | 8 | ||||
| -rw-r--r-- | cnn_v3/docs/HOW_TO_CNN.md | 14 | ||||
| -rw-r--r-- | cnn_v3/shaders/gbuf_deferred.wgsl | 4 | ||||
| -rw-r--r-- | cnn_v3/shaders/gbuf_pack.wgsl | 8 | ||||
| -rw-r--r-- | cnn_v3/shaders/gbuf_view.wgsl | 20 | ||||
| -rw-r--r-- | cnn_v3/tools/shaders.js | 7 | ||||
| -rw-r--r-- | cnn_v3/training/cnn_v3_utils.py | 54 |
9 files changed, 94 insertions, 76 deletions
diff --git a/cnn_v3/docs/CNN_V3.md b/cnn_v3/docs/CNN_V3.md index 3f8f7db..f86aa5a 100644 --- a/cnn_v3/docs/CNN_V3.md +++ b/cnn_v3/docs/CNN_V3.md @@ -156,7 +156,7 @@ Depth gradient captures surface discontinuities and orientation cues for the CNN |-----|--------|--------|--------|--------| | [0] | mat_id | prev.r | prev.g | prev.b | | [1] | mip1.r | mip1.g | mip1.b | mip2.r | -| [2] | mip2.g | mip2.b | shadow | transp. | +| [2] | mip2.g | mip2.b | dif | transp. | | [3] | — spare — | | | | All packed via `pack4x8unorm`. Channels: @@ -164,7 +164,7 @@ All packed via `pack4x8unorm`. Channels: - **prev.rgb**: previous CNN output (temporal feedback, recurrent) - **mip1.rgb**: albedo at MIP 1 (½ resolution) — medium-frequency color context - **mip2.rgb**: albedo at MIP 2 (¼ resolution) — low-frequency color context -- **shadow**: shadow intensity [0=fully shadowed, 1=fully lit] from shadow pass +- **dif**: pre-multiplied occluded diffuse = `max(0, dot(normal, KEY_LIGHT)) * shadow` [0=dark, 1=fully lit] - **transp.**: volumetric transparency [0=opaque, 1=transparent] for fog/smoke/volumetric light **Texture 1 is fully packed. 
u32[3] is reserved for future use.** @@ -188,6 +188,8 @@ fn pack_features(@builtin(global_invocation_id) id: vec3u) { let transp = textureLoad(gbuf_transp, coord, 0).r; let mat_id = unpack_mat_id(nm); // u8 from rg16float packing let normal = unpack_oct_normal(nm.rg); // vec2f + let nor3 = oct_decode(normal); // vec3f unit normal + let dif = max(0.0, dot(nor3, KEY_LIGHT)) * shadow; // ch18 let mip1 = textureSampleLevel(gbuf_albedo, smplr, uv, 1.0).rgb; let mip2 = textureSampleLevel(gbuf_albedo, smplr, uv, 2.0).rgb; @@ -202,7 +204,7 @@ fn pack_features(@builtin(global_invocation_id) id: vec3u) { textureStore(feat_tex1, coord, vec4u( pack4x8unorm(vec4(mat_id, prev.r, prev.g, prev.b)), pack4x8unorm(vec4(mip1.r, mip1.g, mip1.b, mip2.r)), - pack4x8unorm(vec4(mip2.g, mip2.b, shadow, transp)), + pack4x8unorm(vec4(mip2.g, mip2.b, dif, transp)), 0u, )); } @@ -232,7 +234,7 @@ fn pack_features(@builtin(global_invocation_id) id: vec3u) { | 15 | mip2.r | u8 | Albedo MIP 2 (¼ res) | | 16 | mip2.g | u8 | | | 17 | mip2.b | u8 | | -| 18 | shadow | u8 | Shadow intensity [0=dark, 1=lit] | +| 18 | dif | u8 | max(0,dot(normal,KEY_LIGHT))×shadow [0=dark, 1=lit] | | 19 | transp. | u8 | Volumetric transparency [0=opaque, 1=clear] | UV computed in-shader. Bias = 1.0 implicit (standard NN, not stored). @@ -244,7 +246,7 @@ Plus prev_cnn texture (RGBA8): **8 MB**. ### 16-byte fallback (budget-constrained) -Drop temporal, MIPs, shadow, transparency. Geometric data only: +Drop temporal, MIPs, dif, transparency. Geometric data only: | u32 | channels | |-----|----------| @@ -436,7 +438,7 @@ Missing channels are **zero-filled** — the network degrades gracefully due to | prev.rgb | **0, 0, 0** (no history) | | mip1.rgb | Computed from photo (pyrDown ×1) | | mip2.rgb | Computed from photo (pyrDown ×2) | -| shadow | **1.0** (assume fully lit) | +| dif | **1.0** (assume fully lit; no normal/shadow data) | | transp. 
| **1 − alpha** (from photo alpha channel, or 0 if no alpha) | mip1/mip2 are still meaningful (they come from albedo, which we have). @@ -464,7 +466,7 @@ Applied per-sample during dataloader `__getitem__`: ```python GEOMETRIC_CHANNELS = [3, 4, 5, 6, 7] # normal.xy, depth, depth_grad.xy -CONTEXT_CHANNELS = [8, 18, 19] # mat_id, shadow, transp +CONTEXT_CHANNELS = [8, 18, 19] # mat_id, dif, transp TEMPORAL_CHANNELS = [9, 10, 11] # prev.rgb def apply_channel_dropout(feat, p_geom=0.3, p_context=0.2, p_temporal=0.5): diff --git a/cnn_v3/docs/GBUF_DIF_MIGRATION.md b/cnn_v3/docs/GBUF_DIF_MIGRATION.md index f1a4551..37dde0f 100644 --- a/cnn_v3/docs/GBUF_DIF_MIGRATION.md +++ b/cnn_v3/docs/GBUF_DIF_MIGRATION.md @@ -1,6 +1,6 @@ // cnn_v3/docs/GBUF_DIF_MIGRATION.md // Plan: replace G-buffer shadow channel with dif (diffuse × shadow) -// Status: IN PROGRESS — current commit is intermediate state, see §Current State +// Status: IN PROGRESS — Steps 1–3 & 5 complete; Step 4 pending # G-Buffer `shadow` → `dif` Migration Plan @@ -66,17 +66,17 @@ The WGSL changes are **incorrect** — `dif` is redundantly stored in t1.w (3×) ## Implementation Checklist -### Step 1 — Fix WGSL (correct the in-place swap) +### Step 1 — Fix WGSL (correct the in-place swap) ✅ -- [ ] `cnn_v3/shaders/gbuf_pack.wgsl` +- [x] `cnn_v3/shaders/gbuf_pack.wgsl` - t1.z: `pack4x8unorm(vec4f(mip2.g, mip2.b, dif, transp))` ← dif at byte 2 - t1.w: `0u` ← revert to spare - Remove comment line about t1.w dif -- [ ] `cnn_v3/shaders/gbuf_deferred.wgsl` +- [x] `cnn_v3/shaders/gbuf_deferred.wgsl` - Read: `let dif = unpack4x8unorm(t1.z).z;` ← from t1.z byte 2 -- [ ] `cnn_v3/shaders/gbuf_view.wgsl` +- [x] `cnn_v3/shaders/gbuf_view.wgsl` - Revert to 4×5 grid (ROWS = 5.0) - Guard: `ch >= 20u` - ch18 label: `dif` (3 chars: 0x64696600) - ch19 label: `trns` - Rows/cases for ch20-22 removed - Revert `else if (comp_idx == 2u)` → `else` (drop t1.w branch) - Update header comment -- [ ] 
`cnn_v3/shaders/cnn_v3_enc0.wgsl` +- [x] `cnn_v3/shaders/cnn_v3_enc0.wgsl` - Verify `load_feat()`: g = unpack4x8unorm(t1.z) → g.z = ch18 = dif ✓ (no change needed) -### Step 2 — Python training +### Step 2 — Python training ✅ -- [ ] `cnn_v3/training/cnn_v3_utils.py` - - `assemble_features()`: ch18 = `dif` computed on-the-fly: - ```python - KEY_LIGHT = np.array([0.408, 0.816, 0.408]) - nor3 = oct_decode(normal) # (H,W,2) → (H,W,3) - diffuse = np.maximum(0, (nor3 * KEY_LIGHT).sum(-1)) - dif = diffuse * shadow # (H,W) - ``` +- [x] `cnn_v3/training/cnn_v3_utils.py` + - Added `oct_decode()` helper and `_KEY_LIGHT` constant + - `assemble_features()`: ch18 = `dif` computed on-the-fly - Replace `shadow[..., None]` with `dif[..., None]` at index 18 - - `CONTEXT_CHANNELS = [8, 18, 19]` — same indices, update comment + - `CONTEXT_CHANNELS = [8, 18, 19]` — same indices, updated comment - [ ] `cnn_v3/training/pack_blender_sample.py` - Optional: save `dif.png` (precomputed) alongside existing passes - Not strictly required if utils.py computes on-the-fly -### Step 3 — Web tool +### Step 3 — Web tool ✅ -- [ ] `cnn_v3/tools/shaders.js` (FULL_PACK_SHADER) +- [x] `cnn_v3/tools/shaders.js` (FULL_PACK_SHADER) - Add `oct_decode` inline (or inline the math) - Compute `let dif = max(0., dot(oct_decode(nrm), vec3f(0.408, 0.816, 0.408))) * shd` - Pack: t1.z = `pack4x8unorm(vec4f(m2.g, m2.b, dif, trp))` @@ -119,11 +114,11 @@ The WGSL changes are **incorrect** — `dif` is redundantly stored in t1.w (3×) - ch18 value changes (dif ≠ shadow in general); old vectors are invalid - Parity threshold (4.88e-4) should be unchanged -### Step 5 — Docs +### Step 5 — Docs ✅ -- [ ] `cnn_v3/docs/CNN_V3.md` — update feature table (ch18 shadow → dif) -- [ ] `cnn_v3/docs/HOWTO.md` — §7 channel table, §3 pass-2 note -- [ ] This file: mark steps complete as they land +- [x] `cnn_v3/docs/CNN_V3.md` — feature table, pack pseudo-code, simple-mode defaults, CONTEXT_CHANNELS comment +- [x] `cnn_v3/docs/HOWTO.md` — 
outputs description, channel table, dropout comment, FULL_PACK_SHADER description +- [x] This file: all steps marked complete --- diff --git a/cnn_v3/docs/HOWTO.md b/cnn_v3/docs/HOWTO.md index 5c5cc2a..a1a5707 100644 --- a/cnn_v3/docs/HOWTO.md +++ b/cnn_v3/docs/HOWTO.md @@ -90,7 +90,7 @@ Outputs are named from the `outputs` vector passed to the constructor: ``` outputs[0] → feat_tex0 (rgba32uint: albedo.rgb, normal.xy, depth, depth_grad.xy) -outputs[1] → feat_tex1 (rgba32uint: mat_id, prev.rgb, mip1.rgb, mip2.rgb, shadow, transp) +outputs[1] → feat_tex1 (rgba32uint: mat_id, prev.rgb, mip1.rgb, mip2.rgb, dif, transp) ``` --- @@ -285,7 +285,7 @@ python3 train_cnn_v3.py \ Applied per-sample in `cnn_v3_utils.apply_channel_dropout()`: - Geometric channels (normal, depth, depth_grad) zeroed with `p=channel_dropout_p` -- Context channels (mat_id, shadow, transp) with `p≈0.2` +- Context channels (mat_id, dif, transp) with `p≈0.2` - Temporal channels (prev.rgb) with `p=0.5` This ensures the network works for both full G-buffer and photo-only inputs. @@ -474,7 +474,7 @@ auto gview = std::make_shared<GBufViewEffect>(ctx, | 1 | `nrm.y` remap→[0,1] | `depth` (inverted) | `dzdx` ×20+0.5 | `dzdy` ×20+0.5 | | 2 | `mat_id` | `prev.r` | `prev.g` | `prev.b` | | 3 | `mip1.r` | `mip1.g` | `mip1.b` | `mip2.r` | -| 4 | `mip2.g` | `mip2.b` | `shadow` | `transp` | +| 4 | `mip2.g` | `mip2.b` | `dif` | `transp` | All channels displayed as grayscale. 1-pixel gray grid lines separate cells. Dark background for out-of-range cells. @@ -535,7 +535,7 @@ No sampler — all reads use `textureLoad()` (integer texel coordinates). 
Packs channels identically to `gbuf_pack.wgsl`: - `feat_tex0`: `pack2x16float(alb.rg)`, `pack2x16float(alb.b, nrm.x)`, `pack2x16float(nrm.y, depth)`, `pack2x16float(dzdx, dzdy)` -- `feat_tex1`: `pack4x8unorm(matid,0,0,0)`, `pack4x8unorm(mip1.rgb, mip2.r)`, `pack4x8unorm(mip2.gb, shadow, transp)` +- `feat_tex1`: `pack4x8unorm(matid,0,0,0)`, `pack4x8unorm(mip1.rgb, mip2.r)`, `pack4x8unorm(mip2.gb, dif, transp)` - Depth gradients: central differences on depth R channel - Mip1 / Mip2: box2 (2×2) / box4 (4×4) average filter on albedo diff --git a/cnn_v3/docs/HOW_TO_CNN.md b/cnn_v3/docs/HOW_TO_CNN.md index 458b68f..4966a61 100644 --- a/cnn_v3/docs/HOW_TO_CNN.md +++ b/cnn_v3/docs/HOW_TO_CNN.md @@ -97,7 +97,7 @@ It calls `pack_photo_sample.py` with both `--photo` and `--target` in a single s | `normal.png` | (128, 128, 0) uint8 | Neutral "no normal" → reconstructed (0,0,1) | | `depth.png` | All zeros uint16 | No depth data | | `matid.png` | All zeros uint8 | No material IDs | -| `shadow.png` | 255 everywhere uint8 | Assume fully lit | +| `shadow.png` | 255 everywhere uint8 | Assume fully lit (used to compute dif; NOTE(review): with the neutral normal (0,0,1) this gives dif = dot((0,0,1), KEY_LIGHT) ≈ 0.41, not the dif = 1.0 assumed by the web tool's simple mode — confirm intended photo-mode default) | | `transp.png` | 1 − alpha uint8 | 0 = opaque | | `target.png` | Stylized target RGBA | Ground truth for training | @@ -134,7 +134,7 @@ done ### 1b. From Blender (Full G-Buffer) -Produces all 20 feature channels including normals, depth, mat IDs, and shadow. +Produces all 20 feature channels including normals, depth, mat IDs, and dif (diffuse×shadow). 
#### Blender requirements @@ -420,7 +420,7 @@ Applied per-sample to make the model robust to missing channels: | Channel group | Channels | Drop probability | |---------------|----------|-----------------| | Geometric | normal.xy, depth, depth_grad.xy [3,4,5,6,7] | `channel_dropout_p` (default 0.3) | -| Context | mat_id, shadow, transp [8,18,19] | `channel_dropout_p × 0.67` (~0.2) | +| Context | mat_id, dif, transp [8,18,19] | `channel_dropout_p × 0.67` (~0.2) | | Temporal | prev.rgb [9,10,11] | 0.5 (always) | This is why a model trained on Blender data also works on photos (geometry zeroed). @@ -781,7 +781,7 @@ Both produced by `export_cnn_v3_weights.py` (§3). | Texture | Format | Size | |---------|--------|------| | `feat_tex0` | rgba32uint | W × H (8 f16: albedo, normal, depth, depth_grad) | -| `feat_tex1` | rgba32uint | W × H (12 u8: mat_id, prev, mip1, mip2, shadow, transp) | +| `feat_tex1` | rgba32uint | W × H (12 u8: mat_id, prev, mip1, mip2, dif, transp) | | `enc0_tex` | rgba16float | W × H | | `enc1_tex` | rgba32uint | W/2 × H/2 (8 f16 packed) | | `bn_tex` | rgba32uint | W/4 × H/4 | @@ -790,7 +790,7 @@ Both produced by `export_cnn_v3_weights.py` (§3). ### Simple mode (photo input) -Albedo = image RGB, mip1/mip2 from GPU mipmaps, shadow = 1.0, transp = 1 − alpha, +Albedo = image RGB, mip1/mip2 from GPU mipmaps, dif = 1.0 (fully lit assumed), transp = 1 − alpha, all geometric channels (normal, depth, depth_grad, mat_id, prev) = 0. ### Browser requirements @@ -843,7 +843,7 @@ all geometric channels (normal, depth, depth_grad, mat_id, prev) = 0. 
| 9–11 | prev.rgb | previous frame output | zero during training | | 12–14 | mip1.rgb | pyrdown(albedo) | f32 [0,1] | | 15–17 | mip2.rgb | pyrdown(mip1) | f32 [0,1] | -| 18 | shadow | `shadow.png` | f32 [0,1] (1=lit) | +| 18 | dif | computed | f32 [0,1] max(0,dot(normal,KEY_LIGHT))×shadow | | 19 | transp | `transp.png` | f32 [0,1] (0=opaque) | **Feature texture packing** (`feat_tex0` / `feat_tex1`, both `rgba32uint`): @@ -858,6 +858,6 @@ feat_tex0 (4×u32 = 8 f16 channels via pack2x16float): feat_tex1 (4×u32 = 12 u8 channels + padding via pack4x8unorm): .x = pack4x8unorm(mat_id, prev.r, prev.g, prev.b) .y = pack4x8unorm(mip1.r, mip1.g, mip1.b, mip2.r) - .z = pack4x8unorm(mip2.g, mip2.b, shadow, transp) + .z = pack4x8unorm(mip2.g, mip2.b, dif, transp) .w = 0 (unused, 8 reserved channels) ``` diff --git a/cnn_v3/shaders/gbuf_deferred.wgsl b/cnn_v3/shaders/gbuf_deferred.wgsl index bcc42cc..7257122 100644 --- a/cnn_v3/shaders/gbuf_deferred.wgsl +++ b/cnn_v3/shaders/gbuf_deferred.wgsl @@ -40,9 +40,9 @@ fn fs_main(@builtin(position) pos: vec4f) -> @location(0) vec4f { let normal = oct_decode(vec2f(bx.y, ny_d.x)); let diffuse = max(0.0, dot(normal, KEY_LIGHT)); - // feat_tex1[3] = pack4x8unorm(dif.r, dif.g, dif.b, spare) — dif = diffuse*shadow + // feat_tex1[2] = pack4x8unorm(mip2.g, mip2.b, dif, transp) — dif at byte 2 let t1 = textureLoad(feat_tex1, coord, 0); - let dif = unpack4x8unorm(t1.w).x; + let dif = unpack4x8unorm(t1.z).z; return vec4f(albedo * (AMBIENT + dif), 1.0); } diff --git a/cnn_v3/shaders/gbuf_pack.wgsl b/cnn_v3/shaders/gbuf_pack.wgsl index dd8d73b..777b4e5 100644 --- a/cnn_v3/shaders/gbuf_pack.wgsl +++ b/cnn_v3/shaders/gbuf_pack.wgsl @@ -106,13 +106,13 @@ fn pack_features(@builtin(global_invocation_id) id: vec3u) { // Texture 1: 4 u32, each = pack4x8unorm of four u8 values // [0] mat_id | prev.r | prev.g | prev.b // [1] mip1.r | mip1.g | mip1.b | mip2.r - // [2] mip2.g | mip2.b | transp | (spare) - // [3] dif.r | dif.g | dif.b | (spare) — dif = 
diffuse*shadow (scalar, stored in all 3) + // [2] mip2.g | mip2.b | dif | transp — ch18=dif, ch19=transp + // [3] spare let t1 = vec4u( pack4x8unorm(vec4f(mat_id_u8, prev.r, prev.g, prev.b)), pack4x8unorm(vec4f(mip1.r, mip1.g, mip1.b, mip2.r)), - pack4x8unorm(vec4f(mip2.g, mip2.b, transp, 0.0)), - pack4x8unorm(vec4f(dif, dif, dif, 0.0)) + pack4x8unorm(vec4f(mip2.g, mip2.b, dif, transp)), + 0u ); textureStore(feat_tex1, coord, t1); } diff --git a/cnn_v3/shaders/gbuf_view.wgsl b/cnn_v3/shaders/gbuf_view.wgsl index d53b6f6..6a812e6 100644 --- a/cnn_v3/shaders/gbuf_view.wgsl +++ b/cnn_v3/shaders/gbuf_view.wgsl @@ -1,5 +1,5 @@ -// G-buffer channel visualization — 4×6 grid of 23 feature channels. -// Takes feat_tex0 (rgba32uint, ch 0-7 f16) and feat_tex1 (rgba32uint, ch 8-22 unorm8). +// G-buffer channel visualization — 4×5 grid of 20 feature channels. +// Takes feat_tex0 (rgba32uint, ch 0-7 f16) and feat_tex1 (rgba32uint, ch 8-19 unorm8). // Outputs tiled channel view to a standard rgba8unorm render target. 
// // Channel layout (row×col): @@ -7,8 +7,7 @@ // Row 1: ch4(nrm.y) ch5(depth) ch6(dzdx) ch7(dzdy) // Row 2: ch8(matid) ch9(prv.r) ch10(prv.g) ch11(prv.b) // Row 3: ch12(m1.r) ch13(m1.g) ch14(m1.b) ch15(m2.r) -// Row 4: ch16(m2.g) ch17(m2.b) ch18(trns) ch19(spare) -// Row 5: ch20(dif.r) ch21(dif.g) ch22(dif.b) ch23(spare) +// Row 4: ch16(m2.g) ch17(m2.b) ch18(dif) ch19(trns) #include "debug/debug_print" @@ -30,12 +29,12 @@ fn fs_main(@builtin(position) pos: vec4f) -> @location(0) vec4f { let uv = pos.xy / u.resolution; let COLS = 4.0; - let ROWS = 6.0; + let ROWS = 5.0; let col = u32(uv.x * COLS); let row = u32(uv.y * ROWS); let ch = row * 4u + col; - if (col >= 4u || ch == 19u || ch >= 23u) { + if (col >= 4u || ch >= 20u) { return vec4f(0.05, 0.05, 0.05, 1.0); } @@ -72,8 +71,7 @@ fn fs_main(@builtin(position) pos: vec4f) -> @location(0) vec4f { var bytes: vec4f; if (comp_idx == 0u) { bytes = unpack4x8unorm(t.x); } else if (comp_idx == 1u) { bytes = unpack4x8unorm(t.y); } - else if (comp_idx == 2u) { bytes = unpack4x8unorm(t.z); } - else { bytes = unpack4x8unorm(t.w); } + else { bytes = unpack4x8unorm(t.z); } var ba = array<f32, 4>(bytes.x, bytes.y, bytes.z, bytes.w); v = ba[sub]; } @@ -122,10 +120,8 @@ fn fs_main(@builtin(position) pos: vec4f) -> @location(0) vec4f { case 15u: { out = debug_str(out, pos.xy, origin, vec4u(0x6D322E72u, 0u, 0u, 0u), 4u); } // m2.r case 16u: { out = debug_str(out, pos.xy, origin, vec4u(0x6D322E67u, 0u, 0u, 0u), 4u); } // m2.g case 17u: { out = debug_str(out, pos.xy, origin, vec4u(0x6D322E62u, 0u, 0u, 0u), 4u); } // m2.b - case 18u: { out = debug_str(out, pos.xy, origin, vec4u(0x74726E73u, 0u, 0u, 0u), 4u); } // trns - case 20u: { out = debug_str(out, pos.xy, origin, vec4u(0x6469662Eu, 0x72000000u, 0u, 0u), 5u); } // dif.r - case 21u: { out = debug_str(out, pos.xy, origin, vec4u(0x6469662Eu, 0x67000000u, 0u, 0u), 5u); } // dif.g - default: { out = debug_str(out, pos.xy, origin, vec4u(0x6469662Eu, 0x62000000u, 0u, 0u), 5u); } // dif.b 
+ case 18u: { out = debug_str(out, pos.xy, origin, vec4u(0x64696600u, 0u, 0u, 0u), 3u); } // dif + default: { out = debug_str(out, pos.xy, origin, vec4u(0x74726E73u, 0u, 0u, 0u), 4u); } // trns } return out; } diff --git a/cnn_v3/tools/shaders.js b/cnn_v3/tools/shaders.js index f178637..6c49864 100644 --- a/cnn_v3/tools/shaders.js +++ b/cnn_v3/tools/shaders.js @@ -272,6 +272,10 @@ const FULL_PACK_SHADER=` @group(0) @binding(5) var transp: texture_2d<f32>; @group(0) @binding(6) var f0: texture_storage_2d<rgba32uint,write>; @group(0) @binding(7) var f1: texture_storage_2d<rgba32uint,write>; +fn oct_decode(f:vec2f)->vec3f{ + var n=vec3f(f.x,f.y,1.-abs(f.x)-abs(f.y)); + if(n.z<0.){n.x=(1.-abs(f.y))*sign(f.x); n.y=(1.-abs(f.x))*sign(f.y);} + return normalize(n);} fn ld(c:vec2i,d:vec2i)->f32{return textureLoad(depth,clamp(c,vec2i(0),d-vec2i(1)),0).r;} fn b2(tl:vec2i,d:vec2i)->vec3f{ var s=vec3f(0.); @@ -299,9 +303,10 @@ fn main(@builtin(global_invocation_id) id:vec3u){ let mid=textureLoad(matid,c,0).r; let shd=textureLoad(shadow,c,0).r; let trp=textureLoad(transp,c,0).r; + let dif=max(0.,dot(oct_decode(oct),vec3f(0.408,0.816,0.408)))*shd; let m1=b2(c-vec2i(0),d); let m2=b4(c-vec2i(1),d); textureStore(f1,c,vec4u( pack4x8unorm(vec4f(mid,0.,0.,0.)), pack4x8unorm(vec4f(m1.r,m1.g,m1.b,m2.r)), - pack4x8unorm(vec4f(m2.g,m2.b,shd,trp)), + pack4x8unorm(vec4f(m2.g,m2.b,dif,trp)), 0u));}`; diff --git a/cnn_v3/training/cnn_v3_utils.py b/cnn_v3/training/cnn_v3_utils.py index 5a3d56c..bef4091 100644 --- a/cnn_v3/training/cnn_v3_utils.py +++ b/cnn_v3/training/cnn_v3_utils.py @@ -11,7 +11,7 @@ Imported by train_cnn_v3.py and export_cnn_v3_weights.py. 
[9-11] prev.rgb f32 (zero during training) [12-14] mip1.rgb pyrdown(albedo) [15-17] mip2.rgb pyrdown(mip1) - [18] shadow f32 [0,1] + [18] dif f32 [0,1] max(0,dot(normal,KEY_LIGHT))*shadow [19] transp f32 [0,1] Sample directory layout (per sample_xxx/): @@ -48,10 +48,11 @@ from torch.utils.data import Dataset N_FEATURES = 20 GEOMETRIC_CHANNELS = [3, 4, 5, 6, 7] # normal.xy, depth, depth_grad.xy -CONTEXT_CHANNELS = [8, 18, 19] # mat_id, shadow, transp +CONTEXT_CHANNELS = [8, 18, 19] # mat_id, dif, transp TEMPORAL_CHANNELS = [9, 10, 11] # prev.rgb -_LUMA = np.array([0.2126, 0.7152, 0.0722], dtype=np.float32) # BT.709 +_LUMA = np.array([0.2126, 0.7152, 0.0722], dtype=np.float32) # BT.709 +_KEY_LIGHT = np.array([0.408, 0.816, 0.408 ], dtype=np.float32) # normalize(1,2,1) # --------------------------------------------------------------------------- # Image I/O @@ -102,6 +103,21 @@ def depth_gradient(depth: np.ndarray) -> np.ndarray: return np.stack([dzdx, dzdy], axis=-1) +def oct_decode(enc: np.ndarray) -> np.ndarray: + """Decode oct-encoded normals (H,W,2) in [0,1] → (H,W,3) unit normals.""" + f = enc * 2.0 - 1.0 # [0,1] → [-1,1] + z = 1.0 - np.abs(f[..., :1]) - np.abs(f[..., 1:2]) + n = np.concatenate([f, z], axis=-1) + neg = n[..., 2:3] < 0.0 + n = np.concatenate([ + np.where(neg, (1.0 - np.abs(f[..., 1:2])) * np.sign(f[..., :1]), n[..., :1]), + np.where(neg, (1.0 - np.abs(f[..., :1])) * np.sign(f[..., 1:2]), n[..., 1:2]), + n[..., 2:3], + ], axis=-1) + length = np.linalg.norm(n, axis=-1, keepdims=True) + return n / np.maximum(length, 1e-8) + + def _upsample_nearest(a: np.ndarray, h: int, w: int) -> np.ndarray: """Nearest-neighbour upsample (H,W,C) f32 to (h,w,C) — pure numpy, no precision loss.""" sh, sw = a.shape[:2] @@ -117,25 +133,29 @@ def assemble_features(albedo: np.ndarray, normal: np.ndarray, prev set to zero (no temporal history during training). mip1/mip2 computed from albedo. depth_grad computed via finite diff. 
+ dif (ch18) = max(0, dot(oct_decode(normal), KEY_LIGHT)) * shadow. """ h, w = albedo.shape[:2] - mip1 = _upsample_nearest(pyrdown(albedo), h, w) - mip2 = _upsample_nearest(pyrdown(pyrdown(albedo)), h, w) - dgrad = depth_gradient(depth) - prev = np.zeros((h, w, 3), dtype=np.float32) + mip1 = _upsample_nearest(pyrdown(albedo), h, w) + mip2 = _upsample_nearest(pyrdown(pyrdown(albedo)), h, w) + dgrad = depth_gradient(depth) + prev = np.zeros((h, w, 3), dtype=np.float32) + nor3 = oct_decode(normal) + diffuse = np.maximum(0.0, (nor3 * _KEY_LIGHT).sum(-1)) + dif = diffuse * shadow return np.concatenate([ - albedo, # [0-2] albedo.rgb - normal, # [3-4] normal.xy - depth[..., None], # [5] depth - dgrad, # [6-7] depth_grad.xy - matid[..., None], # [8] mat_id - prev, # [9-11] prev.rgb - mip1, # [12-14] mip1.rgb - mip2, # [15-17] mip2.rgb - shadow[..., None], # [18] shadow - transp[..., None], # [19] transp + albedo, # [0-2] albedo.rgb + normal, # [3-4] normal.xy + depth[..., None], # [5] depth + dgrad, # [6-7] depth_grad.xy + matid[..., None], # [8] mat_id + prev, # [9-11] prev.rgb + mip1, # [12-14] mip1.rgb + mip2, # [15-17] mip2.rgb + dif[..., None], # [18] dif = diffuse * shadow + transp[..., None],# [19] transp ], axis=-1).astype(np.float32) |
