5 files changed, 254 insertions, 42 deletions
diff --git a/cnn_v3/shaders/gbuf_deferred.wgsl b/cnn_v3/shaders/gbuf_deferred.wgsl
new file mode 100644
index 0000000..7257122
--- /dev/null
+++ b/cnn_v3/shaders/gbuf_deferred.wgsl
@@ -0,0 +1,48 @@
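+// G-buffer deferred render — albedo * (ambient + dif)
+// Reads feat_tex0 (rgba32uint, f16 geometric) and feat_tex1 (rgba32uint, packed u8 context).
+// Outputs albedo * (AMBIENT + dif); dif is the pre-shadowed diffuse term packed by gbuf_pack.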
+
+#include "math/normal"
+
+@group(0) @binding(0) var feat_tex0: texture_2d<u32>;
+@group(0) @binding(1) var feat_tex1: texture_2d<u32>;
+@group(0) @binding(2) var<uniform> uniforms: GBufDeferredUniforms;
+
+struct GBufDeferredUniforms {
+    resolution: vec2f,  // not read by vs_main/fs_main in this file
+}
+
+const KEY_LIGHT: vec3f = vec3f(0.408, 0.816, 0.408);  // normalize(1,2,1)
+const AMBIENT:   f32   = 0.15;
+
+@vertex
+fn vs_main(@builtin(vertex_index) vid: u32) -> @builtin(position) vec4f {
+    // Fullscreen triangle: vid 0,1,2 -> (-1,-1), (3,-1), (-1,3); bit 0 drives x, bit 1 drives y.
+    let corner = vec2f(f32((vid & 1u) << 2u), f32((vid & 2u) << 1u));
+    return vec4f(corner - vec2f(1.0), 0.0, 1.0);
+}
+
+@fragment
+fn fs_main(@builtin(position) pos: vec4f) -> @location(0) vec4f {
+    // Integer texel coordinate of this fragment.
+    let coord = vec2i(pos.xy);
+
+    // feat_tex0 layout:
+    //   [0] pack2x16float(albedo.r, albedo.g)
+    //   [1] pack2x16float(albedo.b, normal.x)   oct [-1,1]
+    //   [2] pack2x16float(normal.y, depth)       oct [-1,1]
+    // Only albedo is needed here; the normal/depth halves are left untouched.
+    let t0     = textureLoad(feat_tex0, coord, 0);
+    let rg     = unpack2x16float(t0.x);
+    let bx     = unpack2x16float(t0.y);
+    let albedo = vec3f(rg.x, rg.y, bx.x);
+
+    // Lighting is not recomputed from the normal: gbuf_pack already stored
+    // dif = max(0, dot(N, L)) * shadow in feat_tex1, so that value is used as-is.
+    // feat_tex1[2] = pack4x8unorm(mip2.g, mip2.b, dif, transp) — dif at byte 2
+    let t1  = textureLoad(feat_tex1, coord, 0);
+    let dif = unpack4x8unorm(t1.z).z;
+
+    // Ambient floor plus shadowed diffuse; output alpha is always 1.
+    return vec4f(albedo * (AMBIENT + dif), 1.0);
+}
diff --git a/cnn_v3/shaders/gbuf_pack.wgsl b/cnn_v3/shaders/gbuf_pack.wgsl
index 71d8471..777b4e5 100644
--- a/cnn_v3/shaders/gbuf_pack.wgsl
+++ b/cnn_v3/shaders/gbuf_pack.wgsl
@@ -1,4 +1,5 @@
 // G-buffer pack compute shader for CNN v3
+#include "math/normal"
 // Pass 4: Pack all G-buffer channels into two rgba32uint feature textures (32 bytes/pixel)
 // Output feat_tex0 holds 8×f16 geometric channels; feat_tex1 holds 12×u8 context channels.
 
@@ -44,15 +45,6 @@ fn box4(tl: vec2i) -> vec3f {
     return acc * (1.0 / 16.0);
 }
 
-// Decode oct-normal from [0,1] storage → [-1,1] encoded xy → reconstruct z.
-fn decode_oct_normal(rg: vec2f) -> vec3f {
-    let f = rg * 2.0 - vec2f(1.0);
-    var n = vec3f(f.x, f.y, 1.0 - abs(f.x) - abs(f.y));
-    let t = max(-n.z, 0.0);
-    n.x += select(t, -t, n.x >= 0.0);
-    n.y += select(t, -t, n.y >= 0.0);
-    return normalize(n);
-}
 
 @compute @workgroup_size(8, 8)
 fn pack_features(@builtin(global_invocation_id) id: vec3u) {
@@ -73,7 +65,7 @@ fn pack_features(@builtin(global_invocation_id) id: vec3u) {
 
     // Normal: stored as oct-encoded [0,1] in RG; extract just the encoded xy for feat_tex0
     let normal_enc = nm.rg; // already in [0,1] — decode to get the xy for CNN input
-    let n3         = decode_oct_normal(normal_enc);
+    let n3         = oct_decode_unorm(normal_enc);
     // Store oct-encoded in [-1,1] remapped back to what CNN expects (the [-1,1] oct xy)
     let oct_xy     = normal_enc * 2.0 - vec2f(1.0); // remap [0,1] → [-1,1]
 
@@ -94,6 +86,9 @@ fn pack_features(@builtin(global_invocation_id) id: vec3u) {
     let mat_id_u8 = nm.b;                                       // mat_id already in [0,1]
     let shadow    = textureLoad(gbuf_shadow, coord, 0).r;
     let transp    = textureLoad(gbuf_transp, coord, 0).r;
+    let nor       = oct_decode_unorm(nm.rg);
+    let diffuse   = max(0.0, dot(nor, vec3f(0.408, 0.816, 0.408)));
+    let dif       = diffuse * shadow;
     let prev      = textureSampleLevel(prev_cnn, bilinear_sampler, uv, 0.0).rgb;
 
     // MIP 1: 2×2 box filter (half resolution context)
@@ -111,12 +106,12 @@ fn pack_features(@builtin(global_invocation_id) id: vec3u) {
     // Texture 1: 4 u32, each = pack4x8unorm of four u8 values
     // [0] mat_id | prev.r | prev.g | prev.b
     // [1] mip1.r | mip1.g | mip1.b | mip2.r
-    // [2] mip2.g | mip2.b | shadow | transp
-    // [3] spare  (0)
+    // [2] mip2.g | mip2.b | dif    | transp   — ch18=dif, ch19=transp
+    // [3] spare
     let t1 = vec4u(
         pack4x8unorm(vec4f(mat_id_u8, prev.r, prev.g, prev.b)),
         pack4x8unorm(vec4f(mip1.r, mip1.g, mip1.b, mip2.r)),
-        pack4x8unorm(vec4f(mip2.g, mip2.b, shadow, transp)),
+        pack4x8unorm(vec4f(mip2.g, mip2.b, dif, transp)),
         0u
     );
     textureStore(feat_tex1, coord, t1);
diff --git a/cnn_v3/shaders/gbuf_raster.wgsl b/cnn_v3/shaders/gbuf_raster.wgsl
index c762db2..aed0e72 100644
--- a/cnn_v3/shaders/gbuf_raster.wgsl
+++ b/cnn_v3/shaders/gbuf_raster.wgsl
@@ -1,8 +1,11 @@
 // G-buffer rasterization shader for CNN v3
 // Pass 1: Proxy geometry → MRT (albedo rgba16float, normal_mat rgba16float, depth32)
 // Uses GlobalUniforms, ObjectData, ObjectsBuffer from common_uniforms.
+// SPHERE objects use ray-sphere impostor (correct silhouette + normal + depth).
 
 #include "common_uniforms"
+#include "math/normal"
+#include "ray_sphere"
 
 @group(0) @binding(0) var<uniform> globals: GlobalUniforms;
 @group(0) @binding(1) var<storage, read> object_data: ObjectsBuffer;
@@ -15,21 +18,6 @@ struct VertexOutput {
     @location(3) @interpolate(flat) instance_index: u32,
 }
 
-// Octahedral encoding: maps unit normal to [-1,1]^2
-fn oct_encode(n: vec3f) -> vec2f {
-    let inv_l1 = 1.0 / (abs(n.x) + abs(n.y) + abs(n.z));
-    var p = n.xy * inv_l1;
-    // Fold lower hemisphere
-    if (n.z < 0.0) {
-        let s = vec2f(
-            select(-1.0, 1.0, p.x >= 0.0),
-            select(-1.0, 1.0, p.y >= 0.0)
-        );
-        p = (1.0 - abs(p.yx)) * s;
-    }
-    return p; // in [-1, 1]
-}
-
 @vertex
 fn vs_main(
     @builtin(vertex_index)   vertex_index:   u32,
@@ -73,8 +61,8 @@ fn vs_main(
 
     let world_pos    = obj.model * vec4f(p, 1.0);
     let clip_pos     = globals.view_proj * world_pos;
-    // Transform normal by inverse-transpose (upper-left 3×3 of inv_model^T)
-    let world_normal = normalize((obj.inv_model * vec4f(n, 0.0)).xyz);
+    // Transform normal: use model matrix (correct for uniform scale + rotation).
+    let world_normal = normalize((obj.model * vec4f(n, 0.0)).xyz);
 
     var out: VertexOutput;
     out.position       = clip_pos;
@@ -86,20 +74,40 @@ fn vs_main(
 }
 
 struct GBufOutput {
-    @location(0) albedo:     vec4f,  // rgba16float: material color
-    @location(1) normal_mat: vec4f,  // rgba16float: oct-normal XY in RG, mat_id/255 in B
+    @location(0)            albedo:     vec4f,  // rgba16float: material color
+    @location(1)            normal_mat: vec4f,  // rgba16float: oct-normal XY (unorm-encoded) in RG, mat_id/255 in B
+    @builtin(frag_depth)    depth:      f32,    // hardware depth, or reprojected hit depth for sphere impostors
 }
 
 @fragment
 fn fs_main(in: VertexOutput) -> GBufOutput {
-    let obj    = object_data.objects[in.instance_index];
-    let mat_id = f32(in.instance_index) / 255.0;
+    let obj      = object_data.objects[in.instance_index];
+    let obj_type = u32(obj.params.x);  // 1 = sphere (takes the impostor path below)
+    let mat_id   = f32(in.instance_index) / 255.0;  // instance index doubles as the material id
+
+    var world_normal = normalize(in.world_normal);
+    var frag_depth   = in.position.z;  // default: hardware depth
+
+    // Sphere impostor: ray-sphere intersection for correct silhouette and normal.
+    if (obj_type == 1u) {
+        let sphere_center = obj.model[3].xyz;  // model matrix column 3, used as the sphere center
+        let sphere_radius = length(obj.model[0].xyz);  // uniform scale in col0
+        let cam_pos = globals.camera_pos_time.xyz;
+        let rd = normalize(in.world_pos - cam_pos);  // camera ray through this proxy fragment
+        let isect = ray_sphere_intersection(cam_pos, rd, sphere_center, sphere_radius);
+        if (!isect.hit) { discard; }  // proxy fragment lies outside the true sphere silhouette
+        let hit = cam_pos + rd * isect.t;
+        world_normal = normalize(hit - sphere_center);
+        // Reproject hit point to get correct clip-space depth.
+        let clip_hit = globals.view_proj * vec4f(hit, 1.0);
+        frag_depth   = clip_hit.z / clip_hit.w;  // perspective divide
+    }
 
-    // Oct-encode world normal, remap [-1,1] → [0,1] for storage
-    let oct = oct_encode(normalize(in.world_normal)) * 0.5 + vec2f(0.5);
+    let oct = oct_encode_unorm(world_normal);  // oct-encoded (possibly impostor-corrected) normal for RG storage
 
     var out: GBufOutput;
     out.albedo     = vec4f(in.color.rgb, 1.0);
     out.normal_mat = vec4f(oct.x, oct.y, mat_id, 0.0);
+    out.depth      = frag_depth;
     return out;
 }
diff --git a/cnn_v3/shaders/gbuf_shadow.wgsl b/cnn_v3/shaders/gbuf_shadow.wgsl
new file mode 100644
index 0000000..65ae1fa
--- /dev/null
+++ b/cnn_v3/shaders/gbuf_shadow.wgsl
@@ -0,0 +1,136 @@
+// G-buffer shadow raymarching shader for CNN v3
+// Pass 2: Reads depth and the normal/mat target from Pass 1, marches soft-shadow rays toward lights,
+// outputs shadow factor (1.0=lit, 0.0=shadow) to RGBA8Unorm render target (.r, replicated to .g/.b).
+
+#include "common_uniforms"
+#include "camera_common"
+#include "math/sdf_shapes"
+#include "math/normal"
+#include "render/raymarching_id"
+
+@group(0) @binding(0) var<uniform>         globals:     GlobalUniforms;
+@group(0) @binding(1) var<storage, read>   object_data: ObjectsBuffer;
+@group(0) @binding(2) var                  depth_tex:   texture_depth_2d;
+@group(0) @binding(4) var                  normal_mat_tex: texture_2d<f32>;
+
+struct GBufLight {
+    direction: vec4f,  // xyz = toward light (world space, normalized)
+    color:     vec4f,  // rgb = color, a = intensity (not read by this shader)
+}
+struct GBufLightsUniforms {
+    lights: array<GBufLight, 2>,  // fixed capacity: at most 2 lights
+    params: vec4f,  // x = num_lights (converted with u32() in fs_main)
+}
+@group(0) @binding(3) var<uniform> lights: GBufLightsUniforms;
+
+// ---- SDF scene (one analytic proxy shape per object: sphere / plane / torus / box) ----
+
+// Stub presumably required by the included render/raymarching_id helpers (original note: shadow() / rayMarch() call df()) — TODO confirm; not called in this file.
+fn df(p: vec3f) -> f32 { return MAX_RAY_LENGTH; }
+
+// Scene SDF with object identification.
+// Returns the distance to the closest object and that object's 1-based index
+// (object_id stays 0.0 when nothing is closer than MAX_RAY_LENGTH).
+fn dfWithID(p: vec3f) -> RayMarchResult {
+    var closest: RayMarchResult;
+    closest.distance     = MAX_RAY_LENGTH;
+    closest.distance_max = MAX_RAY_LENGTH;
+    closest.object_id    = 0.0;
+
+    let object_count = u32(globals.params.x);
+    for (var idx = 0u; idx < object_count; idx += 1u) {
+        let obj        = object_data.objects[idx];
+        let shape_type = u32(obj.params.x);
+        var dist: f32;
+        switch shape_type {
+            case 1u: {
+                // SPHERE: evaluated directly in world space (position = model col 3, radius = |model col 0|).
+                let sphere_pos = obj.model[3].xyz;
+                let sphere_r   = length(obj.model[0].xyz);
+                dist = length(p - sphere_pos) - sphere_r;
+            }
+            case 2u: {
+                // PLANE: local-space evaluation, no scale correction applied.
+                let local_p = (obj.inv_model * vec4f(p, 1.0)).xyz;
+                dist = sdPlane(local_p, vec3f(0.0, 1.0, 0.0), obj.params.y);
+            }
+            case 3u: {
+                // TORUS: local-space distance rescaled by |model col 0|.
+                let local_p     = (obj.inv_model * vec4f(p, 1.0)).xyz;
+                let world_scale = length(obj.model[0].xyz);
+                dist = sdTorus(local_p, vec2f(0.8, 0.2)) * world_scale;
+            }
+            default: {
+                // CUBE (0) and any other type: unit box, uniform scale assumed.
+                let local_p     = (obj.inv_model * vec4f(p, 1.0)).xyz;
+                let world_scale = length(obj.model[0].xyz);
+                dist = sdBox(local_p, vec3f(1.0)) * world_scale;
+            }
+        }
+        if (dist < closest.distance) {
+            closest.distance  = dist;
+            closest.object_id = f32(idx + 1u);
+        }
+    }
+    return closest;
+}
+
+// Soft shadow march (IQ penumbra formula) from ro along rd. Returns 1=lit, 0=shadow.
+// The march is unbounded: away from geometry 8*d/t exceeds 1, so the running min stays at 1.
+fn soft_shadow(ro: vec3f, rd: vec3f) -> f32 {
+    var travelled  = 0.001;
+    var light_frac = 1.0;
+    for (var step_idx = 0; step_idx < 64; step_idx += 1) {
+        let gap = dfWithID(ro + rd * travelled).distance;
+        if (gap < 0.0005) { return 0.0; }
+        light_frac = min(light_frac, 8.0 * gap / travelled);
+        travelled += gap;
+    }
+    return clamp(light_frac, 0.0, 1.0);
+}
+
+// ---- Vertex: fullscreen triangle ----
+
+@vertex
+fn vs_main(@builtin(vertex_index) vid: u32) -> @builtin(position) vec4f {
+    // Fullscreen triangle: vid 0,1,2 -> (-1,-1), (3,-1), (-1,3); bit 0 drives x, bit 1 drives y.
+    let corner = vec2f(f32((vid & 1u) << 2u), f32((vid & 2u) << 1u));
+    return vec4f(corner - vec2f(1.0), 0.0, 1.0);
+}
+
+// ---- Fragment: shadow factor per pixel ----
+
+@fragment
+fn fs_main(@builtin(position) pos: vec4f) -> @location(0) vec4f {
+    let texel = vec2i(pos.xy);
+    let depth = textureLoad(depth_tex, texel, 0);
+    // Sky / background: fully lit.
+    if (depth >= 1.0) {
+        return vec4f(1.0);
+    }
+
+    // Reconstruct world-space position from NDC + depth.
+    let res = globals.resolution;
+    let ndc = vec2f(
+        (pos.x / res.x) * 2.0 - 1.0,
+        1.0 - (pos.y / res.y) * 2.0
+    );
+    let clip  = globals.inv_view_proj * vec4f(ndc, depth, 1.0);
+    let world = clip.xyz / clip.w;
+
+    // Use rasterized surface normal for bias — correct for sphere impostors.
+    let nm       = textureLoad(normal_mat_tex, texel, 0);
+    let nor      = oct_decode_unorm(nm.rg);
+    let bias_pos = world + nor * 0.02;
+
+    // March shadow rays toward each light; take the darkest value.
+    var shadow_val  = 1.0;
+    let num_lights  = min(u32(lights.params.x), 2u);  // clamp to lights[] capacity (array<GBufLight, 2>)
+    for (var i = 0u; i < num_lights; i++) {
+        let ld = lights.lights[i].direction.xyz;
+        let s  = soft_shadow(bias_pos, ld);
+        shadow_val = min(shadow_val, s);
+    }
+
+    return vec4f(shadow_val, shadow_val, shadow_val, 1.0);
+}
diff --git a/cnn_v3/shaders/gbuf_view.wgsl b/cnn_v3/shaders/gbuf_view.wgsl
index f2ae085..6a812e6 100644
--- a/cnn_v3/shaders/gbuf_view.wgsl
+++ b/cnn_v3/shaders/gbuf_view.wgsl
@@ -7,7 +7,9 @@
 //   Row 1: ch4(nrm.y)  ch5(depth)  ch6(dzdx)   ch7(dzdy)
 //   Row 2: ch8(matid)  ch9(prv.r)  ch10(prv.g) ch11(prv.b)
 //   Row 3: ch12(m1.r)  ch13(m1.g)  ch14(m1.b)  ch15(m2.r)
-//   Row 4: ch16(m2.g)  ch17(m2.b)  ch18(shdw)  ch19(trns)
+//   Row 4: ch16(m2.g)  ch17(m2.b)  ch18(dif)   ch19(trns)
+
+#include "debug/debug_print"
 
 struct GBufViewUniforms { resolution: vec2f }
 
@@ -93,10 +95,33 @@ fn fs_main(@builtin(position) pos: vec4f) -> @location(0) vec4f {
         disp = clamp(v, 0.0, 1.0);
     }
 
-    // Albedo channels: tint for identification (ch0=red, ch1=green, ch2=blue)
-    if      (ch == 0u) { return vec4f(disp, 0.0,  0.0,  1.0); }
-    else if (ch == 1u) { return vec4f(0.0,  disp, 0.0,  1.0); }
-    else if (ch == 2u) { return vec4f(0.0,  0.0,  disp, 1.0); }
+    var out = vec4f(disp, disp, disp, 1.0);
 
-    return vec4f(disp, disp, disp, 1.0);
+    // Label at top-left of each tile
+    let tile_w = u.resolution.x / 4.0;
+    let tile_h = u.resolution.y / 5.0;
+    let origin = vec2f(f32(col) * tile_w + 4.0, f32(row) * tile_h + 4.0);
+    switch ch {
+        case  0u: { out = debug_str(out, pos.xy, origin, vec4u(0x616C622Eu, 0x72000000u, 0u, 0u), 5u); } // alb.r
+        case  1u: { out = debug_str(out, pos.xy, origin, vec4u(0x616C622Eu, 0x67000000u, 0u, 0u), 5u); } // alb.g
+        case  2u: { out = debug_str(out, pos.xy, origin, vec4u(0x616C622Eu, 0x62000000u, 0u, 0u), 5u); } // alb.b
+        case  3u: { out = debug_str(out, pos.xy, origin, vec4u(0x6E726D2Eu, 0x78000000u, 0u, 0u), 5u); } // nrm.x
+        case  4u: { out = debug_str(out, pos.xy, origin, vec4u(0x6E726D2Eu, 0x79000000u, 0u, 0u), 5u); } // nrm.y
+        case  5u: { out = debug_str(out, pos.xy, origin, vec4u(0x64657074u, 0x68000000u, 0u, 0u), 5u); } // depth
+        case  6u: { out = debug_str(out, pos.xy, origin, vec4u(0x647A6478u, 0u, 0u, 0u), 4u); }          // dzdx
+        case  7u: { out = debug_str(out, pos.xy, origin, vec4u(0x647A6479u, 0u, 0u, 0u), 4u); }          // dzdy
+        case  8u: { out = debug_str(out, pos.xy, origin, vec4u(0x6D617469u, 0x64000000u, 0u, 0u), 5u); } // matid
+        case  9u: { out = debug_str(out, pos.xy, origin, vec4u(0x7072762Eu, 0x72000000u, 0u, 0u), 5u); } // prv.r
+        case 10u: { out = debug_str(out, pos.xy, origin, vec4u(0x7072762Eu, 0x67000000u, 0u, 0u), 5u); } // prv.g
+        case 11u: { out = debug_str(out, pos.xy, origin, vec4u(0x7072762Eu, 0x62000000u, 0u, 0u), 5u); } // prv.b
+        case 12u: { out = debug_str(out, pos.xy, origin, vec4u(0x6D312E72u, 0u, 0u, 0u), 4u); }          // m1.r
+        case 13u: { out = debug_str(out, pos.xy, origin, vec4u(0x6D312E67u, 0u, 0u, 0u), 4u); }          // m1.g
+        case 14u: { out = debug_str(out, pos.xy, origin, vec4u(0x6D312E62u, 0u, 0u, 0u), 4u); }          // m1.b
+        case 15u: { out = debug_str(out, pos.xy, origin, vec4u(0x6D322E72u, 0u, 0u, 0u), 4u); }          // m2.r
+        case 16u: { out = debug_str(out, pos.xy, origin, vec4u(0x6D322E67u, 0u, 0u, 0u), 4u); }          // m2.g
+        case 17u: { out = debug_str(out, pos.xy, origin, vec4u(0x6D322E62u, 0u, 0u, 0u), 4u); }          // m2.b
+        case 18u: { out = debug_str(out, pos.xy, origin, vec4u(0x64696600u, 0u, 0u, 0u), 3u); }          // dif
+        default:  { out = debug_str(out, pos.xy, origin, vec4u(0x74726E73u, 0u, 0u, 0u), 4u); }          // trns
+    }
+    return out;
 }