diff options
Diffstat (limited to 'cnn_v3/training/infer_cnn_v3.py')
| -rw-r--r-- | cnn_v3/training/infer_cnn_v3.py | 219 |
1 files changed, 219 insertions, 0 deletions
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = ["torch", "numpy", "pillow", "opencv-python"]
# ///
"""CNN v3 PyTorch inference — compare with cnn_test (WGSL/GPU output).

Simple mode (single PNG): albedo = photo, geometry channels zeroed.
Full mode (sample dir):   loads all G-buffer files via assemble_features.

Usage:
    python3 infer_cnn_v3.py photo.png out.png --checkpoint checkpoints/ckpt.pth
    python3 infer_cnn_v3.py sample_000/ out.png --checkpoint ckpt.pth
    python3 infer_cnn_v3.py photo.png out.png --checkpoint ckpt.pth --identity-film
    python3 infer_cnn_v3.py photo.png out.png --checkpoint ckpt.pth --cond 0.5 0.0 0.8 0.0 0.0
"""

import argparse
import sys
from pathlib import Path

import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image

# The sibling training modules live next to this script; make them importable
# regardless of the current working directory.
sys.path.insert(0, str(Path(__file__).parent))
from train_cnn_v3 import CNNv3
from cnn_v3_utils import assemble_features, load_rgb, load_rg, load_depth16, load_gray


# ---------------------------------------------------------------------------
# Feature loading
# ---------------------------------------------------------------------------

def load_sample_dir(sample_dir: Path) -> np.ndarray:
    """Load all G-buffer files from a sample directory → (H,W,20) f32.

    Reads albedo.png, normal.png, depth.png, matid.png, shadow.png and
    transp.png from *sample_dir* and hands them to ``assemble_features``.
    """
    return assemble_features(
        load_rgb(sample_dir / 'albedo.png'),
        load_rg(sample_dir / 'normal.png'),
        load_depth16(sample_dir / 'depth.png'),
        load_gray(sample_dir / 'matid.png'),
        load_gray(sample_dir / 'shadow.png'),
        load_gray(sample_dir / 'transp.png'),
    )


def load_simple(image_path: Path) -> np.ndarray:
    """Photo → (H,W,20) f32 with geometry channels zeroed.

    normal=(0.5,0.5) is the oct-encoded "no normal" (decodes to ~(0,0,1)).
    shadow=1.0 (fully lit), transp=0.0 (opaque).
    """
    albedo = load_rgb(image_path)
    h, w = albedo.shape[:2]
    normal = np.full((h, w, 2), 0.5, dtype=np.float32)
    depth = np.zeros((h, w), dtype=np.float32)
    matid = np.zeros((h, w), dtype=np.float32)
    shadow = np.ones((h, w), dtype=np.float32)
    transp = np.zeros((h, w), dtype=np.float32)
    return assemble_features(albedo, normal, depth, matid, shadow, transp)


# ---------------------------------------------------------------------------
# Inference
# ---------------------------------------------------------------------------

def pad_to_multiple(feat: np.ndarray, m: int = 4) -> tuple[np.ndarray, tuple[int, int]]:
    """Pad (H,W,C) so H and W are multiples of m. Returns (padded, (ph, pw)).

    Padding is appended at the bottom/right only (np.pad default: zeros), so
    the original image occupies ``padded[:H, :W]``. When no padding is needed
    the input array itself is returned, not a copy.
    """
    h, w = feat.shape[:2]
    ph = (m - h % m) % m
    pw = (m - w % m) % m
    if ph == 0 and pw == 0:
        return feat, (0, 0)
    return np.pad(feat, ((0, ph), (0, pw), (0, 0))), (ph, pw)


def run_identity_film(model: CNNv3, feat: torch.Tensor) -> torch.Tensor:
    """Forward with identity FiLM (γ=1, β=0). Matches C++ cnn_test default.

    Calls the model's conv layers directly (enc0 → enc1 → bottleneck → dec1
    → dec0) with ReLU after each, 2× average pooling on the way down and 2×
    nearest-neighbour upsampling plus skip concatenation on the way up. No
    FiLM modulation is applied anywhere. The result goes through a sigmoid.
    """
    skip0 = F.relu(model.enc0(feat))

    x = F.avg_pool2d(skip0, 2)
    skip1 = F.relu(model.enc1(x))

    x = F.relu(model.bottleneck(F.avg_pool2d(skip1, 2)))

    x = F.relu(model.dec1(
        torch.cat([F.interpolate(x, scale_factor=2, mode='nearest'), skip1], dim=1)
    ))

    x = F.relu(model.dec0(
        torch.cat([F.interpolate(x, scale_factor=2, mode='nearest'), skip0], dim=1)
    ))

    return torch.sigmoid(x)


# ---------------------------------------------------------------------------
# Output helpers
# ---------------------------------------------------------------------------

def _quantize_u8(values: np.ndarray) -> np.ndarray:
    """Clamp floats to [0,1] and round to the nearest uint8 in 0..255."""
    return (np.clip(values, 0.0, 1.0) * 255.0 + 0.5).astype(np.uint8)


def save_png(path: Path, out: np.ndarray) -> None:
    """Save (H,W,4) f32 [0,1] RGBA as PNG. Out-of-range values are clamped."""
    Image.fromarray(_quantize_u8(out), 'RGBA').save(path)


def print_debug_hex(out: np.ndarray, n: int = 8) -> None:
    """Print first n pixels as hex RGBA + float values.

    The hex bytes use the same quantization as ``save_png`` so they match
    what is written to disk; the float columns show the raw, unclamped values.
    """
    head = out.reshape(-1, 4)[:max(n, 0)]
    # .tolist() yields plain Python ints, which format cleanly with :02X.
    quantized = _quantize_u8(head).tolist()
    for i, ((ri, gi, bi, ai), (r, g, b, a)) in enumerate(zip(quantized, head)):
        print(f' [{i}] 0x{ri:02X}{gi:02X}{bi:02X}{ai:02X}'
              f' ({r:.4f} {g:.4f} {b:.4f} {a:.4f})')


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def _find_latest_checkpoint(directory: Path) -> Path | None:
    """Return the checkpoint_epoch_<N>.pth in *directory* with the highest N.

    Files whose suffix is not a decimal number (e.g. checkpoint_epoch_best.pth)
    are skipped instead of aborting the search. Returns None if nothing matches.
    """
    numbered = []
    for candidate in directory.glob('checkpoint_epoch_*.pth'):
        epoch = candidate.stem.split('_')[-1]
        if epoch.isdecimal():
            numbered.append((int(epoch), candidate))
    if not numbered:
        return None
    return sorted(numbered, key=lambda item: item[0])[-1][1]


def main() -> None:
    """CLI entry point: load features, run the CNN, save the RGBA result."""
    p = argparse.ArgumentParser(description='CNN v3 PyTorch inference')
    p.add_argument('input', help='Input PNG or sample directory')
    p.add_argument('output', help='Output PNG')
    p.add_argument('--checkpoint', '-c', metavar='CKPT',
                   help='Path to .pth checkpoint (auto-finds latest if omitted)')
    p.add_argument('--enc-channels', default='4,8',
                   help='Encoder channels (default: 4,8 — must match checkpoint)')
    p.add_argument('--cond', nargs=5, type=float, metavar='F', default=[0.0]*5,
                   help='FiLM conditioning: 5 floats '
                        '(beat_phase beat_norm audio style0 style1)')
    p.add_argument('--identity-film', action='store_true',
                   help='Bypass FiLM MLP, use γ=1 β=0 (matches C++ cnn_test default)')
    p.add_argument('--blend', type=float, default=1.0,
                   help='Blend with input albedo: 0=input 1=CNN (default 1.0)')
    p.add_argument('--debug-hex', action='store_true',
                   help='Print first 8 output pixels as hex')
    args = p.parse_args()

    # --- Feature loading ---
    inp = Path(args.input)
    if inp.is_dir():
        print(f'Mode: full ({inp})')
        feat = load_sample_dir(inp)
        albedo_path = inp / 'albedo.png'
    else:
        print(f'Mode: simple ({inp})')
        feat = load_simple(inp)
        albedo_path = inp
    orig_h, orig_w = feat.shape[:2]

    # Two 2× pooling stages in the network → spatial dims must divide by 4.
    feat_padded, (ph, pw) = pad_to_multiple(feat, 4)
    H, W = feat_padded.shape[:2]
    if ph or pw:
        print(f'Padded {orig_w}×{orig_h} → {W}×{H}')
    else:
        print(f'Resolution: {W}×{H}')

    # --- Load checkpoint ---
    if args.checkpoint:
        ckpt_path = Path(args.checkpoint)
    else:
        ckpt_path = _find_latest_checkpoint(Path('checkpoints'))
        if ckpt_path is None:
            print('Error: no checkpoint found; use --checkpoint', file=sys.stderr)
            sys.exit(1)
    print(f'Checkpoint: {ckpt_path}')

    # SECURITY NOTE: weights_only=False unpickles arbitrary Python objects.
    # Only load checkpoints from a trusted source.
    ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=False)
    cfg = ckpt.get('config', {})
    enc_channels = cfg.get('enc_channels', [int(c) for c in args.enc_channels.split(',')])
    film_cond_dim = cfg.get('film_cond_dim', 5)
    print(f'Architecture: enc={enc_channels} film_cond_dim={film_cond_dim}')

    # NOTE(review): presumably the model's FiLM MLP expects film_cond_dim
    # inputs — confirm against CNNv3. Warn early rather than fail obscurely.
    if not args.identity_film and film_cond_dim != len(args.cond):
        print(f'Warning: checkpoint film_cond_dim={film_cond_dim} but --cond '
              f'supplies {len(args.cond)} values', file=sys.stderr)

    model = CNNv3(enc_channels=enc_channels, film_cond_dim=film_cond_dim)
    model.load_state_dict(ckpt['model_state_dict'])
    model.eval()

    # --- Inference ---
    feat_t = torch.from_numpy(feat_padded).permute(2, 0, 1).unsqueeze(0)  # (1,20,H,W)
    cond_t = torch.tensor([args.cond], dtype=torch.float32)               # (1,5)

    with torch.no_grad():
        if args.identity_film:
            print('FiLM: identity (γ=1, β=0)')
            out_t = run_identity_film(model, feat_t)
        else:
            print(f'FiLM cond: {args.cond}')
            out_t = model(feat_t, cond_t)

    # (1,4,H,W) → crop padding → (orig_h, orig_w, 4)
    out = out_t[0].permute(1, 2, 0).numpy()[:orig_h, :orig_w, :]

    # Optional blend with albedo (decoded only when actually needed)
    if args.blend < 1.0:
        ab = load_rgb(albedo_path)[:orig_h, :orig_w]
        ones = np.ones((orig_h, orig_w, 1), dtype=np.float32)
        src_rgba = np.concatenate([ab, ones], axis=-1)
        out = src_rgba * (1.0 - args.blend) + out * args.blend

    # --- Save ---
    out_path = Path(args.output)
    save_png(out_path, out)
    print(f'Saved: {out_path}')

    if args.debug_hex:
        print('First 8 output pixels (RGBA):')
        print_debug_hex(out)


if __name__ == '__main__':
    main()
