summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--  src/gpu/sequence.h                   |   3
-rw-r--r--  src/tests/gpu/test_cnn_v3_parity.cc  | 370
2 files changed, 373 insertions, 0 deletions
diff --git a/src/gpu/sequence.h b/src/gpu/sequence.h
index 4592082..04482fb 100644
--- a/src/gpu/sequence.h
+++ b/src/gpu/sequence.h
@@ -71,6 +71,9 @@ class NodeRegistry {
void set_external_view(const std::string& name, WGPUTextureView view);
+ int default_width() const { return default_width_; }
+ int default_height() const { return default_height_; }
+
private:
WGPUDevice device_;
int default_width_;
diff --git a/src/tests/gpu/test_cnn_v3_parity.cc b/src/tests/gpu/test_cnn_v3_parity.cc
new file mode 100644
index 0000000..608decb
--- /dev/null
+++ b/src/tests/gpu/test_cnn_v3_parity.cc
@@ -0,0 +1,370 @@
+// CNN v3 parity test: validates WGSL shaders against Python reference.
+// Two checks:
+// 1. Zero-weight test (deterministic): output must be sigmoid(0) = 0.5
+// 2. Random-weight test: output must match Python-generated test vectors
+// (within 1/255 per pixel)
+
+#include "../common/webgpu_test_fixture.h"
+#include "cnn_v3/src/cnn_v3_effect.h"
+#include "gpu/sequence.h"
+#include "../../cnn_v3/test_vectors.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <vector>
+
+// ---------------------------------------------------------------------------
+// fp16 decode (matches GPU read)
+// ---------------------------------------------------------------------------
+
// Decodes an IEEE-754 binary16 bit pattern to float (matches GPU read).
// Handles signed zero, subnormals, normals, and Inf/NaN.
static float fp16_bits_to_f32(uint16_t h) {
  const uint32_t sign = (h & 0x8000u) << 16;
  const uint32_t exp = (h & 0x7C00u) >> 10;
  const uint32_t mant = (h & 0x03FFu);
  if (exp == 0) {
    // Zero and subnormals: value = mant * 2^-24 (no implicit leading bit).
    // Previously subnormals (mant != 0) fell through to the normal-number
    // path, which re-biased the exponent as if a leading 1 were present
    // and produced values roughly 512x too large.
    float r = std::ldexp((float)mant, -24);
    return sign ? -r : r; // preserves -0.0f when mant == 0
  }
  if (exp == 31) {
    // Inf (mant == 0) or NaN (mant != 0): widen payload into f32 slot.
    uint32_t b = sign | 0x7F800000u | (mant << 13);
    float r; __builtin_memcpy(&r, &b, 4); return r;
  }
  // Normal number: re-bias exponent from 15 to 127 (+112), widen mantissa.
  uint32_t b = sign | ((exp + 112) << 23) | (mant << 13);
  float r; __builtin_memcpy(&r, &b, 4); return r;
}
+
+// ---------------------------------------------------------------------------
+// Raw RGBA16Float readback → flat array of f32 (one per channel per pixel)
+// ---------------------------------------------------------------------------
+
// Completion flag + result for the async buffer-map callback.
// `status` is only read after the callback has set `done`.
struct MapState { bool done = false; WGPUMapAsyncStatus status; };
+
+static std::vector<float> readback_rgba16float(WGPUDevice device,
+ WGPUQueue queue,
+ WGPUTexture tex,
+ int W, int H) {
+ const uint32_t bytes_per_px = 8; // 4 × f16
+ const uint32_t unaligned_bpr = (uint32_t)(W * bytes_per_px);
+ const uint32_t aligned_bpr = ((unaligned_bpr + 255u) / 256u) * 256u;
+ const size_t buf_size = aligned_bpr * (size_t)H;
+
+ WGPUBufferDescriptor bd = {};
+ bd.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead;
+ bd.size = buf_size;
+ WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &bd);
+
+ WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(device, nullptr);
+ WGPUTexelCopyTextureInfo src = {};
+ src.texture = tex;
+ WGPUTexelCopyBufferInfo dst = {};
+ dst.buffer = staging;
+ dst.layout.bytesPerRow = aligned_bpr;
+ dst.layout.rowsPerImage = (uint32_t)H;
+ WGPUExtent3D extent = { (uint32_t)W, (uint32_t)H, 1 };
+ wgpuCommandEncoderCopyTextureToBuffer(enc, &src, &dst, &extent);
+ WGPUCommandBuffer cmds = wgpuCommandEncoderFinish(enc, nullptr);
+ wgpuQueueSubmit(queue, 1, &cmds);
+ wgpuCommandBufferRelease(cmds);
+ wgpuCommandEncoderRelease(enc);
+ wgpuDevicePoll(device, true, nullptr);
+
+ MapState ms = {};
+ WGPUBufferMapCallbackInfo mi = {};
+ mi.mode = WGPUCallbackMode_AllowProcessEvents;
+ mi.callback = [](WGPUMapAsyncStatus s, WGPUStringView, void* u, void*) {
+ auto* st = (MapState*)u;
+ st->status = s; st->done = true;
+ };
+ mi.userdata1 = &ms;
+ wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buf_size, mi);
+ for (int i = 0; i < 100 && !ms.done; ++i)
+ wgpuDevicePoll(device, true, nullptr);
+
+ std::vector<float> result(W * H * 4, 0.0f);
+ if (ms.done && ms.status == WGPUMapAsyncStatus_Success) {
+ const uint8_t* mapped = (const uint8_t*)wgpuBufferGetConstMappedRange(
+ staging, 0, buf_size);
+ if (mapped) {
+ for (int y = 0; y < H; ++y) {
+ const uint16_t* row =
+ (const uint16_t*)(mapped + (size_t)y * aligned_bpr);
+ for (int x = 0; x < W; ++x) {
+ for (int c = 0; c < 4; ++c) {
+ result[(y * W + x) * 4 + c] =
+ fp16_bits_to_f32(row[x * 4 + c]);
+ }
+ }
+ }
+ }
+ }
+ wgpuBufferUnmap(staging);
+ wgpuBufferRelease(staging);
+ return result;
+}
+
+// ---------------------------------------------------------------------------
+// Helper: create rgba32uint texture with TextureBinding | CopyDst
+// ---------------------------------------------------------------------------
+
+static WGPUTexture make_feat_tex(WGPUDevice dev, int W, int H) {
+ WGPUTextureDescriptor d = {};
+ d.format = WGPUTextureFormat_RGBA32Uint;
+ d.usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst;
+ d.dimension = WGPUTextureDimension_2D;
+ d.size = { (uint32_t)W, (uint32_t)H, 1 };
+ d.mipLevelCount = 1;
+ d.sampleCount = 1;
+ return wgpuDeviceCreateTexture(dev, &d);
+}
+
+static WGPUTexture make_output_tex(WGPUDevice dev, int W, int H) {
+ WGPUTextureDescriptor d = {};
+ d.format = WGPUTextureFormat_RGBA16Float;
+ d.usage = WGPUTextureUsage_StorageBinding | WGPUTextureUsage_CopySrc;
+ d.dimension = WGPUTextureDimension_2D;
+ d.size = { (uint32_t)W, (uint32_t)H, 1 };
+ d.mipLevelCount = 1;
+ d.sampleCount = 1;
+ return wgpuDeviceCreateTexture(dev, &d);
+}
+
+static WGPUTextureView make_view(WGPUTexture tex, WGPUTextureFormat fmt) {
+ WGPUTextureViewDescriptor d = {};
+ d.format = fmt;
+ d.dimension = WGPUTextureViewDimension_2D;
+ d.mipLevelCount = 1;
+ d.arrayLayerCount = 1;
+ return wgpuTextureCreateView(tex, &d);
+}
+
+// ---------------------------------------------------------------------------
+// Run one CNN v3 forward pass and return output pixels
+// ---------------------------------------------------------------------------
+
// Runs one CNN v3 forward pass on the GPU and returns the final
// rgba16float output decoded to W*H*4 floats (RGBA interleaved).
//
// feat0_u32 / feat1_u32: packed rgba32uint input planes, W*H*4 u32 each.
// weights_u32 / weights_bytes: packed f16 weight blob; pass nullptr to
//   keep the effect's default weights (the zero-weight test relies on this).
// enc0_out / dec1_out: optional readback of intermediate layers; dec1 is
//   read at half resolution (W/2 x H/2).
static std::vector<float> run_cnn_v3(WebGPUTestFixture& fixture,
                                     int W, int H,
                                     const uint32_t* feat0_u32, // W*H*4
                                     const uint32_t* feat1_u32, // W*H*4
                                     const uint32_t* weights_u32, // (TOTAL_F16+1)/2
                                     uint32_t weights_bytes,
                                     std::vector<float>* enc0_out = nullptr,
                                     std::vector<float>* dec1_out = nullptr) {
  GpuContext ctx = fixture.ctx();

  // Create input textures manually (with CopyDst for upload)
  WGPUTexture feat0_tex = make_feat_tex(ctx.device, W, H);
  WGPUTexture feat1_tex = make_feat_tex(ctx.device, W, H);
  WGPUTexture out_tex = make_output_tex(ctx.device, W, H);

  WGPUTextureView feat0_view =
      make_view(feat0_tex, WGPUTextureFormat_RGBA32Uint);
  WGPUTextureView feat1_view =
      make_view(feat1_tex, WGPUTextureFormat_RGBA32Uint);
  WGPUTextureView out_view =
      make_view(out_tex, WGPUTextureFormat_RGBA16Float);

  // Upload feat texture data (4 u32 = 16 bytes per pixel)
  auto upload_tex = [&](WGPUTexture tex, const uint32_t* data) {
    WGPUTexelCopyTextureInfo dst_tex = {};
    dst_tex.texture = tex;
    WGPUTexelCopyBufferLayout layout = {};
    layout.bytesPerRow = (uint32_t)(W * 16); // 4 u32 per pixel
    layout.rowsPerImage = (uint32_t)H;
    WGPUExtent3D ext = { (uint32_t)W, (uint32_t)H, 1 };
    wgpuQueueWriteTexture(ctx.queue, &dst_tex, data,
                          (size_t)(W * H * 16), &layout, &ext);
  };
  upload_tex(feat0_tex, feat0_u32);
  upload_tex(feat1_tex, feat1_u32);

  // Wire into NodeRegistry via external views; registered before
  // declare_nodes so the effect uses our textures.
  NodeRegistry registry(ctx.device, W, H);
  registry.set_external_view("feat0", feat0_view);
  registry.set_external_view("feat1", feat1_view);
  registry.set_external_view("cnn3_out", out_view);

  CNNv3Effect effect(ctx, {"feat0", "feat1"}, {"cnn3_out"}, 0.0f, 1000.0f);
  effect.declare_nodes(registry);

  if (weights_u32) {
    effect.upload_weights(ctx.queue, weights_u32, weights_bytes);
  }

  // Run 5 compute passes
  WGPUCommandEncoder enc =
      wgpuDeviceCreateCommandEncoder(ctx.device, nullptr);
  UniformsSequenceParams params = {};
  params.resolution = { (float)W, (float)H };
  params.aspect_ratio = 1.0f;
  effect.render(enc, params, registry);

  WGPUCommandBuffer cmds = wgpuCommandEncoderFinish(enc, nullptr);
  wgpuQueueSubmit(ctx.queue, 1, &cmds);
  wgpuCommandBufferRelease(cmds);
  wgpuCommandEncoderRelease(enc);
  wgpuDevicePoll(ctx.device, true, nullptr);

  // Read back output
  auto pixels = readback_rgba16float(ctx.device, ctx.queue, out_tex, W, H);

  // Optional: read back intermediate layers. NOTE(review): these textures
  // are not released here — presumably owned by the registry; confirm.
  if (enc0_out) {
    WGPUTexture enc0_tex = registry.get_texture("cnn3_out_enc0");
    *enc0_out = readback_rgba16float(ctx.device, ctx.queue, enc0_tex, W, H);
  }
  if (dec1_out) {
    WGPUTexture dec1_tex = registry.get_texture("cnn3_out_dec1");
    // dec1 is rgba16float written at half-res (W/2, H/2) — read only valid region
    *dec1_out = readback_rgba16float(ctx.device, ctx.queue, dec1_tex, W / 2, H / 2);
  }

  // Cleanup
  wgpuTextureViewRelease(feat0_view);
  wgpuTextureViewRelease(feat1_view);
  wgpuTextureViewRelease(out_view);
  wgpuTextureRelease(feat0_tex);
  wgpuTextureRelease(feat1_tex);
  wgpuTextureRelease(out_tex);

  return pixels;
}
+
+extern void InitShaderComposer();
+
+// ---------------------------------------------------------------------------
+// Test 1: zero weights → sigmoid(ReLU(0)) = 0.5 for all pixels/channels
+// ---------------------------------------------------------------------------
+
+static int test_zero_weights() {
+ fprintf(stdout, " [cnn_v3_parity] test_zero_weights...\n");
+
+ WebGPUTestFixture fixture;
+ if (!fixture.init()) {
+ fprintf(stdout, " ⚠ WebGPU unavailable — skip\n");
+ return 1;
+ }
+ InitShaderComposer();
+
+ const int W = 8, H = 8;
+ std::vector<uint32_t> feat0(W * H * 4, 0u);
+ std::vector<uint32_t> feat1(W * H * 4, 0u);
+
+ auto pixels = run_cnn_v3(fixture, W, H,
+ feat0.data(), feat1.data(),
+ nullptr, 0); // null = zero weights (default)
+
+ // Expected: sigmoid(0) = 0.5 exactly
+ const float expected = 0.5f;
+ const float tol = 1.0f / 255.0f;
+ float max_err = 0.0f;
+ for (float v : pixels)
+ max_err = fmaxf(max_err, fabsf(v - expected));
+
+ if (max_err > tol) {
+ fprintf(stderr, " ✗ zero_weights: max_err=%.5f > %.5f\n", max_err, tol);
+ return 0;
+ }
+ fprintf(stdout, " ✓ zero_weights: max_err=%.2e OK\n", max_err);
+ return 1;
+}
+
+// ---------------------------------------------------------------------------
+// Test 2: random weights — compare to Python reference test vectors
+// ---------------------------------------------------------------------------
+
+static int test_random_weights() {
+ fprintf(stdout, " [cnn_v3_parity] test_random_weights (seed=42)...\n");
+
+ WebGPUTestFixture fixture;
+ if (!fixture.init()) {
+ fprintf(stdout, " ⚠ WebGPU unavailable — skip\n");
+ return 1;
+ }
+ InitShaderComposer();
+
+ const int W = kCnnV3TestW, H = kCnnV3TestH;
+ const uint32_t weights_bytes =
+ (uint32_t)sizeof(kCnnV3TestWeightsU32);
+
+ std::vector<float> enc0_pixels, dec1_pixels;
+ auto pixels = run_cnn_v3(fixture, W, H,
+ kCnnV3TestFeat0U32, kCnnV3TestFeat1U32,
+ kCnnV3TestWeightsU32, weights_bytes,
+ &enc0_pixels, &dec1_pixels);
+
+ // Check enc0 layer first
+ const float tol = 1.0f / 255.0f;
+ float enc0_max_err = 0.0f;
+ int enc0_worst = -1;
+ for (int i = 0; i < W * H * 4; ++i) {
+ float ref = fp16_bits_to_f32(kCnnV3ExpectedEnc0U16[i]);
+ float err = fabsf(enc0_pixels[i] - ref);
+ if (err > enc0_max_err) { enc0_max_err = err; enc0_worst = i; }
+ }
+ if (enc0_max_err > tol) {
+ int px = enc0_worst / 4, ch = enc0_worst % 4;
+ fprintf(stderr, " ✗ enc0 mismatch: max_err=%.5f > %.5f at px=%d ch=%d"
+ " gpu=%.5f ref=%.5f\n",
+ enc0_max_err, tol, px, ch,
+ enc0_pixels[enc0_worst],
+ fp16_bits_to_f32(kCnnV3ExpectedEnc0U16[enc0_worst]));
+ } else {
+ fprintf(stdout, " ✓ enc0: max_err=%.2e OK\n", enc0_max_err);
+ }
+
+ // Check dec1 layer (half-res: W/2 x H/2 x 4)
+ float dec1_max_err = 0.0f;
+ int dec1_worst = -1;
+ int dec1_n = (W / 2) * (H / 2) * 4;
+ for (int i = 0; i < dec1_n; ++i) {
+ float ref = fp16_bits_to_f32(kCnnV3ExpectedDec1U16[i]);
+ float err = fabsf(dec1_pixels[i] - ref);
+ if (err > dec1_max_err) { dec1_max_err = err; dec1_worst = i; }
+ }
+ if (dec1_max_err > tol) {
+ int px = dec1_worst / 4, ch = dec1_worst % 4;
+ fprintf(stderr, " ✗ dec1 mismatch: max_err=%.5f > %.5f at px=%d ch=%d"
+ " gpu=%.5f ref=%.5f\n",
+ dec1_max_err, tol, px, ch,
+ dec1_pixels[dec1_worst],
+ fp16_bits_to_f32(kCnnV3ExpectedDec1U16[dec1_worst]));
+ } else {
+ fprintf(stdout, " ✓ dec1: max_err=%.2e OK\n", dec1_max_err);
+ }
+
+ // Compare final output with Python reference (1/255 tolerance)
+ float max_err = 0.0f;
+ int worst = -1;
+ int n = W * H * 4;
+ for (int i = 0; i < n; ++i) {
+ float ref = fp16_bits_to_f32(kCnnV3ExpectedOutputU16[i]);
+ float err = fabsf(pixels[i] - ref);
+ if (err > max_err) { max_err = err; worst = i; }
+ }
+
+ if (max_err > tol) {
+ int px = worst / 4, ch = worst % 4;
+ fprintf(stderr, " ✗ random_weights: max_err=%.5f > %.5f at px=%d ch=%d"
+ " gpu=%.5f ref=%.5f\n",
+ max_err, tol, px, ch,
+ pixels[worst],
+ fp16_bits_to_f32(kCnnV3ExpectedOutputU16[worst]));
+ return 0;
+ }
+ fprintf(stdout, " ✓ random_weights: max_err=%.2e OK\n", max_err);
+ return 1;
+}
+
+// ---------------------------------------------------------------------------
+// Main
+// ---------------------------------------------------------------------------
+
+int main() {
+ int pass = 0, total = 0;
+
+ ++total; pass += test_zero_weights();
+ ++total; pass += test_random_weights();
+
+ fprintf(stdout, "\nCNN v3 parity: %d/%d passed\n", pass, total);
+ return (pass == total) ? 0 : 1;
+}