summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--PROJECT_CONTEXT.md6
-rw-r--r--TODO.md3
-rw-r--r--cmake/DemoTests.cmake18
-rw-r--r--cnn_v3/src/cnn_v3_effect.cc20
-rw-r--r--cnn_v3/src/cnn_v3_effect.h4
-rw-r--r--cnn_v3/test_vectors.h293
-rw-r--r--cnn_v3/training/gen_test_vectors.py451
-rw-r--r--doc/COMPLETED.md2
-rw-r--r--src/gpu/sequence.h3
-rw-r--r--src/tests/gpu/test_cnn_v3_parity.cc370
10 files changed, 1160 insertions, 10 deletions
diff --git a/PROJECT_CONTEXT.md b/PROJECT_CONTEXT.md
index 4767185..6219275 100644
--- a/PROJECT_CONTEXT.md
+++ b/PROJECT_CONTEXT.md
@@ -36,17 +36,17 @@
- **Audio:** Sample-accurate sync. Zero heap allocations per frame. Variable tempo. OLA-IDCT synthesis (v2 .spec): Hann analysis window, rectangular synthesis, 50% overlap, click-free. V1 (raw DCT-512) preserved for generated notes. .spec files regenerated as v2.
- **Shaders:** Parameterized effects (UniformHelper, .seq syntax). Beat-synchronized animation support (`beat_time`, `beat_phase`). Modular WGSL composition with ShaderComposer. 27 shared common shaders (math, render, compute). Reusable snippets: `render/scratch_lines`, `render/ntsc_common` (NTSC signal processing, RGB and YIQ input variants via `sample_ntsc_signal` hook), `math/color` (YIQ/NTSC), `math/color_c64` (C64 palette, Bayer dither, border animation).
- **3D:** Hybrid SDF/rasterization with BVH. Binary scene loader. Blender pipeline.
-- **Effects:** CNN post-processing: CNNEffect (v1) and CNNv2Effect operational. CNN v2: sigmoid activation, storage buffer weights (~3.2 KB), 7D static features, dynamic layers. Training stable, convergence validated. **CNN v3 Phase 4 complete:** `CNNv3Effect` C++ class (5 compute passes, FiLM uniform upload, identity γ/β defaults). `set_film_params()` modulates all layers via beat/audio. WGSL params struct alignment fix (vec3u align=16 → 64/96-byte C++ mirrors). Registered in CMake, shaders.h/cc, demo_effects.h, tests. See `cnn_v3/docs/HOWTO.md`.
+- **Effects:** CNN post-processing: CNNEffect (v1) and CNNv2Effect operational. CNN v2: sigmoid activation, storage buffer weights (~3.2 KB), 7D static features, dynamic layers. Training stable, convergence validated. **CNN v3 Phases 1–5 complete:** `CNNv3Effect` C++ class (5 compute passes, FiLM uniform upload, identity γ/β defaults). Parity validated: enc0 max_err=1.95e-3, dec1 max_err=1.95e-3, final max_err=4.88e-4 (all ≤1/255). Key fix: intermediate nodes declared at fractional resolutions (W/2, W/4) via `NodeRegistry::default_width()/default_height()`. See `cnn_v3/docs/HOWTO.md`.
- **Tools:** CNN test tool operational. Texture readback utility functional. Timeline editor (web-based, beat-aligned, audio playback).
- **Build:** Asset dependency tracking. Size measurement. Hot-reload (debug-only). WSL (Windows 10) supported: native Linux build and cross-compile to `.exe` via `mingw-w64`.
- **Sequence:** DAG-based effect routing with explicit node system. Python compiler with topological sort and ping-pong optimization. 12 effects operational (Passthrough, Placeholder, GaussianBlur, Heptagon, Particles, RotatingCube, Hybrid3D, Flash, PeakMeter, Scene1, Scene2, Scratch). Effect times are absolute (seq_compiler adds sequence start offset). See `doc/SEQUENCE.md`.
-- **Testing:** **35/35 passing**.
+- **Testing:** **36/36 passing**.
---
## Next Up
-**Active:** CNN v3 Phase 5 (parity validation), Spectral Brush Editor
+**Active:** CNN v3 training (`train_cnn_v3.py`), Spectral Brush Editor
**Ongoing:** Test infrastructure maintenance (35/35 passing)
**Future:** Size optimization (64k target), 3D enhancements
diff --git a/TODO.md b/TODO.md
index 11c629e..559f8b3 100644
--- a/TODO.md
+++ b/TODO.md
@@ -76,7 +76,8 @@ PyTorch / HTML WebGPU / C++ WebGPU.
- Params alignment fix: WGSL `vec3u` align=16 → C++ structs 64/96 bytes
- Weight offsets as explicit formulas (e.g. `20*4*9+4`)
- FiLM γ/β: identity defaults; real values require trained MLP (see below)
-5. Parity validation (test vectors, ≤1/255 per pixel)
+5. ✅ Parity validation: test vectors + `test_cnn_v3_parity.cc`. max_err=4.88e-4 (≤1/255).
+ - Key fix: intermediate nodes at fractional resolutions (W/2, W/4) via `NodeRegistry::default_width()/default_height()`
**FiLM MLP training** (blocks meaningful Phase 4 output):
- Needs `cnn_v3/training/train_cnn_v3.py` — not yet written
diff --git a/cmake/DemoTests.cmake b/cmake/DemoTests.cmake
index 0b7fbb7..69b9195 100644
--- a/cmake/DemoTests.cmake
+++ b/cmake/DemoTests.cmake
@@ -222,6 +222,24 @@ if(NOT DEMO_STRIP_ALL AND DEMO_WORKSPACE STREQUAL "main")
STB_IMAGE_WRITE_IMPLEMENTATION)
endif()
+# CNN v3 parity test (zero-weight + random-weight vs Python reference)
+if(NOT DEMO_STRIP_ALL AND DEMO_WORKSPACE STREQUAL "main")
+ add_demo_test(test_cnn_v3_parity CnnV3ParityTest gpu
+ src/tests/gpu/test_cnn_v3_parity.cc
+ src/tests/common/webgpu_test_fixture.cc
+ ${PLATFORM_SOURCES}
+ ${GEN_DEMO_CC})
+
+ target_include_directories(test_cnn_v3_parity PRIVATE
+ ${CMAKE_CURRENT_SOURCE_DIR}
+ ${CMAKE_CURRENT_SOURCE_DIR}/cnn_v3/src)
+
+ target_link_libraries(test_cnn_v3_parity PRIVATE
+ gpu util procedural ${DEMO_LIBS})
+
+ demo_add_asset_deps(test_cnn_v3_parity shaders)
+endif()
+
# GPU Composite Texture Test (Phase 4)
add_demo_test(test_gpu_composite GpuCompositeTest gpu
src/tests/gpu/test_gpu_composite.cc
diff --git a/cnn_v3/src/cnn_v3_effect.cc b/cnn_v3/src/cnn_v3_effect.cc
index d13799c..92178f7 100644
--- a/cnn_v3/src/cnn_v3_effect.cc
+++ b/cnn_v3/src/cnn_v3_effect.cc
@@ -187,14 +187,17 @@ CNNv3Effect::CNNv3Effect(const GpuContext& ctx,
// ---------------------------------------------------------------------------
void CNNv3Effect::declare_nodes(NodeRegistry& registry) {
+ const int W = registry.default_width();
+ const int H = registry.default_height();
+
// enc0_tex: rgba16float full-res
- registry.declare_node(node_enc0_, NodeType::GBUF_ALBEDO, -1, -1);
- // enc1_tex: rgba32uint half-res
- registry.declare_node(node_enc1_, NodeType::GBUF_RGBA32UINT, -1, -1);
- // bottleneck_tex: rgba32uint quarter-res — declare at 1/4 resolution
- registry.declare_node(node_bottleneck_, NodeType::GBUF_RGBA32UINT, -1, -1);
+ registry.declare_node(node_enc0_, NodeType::GBUF_ALBEDO, W, H);
+ // enc1_tex: rgba32uint half-res — shaders use textureDimensions() for bounds
+ registry.declare_node(node_enc1_, NodeType::GBUF_RGBA32UINT, W / 2, H / 2);
+ // bottleneck_tex: rgba32uint quarter-res
+ registry.declare_node(node_bottleneck_, NodeType::GBUF_RGBA32UINT, W / 4, H / 4);
// dec1_tex: rgba16float half-res
- registry.declare_node(node_dec1_, NodeType::GBUF_ALBEDO, -1, -1);
+ registry.declare_node(node_dec1_, NodeType::GBUF_ALBEDO, W / 2, H / 2);
// output_tex: rgba16float full-res (the declared output_nodes_[0])
}
@@ -202,6 +205,11 @@ void CNNv3Effect::declare_nodes(NodeRegistry& registry) {
// set_film_params — simple linear mapping, no MLP yet
// ---------------------------------------------------------------------------
+void CNNv3Effect::upload_weights(WGPUQueue queue, const void* data,
+ uint32_t size_bytes) {
+ wgpuQueueWriteBuffer(queue, weights_buf_.buffer, 0, data, size_bytes);
+}
+
void CNNv3Effect::set_film_params(const CNNv3FiLMParams& fp) {
// Identity + audio/beat modulation.
// Replace with FiLM MLP output once training is done.
diff --git a/cnn_v3/src/cnn_v3_effect.h b/cnn_v3/src/cnn_v3_effect.h
index c358990..36e2797 100644
--- a/cnn_v3/src/cnn_v3_effect.h
+++ b/cnn_v3/src/cnn_v3_effect.h
@@ -89,6 +89,10 @@ class CNNv3Effect : public Effect {
// Update FiLM conditioning; call before render() each frame.
void set_film_params(const CNNv3FiLMParams& fp);
+ // Upload packed-f16 weights (kWeightsBufBytes bytes; each u32 holds two f16).
+ // Used for testing and inference from trained .bin files.
+ void upload_weights(WGPUQueue queue, const void* data, uint32_t size_bytes);
+
private:
// Intermediate node names (prefixed from output[0])
std::string node_enc0_;
diff --git a/cnn_v3/test_vectors.h b/cnn_v3/test_vectors.h
new file mode 100644
index 0000000..6d1abc5
--- /dev/null
+++ b/cnn_v3/test_vectors.h
@@ -0,0 +1,293 @@
+// Auto-generated by cnn_v3/training/gen_test_vectors.py
+// Seed=42 W=8 H=8
+// DO NOT EDIT — regenerate with gen_test_vectors.py --header
+#pragma once
+#include <cstdint>
+
+static const int kCnnV3TestW = 8;
+static const int kCnnV3TestH = 8;
+
+// 256 u32 values
+static const uint32_t kCnnV3TestFeat0U32[256] = {
+ 0x2ccd39ebu, 0x3acb39d7u, 0x3814378fu, 0x3bc134ffu, 0x35e739ddu, 0x33073198u, 0x3b8a376cu, 0x32e339e0u,
+ 0x360d3ae0u, 0x33bc3ad0u, 0x3b6a38f3u, 0x398b3420u, 0x30d23be6u, 0x39652da8u, 0x2c8f3570u, 0x3a08379bu,
+ 0x355c3490u, 0x38293ac4u, 0x37243abeu, 0x39353ba0u, 0x3b152f6bu, 0x308837d4u, 0x398030e3u, 0x34962a10u,
+ 0x370f3079u, 0x382d36a1u, 0x281a3479u, 0x35fb38eau, 0x2ef43936u, 0x33f2230eu, 0x364e374eu, 0x360c3a7bu,
+ 0x38c0383eu, 0x381f2597u, 0x36be3584u, 0x3a432e6bu, 0x25b33b8au, 0x3a1c38d0u, 0x3a4d348fu, 0x3b6f390fu,
+ 0x296c3bd9u, 0x3860371eu, 0x356130b2u, 0x24283be7u, 0x3abe373du, 0x37ad352fu, 0x37993bd3u, 0x2a9f3031u,
+ 0x34413b90u, 0x2dce3808u, 0x3b7136c7u, 0x3bc53805u, 0x38093424u, 0x372c3ae0u, 0x3ad83479u, 0x383f363du,
+ 0x31f83bd0u, 0x27f434d3u, 0x32683645u, 0x31cd3971u, 0x34373966u, 0x359535afu, 0x377739bcu, 0x3ad235c8u,
+ 0x32d83893u, 0x357b3b33u, 0x37ea28fdu, 0x33a22fefu, 0x302f39fau, 0x3b7f3a75u, 0x39af38dau, 0x3bf139b5u,
+ 0x31363577u, 0x38443827u, 0x38e831b1u, 0x3b6c233bu, 0x2910343cu, 0x33b02eeeu, 0x28333462u, 0x322d3478u,
+ 0x362a360fu, 0x353f356du, 0x26742dbeu, 0x3a0e3278u, 0x3b6e3bedu, 0x38413809u, 0x3a313509u, 0x3ac13a1eu,
+ 0x36f33b2au, 0x3a743a23u, 0x3b6f34efu, 0x3bf42e0au, 0x2df83a29u, 0x28603940u, 0x3a653a29u, 0x3adb38d2u,
+ 0x346a2e44u, 0x296f36a0u, 0x343e372cu, 0x36cd3649u, 0x34533b09u, 0x36d13b26u, 0x3805353fu, 0x341e36afu,
+ 0x30dc3805u, 0x388735a2u, 0x3a97369du, 0x3bc2341cu, 0x3bbe3a47u, 0x308c3ab5u, 0x31703836u, 0x38ac3a8cu,
+ 0x3b703437u, 0x38832f5fu, 0x2b8839c5u, 0x3a8738c8u, 0x38192c52u, 0x394e3423u, 0x3b7f2f98u, 0x31f43b28u,
+ 0x38b3352cu, 0x371539bfu, 0x2eaa3100u, 0x37493c00u, 0x37b83afbu, 0x2d9e3b61u, 0x3b702f4cu, 0x35093b94u,
+ 0x373d35afu, 0x321536a9u, 0x340e3b30u, 0x2c4c39a4u, 0x393b28f6u, 0x393e356du, 0x3b992e04u, 0x3b0339fdu,
+ 0x351f305eu, 0x384c35e5u, 0x2bc334c0u, 0x341335e7u, 0x324d362du, 0x39043431u, 0x35873636u, 0x3a2d3845u,
+ 0x38b33610u, 0x382d3bbbu, 0x3a593b47u, 0x36de2b84u, 0x3be53996u, 0x2df03756u, 0x300d387fu, 0x38103a03u,
+ 0x3af439cau, 0x38e63908u, 0x3abd3a09u, 0x28aa3af4u, 0x32ec3873u, 0x39303ae2u, 0x320536b9u, 0x39a1356du,
+ 0x2dfd328au, 0x3a1d3b1bu, 0x34ad3265u, 0x39aa3bc7u, 0x34ec38e2u, 0x290f34c9u, 0x298739d4u, 0x39d61cf9u,
+ 0x3a0d3b97u, 0x37c7378cu, 0x353236fau, 0x36e6382cu, 0x3b2f38c9u, 0x2d0a3bf6u, 0x31c83628u, 0x349935a2u,
+ 0x3a1d3196u, 0x3b5b37f1u, 0x2c49282cu, 0x2d233674u, 0x3be33434u, 0x325732b0u, 0x37f83897u, 0x360738a5u,
+ 0x306f3a9du, 0x398536dbu, 0x35ea3af2u, 0x2c6d388bu, 0x2c6d3173u, 0x349c39d3u, 0x2c4039cau, 0x3aaf3ae6u,
+ 0x26152db1u, 0x3ad42b34u, 0x38633383u, 0x3a5d36d2u, 0x380137d3u, 0x30ce3beau, 0x2aa03aa5u, 0x3b1737a4u,
+ 0x397b3952u, 0x36b23437u, 0x382c35deu, 0x353b3765u, 0x340334e3u, 0x30cc35d7u, 0x38d13afau, 0x398d3048u,
+ 0x339a3ac8u, 0x206930d2u, 0x3a192a0cu, 0x29bf3be6u, 0x2c9939fcu, 0x3a0c38bdu, 0x219935bfu, 0x3bee38c3u,
+ 0x3210341fu, 0x38712feeu, 0x3a5738c6u, 0x3b243a06u, 0x33ea3a72u, 0x34c23872u, 0x3b753547u, 0x3bcc3975u,
+ 0x384d36acu, 0x2ede37cbu, 0x38393393u, 0x3b742c50u, 0x32562fedu, 0x2e343a1fu, 0x39ce3b34u, 0x39892c64u,
+ 0x3a0f390eu, 0x39bf3aa0u, 0x352938b0u, 0x3ba83994u, 0x395138b3u, 0x3a0d36feu, 0x31223bfbu, 0x3851327au,
+ 0x389337b4u, 0x36782a48u, 0x38ae38aau, 0x39c33942u, 0x3a523922u, 0x384d3900u, 0x2e7a38d9u, 0x3838345fu,
+ 0x396f3afcu, 0x38bd2dc9u, 0x39df3318u, 0x38bf3a9fu, 0x356b38bbu, 0x3aea3724u, 0x382839c9u, 0x2a7335e4u,
+};
+
+// 256 u32 values
+static const uint32_t kCnnV3TestFeat1U32[256] = {
+ 0xc863b415u, 0x249c220fu, 0x603452c6u, 0x00000000u, 0x316a194cu, 0x291db2cbu, 0x5f96105bu, 0x00000000u,
+ 0xeb343d39u, 0xf1b365e6u, 0x61b71b05u, 0x00000000u, 0x8151bb9eu, 0xfc56bec5u, 0x3c1e7c24u, 0x00000000u,
+ 0xf1d859a5u, 0x1b1270e5u, 0x39d19474u, 0x00000000u, 0x569b30dcu, 0x097e59b6u, 0xd0d3b912u, 0x00000000u,
+ 0xdafc8a80u, 0x6222c0d8u, 0xd61d6364u, 0x00000000u, 0xc5c2f0c4u, 0xcd28e9d7u, 0xcd7e12c4u, 0x00000000u,
+ 0x92cfbc01u, 0x1c5ebffdu, 0xec699bb5u, 0x00000000u, 0x9bd12023u, 0xe6b94175u, 0xf58751d1u, 0x00000000u,
+ 0x2fe9e259u, 0x66f28558u, 0x314748e3u, 0x00000000u, 0x0d0aabfcu, 0xf7666903u, 0xec5d90aau, 0x00000000u,
+ 0xee86a635u, 0xe237f413u, 0xa61606fcu, 0x00000000u, 0x85ab0fd7u, 0xfdd13bdbu, 0x8d6075e2u, 0x00000000u,
+ 0xa476623cu, 0x3634aa37u, 0xbf284477u, 0x00000000u, 0xd1c78653u, 0xadb3feedu, 0x7fa4408au, 0x00000000u,
+ 0x32a77b6au, 0x08ac3716u, 0xa0976732u, 0x00000000u, 0xaeda1174u, 0xc5ca1e59u, 0xf353b939u, 0x00000000u,
+ 0x7f53105cu, 0xd44334dfu, 0xb75edbe4u, 0x00000000u, 0x46f67512u, 0xd859d32du, 0x0da6b677u, 0x00000000u,
+ 0x9950dc38u, 0xf0badec3u, 0xa8b1d193u, 0x00000000u, 0xefe357bdu, 0x0e606587u, 0x884c5ed2u, 0x00000000u,
+ 0xc7d63411u, 0xa46ee9f4u, 0xe16ad66fu, 0x00000000u, 0x766cf523u, 0xaebf1396u, 0x6b75be3bu, 0x00000000u,
+ 0xdf433db5u, 0x1e942c35u, 0x410dffe5u, 0x00000000u, 0x18c4cc46u, 0xb3bcd975u, 0x3b94557eu, 0x00000000u,
+ 0x512fefb1u, 0xd62e1684u, 0x5c34ef2bu, 0x00000000u, 0x25554402u, 0x055e5375u, 0x3a08ec40u, 0x00000000u,
+ 0xea28d1a6u, 0x8c71f892u, 0xfead5d3du, 0x00000000u, 0x3712d6e9u, 0x59fa8772u, 0x29c7e9cdu, 0x00000000u,
+ 0x65fc32ecu, 0x90357e43u, 0xcee18a15u, 0x00000000u, 0x5e3b5c50u, 0xc583129du, 0xa04bf996u, 0x00000000u,
+ 0x4ab43782u, 0xe9864a08u, 0x6f2ab1c6u, 0x00000000u, 0x26a77c61u, 0xf673703cu, 0xe9d6c9cfu, 0x00000000u,
+ 0x0caebeeeu, 0xe709951fu, 0xf2875771u, 0x00000000u, 0xd43f1577u, 0x41477617u, 0xa19bf431u, 0x00000000u,
+ 0x89ca27c9u, 0x9ec1ee6cu, 0x9dcf44adu, 0x00000000u, 0xa3a370ddu, 0x83958e74u, 0xb0c45102u, 0x00000000u,
+ 0x86cfafcau, 0x04382d70u, 0x09083cf1u, 0x00000000u, 0xf5458e26u, 0xe8c4a35bu, 0x95ea20cbu, 0x00000000u,
+ 0x2cb1e624u, 0xc80e252fu, 0x24aeadb9u, 0x00000000u, 0x60958ae8u, 0x5471b135u, 0x032c76bcu, 0x00000000u,
+ 0xce983976u, 0x827df87du, 0x50f5f0adu, 0x00000000u, 0x81d7362fu, 0x00000e99u, 0x6fde87aeu, 0x00000000u,
+ 0x85033eb4u, 0x56f7b265u, 0xd493d37cu, 0x00000000u, 0x3ff49a3cu, 0x23487a39u, 0x870d2e4fu, 0x00000000u,
+ 0xe3249135u, 0x60123a68u, 0x0befa03du, 0x00000000u, 0xf84d74b5u, 0x71bd7da9u, 0x2c44f6cbu, 0x00000000u,
+ 0x9d98f068u, 0x51d59a46u, 0xf0131dceu, 0x00000000u, 0x4b40fe50u, 0x8cd5b0fbu, 0x8b164f67u, 0x00000000u,
+ 0x3e10a2d3u, 0x7fd0d4b7u, 0x1bec231fu, 0x00000000u, 0xa4cc2cd6u, 0xc22121ffu, 0xf33350e7u, 0x00000000u,
+ 0x536659b7u, 0x49043fc2u, 0x8c7ec0d7u, 0x00000000u, 0xb1597a41u, 0xfe1228f2u, 0x066908e4u, 0x00000000u,
+ 0x3d0194e7u, 0x432be415u, 0x4160b66fu, 0x00000000u, 0x76b6560au, 0xdf770ab8u, 0x07ef4642u, 0x00000000u,
+ 0xd0dafe5cu, 0x9e1f95f4u, 0x9d7dbecdu, 0x00000000u, 0xada5c397u, 0x1d8b6a84u, 0xbf29cf46u, 0x00000000u,
+ 0x3f858ef0u, 0x843e3a0cu, 0xad47e23fu, 0x00000000u, 0x9a9c1e18u, 0x52b851a8u, 0x65648845u, 0x00000000u,
+ 0x79fca3a8u, 0x0a8f8f09u, 0xb9dde8cbu, 0x00000000u, 0x199671dfu, 0x7565be28u, 0xa7add019u, 0x00000000u,
+ 0x14948e21u, 0xfedcb64du, 0x6091bc31u, 0x00000000u, 0x040bae5bu, 0xa89c3b59u, 0x8ebdcac3u, 0x00000000u,
+};
+
+// 982 u32 values
+static const uint32_t kCnnV3TestWeightsU32[982] = {
+ 0xa8b23143u, 0x2f9432e3u, 0x3491b3cbu, 0x317e3104u, 0xa79fb324u, 0x3419acf6u, 0x32322d86u, 0xb13da859u,
+ 0xb4302831u, 0x2d0e324au, 0xad9630f5u, 0x338c3485u, 0xb1dd3158u, 0xb461a51du, 0x2f07b2a3u, 0x347d30b3u,
+ 0xacf9aeb0u, 0xb1f6a4adu, 0xa377b31bu, 0x2e85b13eu, 0x3263a8d4u, 0xaf352fb1u, 0x31da3261u, 0xb010ac52u,
+ 0xb2eb2f02u, 0xb4bbb1c3u, 0x2e553182u, 0x31642fe1u, 0x2948a64fu, 0xb367b2eau, 0xa4712e77u, 0x31172903u,
+ 0x281d2d2cu, 0xaf87288cu, 0xa8dcb481u, 0xab06b17bu, 0xb11c32c9u, 0xb033b43eu, 0x2e38afedu, 0x31732861u,
+ 0xab312e4fu, 0xb2653207u, 0xb3dfb495u, 0xa5db3045u, 0x1123b281u, 0x2f8ab2adu, 0xac92a823u, 0x2d01af9fu,
+ 0xb3ebad4eu, 0x346fb356u, 0x2fab33d8u, 0x3481b07fu, 0x302a315au, 0xb05fa7c7u, 0x33bbb3c0u, 0xb1b7a6cbu,
+ 0x2a16af74u, 0x32d9b235u, 0x303730f7u, 0x2ce3a937u, 0x2dc12a75u, 0xaa77b3fbu, 0x9b62b467u, 0xb2d3ae89u,
+ 0x2abbb39du, 0x3415b253u, 0xade12a3au, 0xb4952afbu, 0xa1703467u, 0xb401316eu, 0x9db6a019u, 0x29823434u,
+ 0xb079a412u, 0x225aae78u, 0xb498a8b1u, 0x339b3244u, 0x2826b2e8u, 0x2e9db384u, 0x2e1fb033u, 0x3128305cu,
+ 0x33fdb388u, 0xb471b12eu, 0xacf52836u, 0x31eb3255u, 0x3459af06u, 0x20a0b004u, 0x3430b0afu, 0xb45eb271u,
+ 0x34baa8fcu, 0x30c63385u, 0x338e3381u, 0xaf1121cbu, 0x2e353139u, 0xb3c9acdau, 0xb09030bdu, 0xb0f93432u,
+ 0x325bb33eu, 0xb228b2a8u, 0x33312ba2u, 0xaf49b1d4u, 0x34883154u, 0xb2d60f49u, 0xb131b4abu, 0x2ed2b312u,
+ 0x1bc7b343u, 0x2a3b2f76u, 0x31d7b1c4u, 0x30973023u, 0xb339b315u, 0xabde341bu, 0x9f04afa5u, 0x34602e41u,
+ 0x3414b01au, 0x283db490u, 0xb3912d25u, 0xaa36b2e8u, 0x2b60347au, 0x31d83428u, 0x3178a503u, 0xb381b4a1u,
+ 0x31b33253u, 0x24bab122u, 0x33102c12u, 0xaab72bebu, 0xa9b1acd5u, 0x330e2dd6u, 0xb0d7a715u, 0x30b9b10eu,
+ 0xb3943214u, 0x2b41b429u, 0x323cb2cbu, 0xb2d6af48u, 0xb26c340bu, 0xb2a7b022u, 0xb499b362u, 0xb23fb445u,
+ 0x2b00b44au, 0xac162ef0u, 0x1990aefdu, 0x32be3333u, 0xb21db462u, 0xb0d0b10eu, 0xaa6e2978u, 0xacdab454u,
+ 0xb3a6234cu, 0xb44d3267u, 0xb3b23414u, 0x33bb3299u, 0x31cd349bu, 0x2d79315eu, 0xb304315bu, 0x205f258au,
+ 0xa5b732deu, 0x2d5cac6au, 0xb2ebb07cu, 0xaa62a2ccu, 0xad16b122u, 0xaea0ad21u, 0x2f22aca1u, 0x344fafcdu,
+ 0xa1dd33feu, 0x2571ae97u, 0x2ddc32b1u, 0x250731d8u, 0xb0112d1bu, 0xb1b73083u, 0x32ed2f7bu, 0x2c64b310u,
+ 0x3055b3c6u, 0x342fb3fau, 0x3468b2f6u, 0x2b3231c7u, 0x31ab316du, 0xb0bc3448u, 0xb3c62aebu, 0xb2502c76u,
+ 0x299028fdu, 0x22f4a53au, 0x31bf3111u, 0x2ba69cd2u, 0xb34d3424u, 0xb3eab35au, 0xaa402e10u, 0x2e933144u,
+ 0x33a6ae63u, 0xb068310au, 0xaf20ad37u, 0xb2c3b293u, 0xa8c53430u, 0x3069ac7bu, 0x34302812u, 0xa2563162u,
+ 0x34acacbfu, 0x3455302eu, 0x32bbb353u, 0xb3422d43u, 0x2f252ac7u, 0xa704b4afu, 0xafdc323fu, 0xa86ea65eu,
+ 0x3404af9bu, 0xb37a3167u, 0x334834c6u, 0x3278b026u, 0x34cbb38fu, 0x2dc42e5du, 0x339fb3ddu, 0xb0fab486u,
+ 0x3150b2dbu, 0x33e2b1cbu, 0xb4742e00u, 0xb44eb4bfu, 0x31ca2c11u, 0x32b5b105u, 0x31c7b440u, 0x3139341bu,
+ 0x327d2f9cu, 0xb1bab46au, 0x1991b334u, 0x2cfe30b5u, 0xb29f32beu, 0xb1e53081u, 0x3008b067u, 0x2c49349cu,
+ 0x2c77b447u, 0x3360b465u, 0xb2473006u, 0xb213b3d7u, 0xa65d349cu, 0x2d3d3174u, 0xb2d02990u, 0xafa13448u,
+ 0x2fac29feu, 0x343b2dbbu, 0x1d22b2c0u, 0xa3efab5fu, 0xb306b350u, 0xaf80b043u, 0x2c43a989u, 0xaac62d2bu,
+ 0xb16cab01u, 0xaf072ac8u, 0xaa44b474u, 0xb145a3f2u, 0x290d2991u, 0x2dae2fc2u, 0xaf0f2ddau, 0x278c3185u,
+ 0x2cd7a944u, 0x1fd4ad5au, 0x336b308bu, 0x1877340bu, 0x31c2223au, 0x327aaf20u, 0xb3609b33u, 0x3291b41cu,
+ 0xb036b444u, 0xb247ae5fu, 0x30a9af26u, 0x3248b4a9u, 0xace832d9u, 0x2bbfb2a7u, 0xad30b34du, 0x34c23467u,
+ 0xaf423139u, 0x2fe32f35u, 0x2d69ac4fu, 0xb196b4b2u, 0xb27523b5u, 0x3275b26au, 0x284c34b2u, 0x34b53283u,
+ 0xa7f3b2e2u, 0xb408ac20u, 0xa91630e7u, 0xb2b5a4b6u, 0x33d1b220u, 0xb121b45fu, 0x9e06affcu, 0x9c1f2aa4u,
+ 0xb0ecb3fcu, 0x2d493299u, 0x2e892dbau, 0xb43e310cu, 0x2612ad1fu, 0x329dae34u, 0x3128a15bu, 0x19e332c2u,
+ 0x2ab133ddu, 0xae1f32bau, 0x24d391d1u, 0xabcbb396u, 0x2d063402u, 0xae30b231u, 0xb490b1ecu, 0xa7f5341bu,
+ 0x2b90af64u, 0xb043b4bbu, 0x2d232fccu, 0x2c9f34a0u, 0x3105a2e9u, 0x303d33beu, 0x316a3472u, 0xb369330bu,
+ 0xa89a3076u, 0x2deb2814u, 0x34a73483u, 0x307db011u, 0xade530cdu, 0xb468b339u, 0x9e543153u, 0xa56134a9u,
+ 0xaaca3497u, 0xb3f931a4u, 0x31cd2842u, 0x32323414u, 0xace3b472u, 0xb380b455u, 0x30182ebbu, 0x33043141u,
+ 0x31c73099u, 0xb119b454u, 0x32e02caeu, 0x207eb4c2u, 0xb4842ecfu, 0x3399ab93u, 0xb1092e97u, 0xadd632c6u,
+ 0xafb832c9u, 0xabea2af0u, 0x336cb053u, 0xb3f9b200u, 0x302eae12u, 0x34ca31e7u, 0xab12afd2u, 0x29c0b2f9u,
+ 0x2fb734c7u, 0xac222b50u, 0x979433f9u, 0xad2bb305u, 0xb1b9b428u, 0xa72db4a1u, 0xae042d2bu, 0x3469aa1du,
+ 0x264730d6u, 0x339fb023u, 0xaeb5b116u, 0x248a33dbu, 0x2af830a7u, 0xafb42de4u, 0xaed1b0f7u, 0x3330b29eu,
+ 0x28b9b029u, 0x3173319bu, 0xa34ba8bbu, 0x2eb434c0u, 0x33bb320bu, 0xb20b3186u, 0xb3a528c6u, 0x345f2ddfu,
+ 0xa9261fd3u, 0x346ab475u, 0xb468b39fu, 0xb42cb0e0u, 0x20f1a6e5u, 0xb450af33u, 0xac6fb375u, 0x2f9cb438u,
+ 0xaf9ab1a0u, 0xaa68ac11u, 0xb373b4c9u, 0xb4ca32f7u, 0x9e731d05u, 0xa946ae69u, 0x328d3163u, 0xaed1b09au,
+ 0xa230b0f2u, 0xb1382f0au, 0x3422ae80u, 0xa607b455u, 0xb2b63010u, 0xb2f2b458u, 0xb4b63405u, 0xb480b1fcu,
+ 0x2c9db37au, 0x2951b0f6u, 0x32b62aedu, 0x32c9b4c1u, 0xb27a2c93u, 0x32d3313eu, 0x3405b0b8u, 0x2bf1a6ffu,
+ 0xad5134a7u, 0xaef93203u, 0x2bbd31bfu, 0xaa9ab172u, 0xb40daf01u, 0xade5b483u, 0xb26cb49eu, 0x2ffe3053u,
+ 0xaf053095u, 0x2b35337du, 0xb2d7b32eu, 0xb2482f6au, 0x34b91c7bu, 0xb4a4b4c3u, 0x2a8034bcu, 0x33a1b32au,
+ 0x258f334eu, 0xb05b2cadu, 0x2b43b451u, 0x2e48afe2u, 0xb4a03275u, 0xb1292b5au, 0xb0bb332eu, 0x281d2c41u,
+ 0x2ed2abf9u, 0x29243056u, 0x34a430f6u, 0x207baa33u, 0x31afb4aeu, 0xab122237u, 0x337bb3cbu, 0x2f03ac08u,
+ 0x346eb2bbu, 0xb1c5b22du, 0x33ec32e4u, 0xa4a3b187u, 0x3344307cu, 0x213aaca5u, 0x307030a4u, 0x295f316fu,
+ 0x33c2b397u, 0x31b93305u, 0xb1adb3afu, 0xb49430adu, 0xacb8349au, 0x33713036u, 0xaef2ac0cu, 0x2a382c2du,
+ 0x2bd1aafau, 0xa4f8342eu, 0xacb7b1d3u, 0xb315ac11u, 0x2f16b279u, 0x345eae2au, 0xb3b3b0ecu, 0x335130deu,
+ 0xb1b8b043u, 0x22c3b209u, 0xb09ea4dau, 0xa1b0b45eu, 0x2ddb3469u, 0xb37a9986u, 0xafe1b0c1u, 0x333c3116u,
+ 0x34a733b6u, 0x345934a1u, 0xb2f4b41cu, 0x2810af82u, 0x32a4b3bdu, 0x26822c7cu, 0xb0bdb26du, 0x32c2b286u,
+ 0x30842a78u, 0xacf2afd5u, 0x30feab4fu, 0xb1a1313bu, 0xb349343du, 0xb3ac339bu, 0x32a7b085u, 0xaaa3b227u,
+ 0xb0e4a7b4u, 0x32bf3009u, 0xae2c3331u, 0xb0d524bdu, 0xb281b0e6u, 0x33733439u, 0x216d3153u, 0x24929dc5u,
+ 0xa907259eu, 0xb330b312u, 0xa1853457u, 0xb276345au, 0xb19d282eu, 0xb483b0bdu, 0x3400b351u, 0x2c27aedau,
+ 0xaba5a560u, 0xb20124e6u, 0x321d34b1u, 0xa4cc30a4u, 0x340ab2aau, 0xb452ae17u, 0x31a7ae0du, 0x30d12cb6u,
+ 0xb18831a3u, 0xa8c33411u, 0xb4c72d57u, 0xb03534bdu, 0xa669b434u, 0xb2adb31fu, 0xac1d2d14u, 0xaef2340du,
+ 0xa5fa3058u, 0x2ba82e24u, 0x3452a42eu, 0xb232ae0au, 0x32a52ed7u, 0xa7bdb46au, 0x30cb3389u, 0x24d334b9u,
+ 0xaf962e25u, 0xad22344cu, 0xab703094u, 0x303828bfu, 0x33d11d7du, 0x2da6aa33u, 0xab0eae0eu, 0xb32fa89bu,
+ 0x2e6eb3d7u, 0x2e412df9u, 0xaea9b49cu, 0x3157b1cdu, 0xb0dd32edu, 0xb31e2e72u, 0xb2f0b051u, 0x2eb6b028u,
+ 0xb1b633e1u, 0x2fbc2677u, 0x2c3a3459u, 0x2cd2b0b4u, 0x3492aee5u, 0x2f1fabb4u, 0xabadb494u, 0x2c3c3334u,
+ 0x342fa84eu, 0xaed432ccu, 0x9e4126d8u, 0xae5f2d14u, 0x33ecb0e9u, 0x32983412u, 0x30b43497u, 0x310b3115u,
+ 0xa6b0a1f5u, 0x2d90b0fdu, 0xb0a6b00cu, 0xaaea2a9au, 0x3211b166u, 0xb26132ecu, 0xae4bb4bau, 0x328331b6u,
+ 0x3144b2f2u, 0xac39b361u, 0xb1e734c9u, 0xa9eeb2a5u, 0x34282c98u, 0xb422349eu, 0x3195b2f3u, 0x3364b067u,
+ 0xb3742e47u, 0xb2373262u, 0x2802a9e0u, 0x2f260f88u, 0x34b92dddu, 0x210e34cbu, 0x3060b3c6u, 0xaefa3493u,
+ 0xa9c7a5f5u, 0x2e9db44eu, 0x3185ada2u, 0x322b3260u, 0xaedaaa66u, 0x31c930c7u, 0x338c9d5fu, 0x3347b2d6u,
+ 0xb2a7b3c1u, 0xb4272533u, 0xb4ccb44bu, 0x3146a8e4u, 0x2dd9b477u, 0xb2603234u, 0x32edb2c8u, 0x25fb3234u,
+ 0xb3d23221u, 0x3091ac0du, 0x30d8b0a8u, 0x30ce1922u, 0xad13a56du, 0xb199b164u, 0xb35130d4u, 0xb237b3f3u,
+ 0x3234b23cu, 0x34b5abcbu, 0x2de5b2f7u, 0xac1da7e0u, 0x34913334u, 0xb1e83329u, 0x2dfeb15cu, 0x3082b00du,
+ 0x27d2291au, 0x300b324fu, 0xb453b48bu, 0xa03e2bcdu, 0xaa3fb09bu, 0x324530f4u, 0xac6728b4u, 0x22bcb067u,
+ 0x28ecaf1du, 0xb42a2ec9u, 0xb186b4cau, 0x2d9e3393u, 0x337eb24cu, 0xa358a49du, 0xb43b342eu, 0x2a96b178u,
+ 0x2ee5b1d9u, 0xb3b2b17du, 0xa9efb1a0u, 0xb304b238u, 0xaee632ebu, 0xb445ad59u, 0xb084ad7au, 0xb1a82c3eu,
+ 0x34203375u, 0xb38ab3afu, 0x2c97b363u, 0x32c4b342u, 0x322430e2u, 0x34c52401u, 0xab95b460u, 0x344faec3u,
+ 0x31e129e7u, 0x3498b254u, 0x9bc792b0u, 0xac943485u, 0x2a82abdfu, 0xaedfb324u, 0xb39eb1e4u, 0x2adc3308u,
+ 0xab04ad6du, 0x2d1da942u, 0x34283419u, 0xae8eac2bu, 0x2938a785u, 0x2e19a5ddu, 0xaee2282eu, 0xaca9adedu,
+ 0xb269b3c8u, 0xac423049u, 0x28d0b17bu, 0xb0a830e8u, 0x341b3246u, 0x2e7d2b7bu, 0x3447b44cu, 0x3412ac21u,
+ 0xb4c12a11u, 0x2eeeb46eu, 0xb48c28d0u, 0x32c330adu, 0xa5429867u, 0x3181b4bfu, 0x3346ae80u, 0x28fface0u,
+ 0xb284b069u, 0x98313147u, 0x3478259bu, 0x32d43473u, 0x2b3db1ffu, 0xacd23343u, 0x31f8b3a9u, 0x28f2a19au,
+ 0x2c1e34a1u, 0xacfea9c3u, 0xb354aa54u, 0x327c31a9u, 0xaf89b024u, 0xb4923330u, 0xa5942249u, 0xb2762f4du,
+ 0xacbc2e77u, 0xb46a2c8fu, 0x22d6a5e1u, 0xb2ebb17cu, 0xadfcb1fbu, 0x344bb3beu, 0x31efab4cu, 0xb406348eu,
+ 0x33222dd3u, 0xa9732d28u, 0xb475a94bu, 0x34bd2551u, 0xb46a3467u, 0xaf9d2fe7u, 0xb45e33bdu, 0x327b342fu,
+ 0xb43ab358u, 0xb29c326du, 0x33fb255cu, 0xabe4292eu, 0xb432a839u, 0xaf453480u, 0xb01cb1f7u, 0xb3ca3475u,
+ 0x2ffb330du, 0x34923256u, 0x34423292u, 0x31aeb32au, 0x26ae2704u, 0x348833a9u, 0xa13e2fdeu, 0xac5da6b2u,
+ 0xac2bad51u, 0xb40eb3e0u, 0xa8243160u, 0x2c4f3422u, 0xb293b476u, 0x3491b1a6u, 0x20d7b167u, 0x34953430u,
+ 0xb454b110u, 0xb431b2a7u, 0x2944b1dbu, 0x25b833ebu, 0xb1b92cd7u, 0x2c8631c5u, 0x344f32feu, 0xb3dcadd7u,
+ 0xa8c6b143u, 0xacac3447u, 0x26383133u, 0xaee2b217u, 0x2ea32de2u, 0xb136a903u, 0xb25c3043u, 0xb1fe3431u,
+ 0x95c8b38au, 0x1d092201u, 0x34c0a8ddu, 0xa2c6a037u, 0xb429aa02u, 0xb1392b24u, 0xb4502d49u, 0xa4c3349fu,
+ 0xa382339eu, 0x3230b439u, 0x31692db1u, 0x2d4ca984u, 0x2d0832d7u, 0x2e3eadd9u, 0x346c2e99u, 0xa9c1acf5u,
+ 0x1b1531feu, 0xa630308bu, 0x30b5b176u, 0xb1cab315u, 0x30bf2cdfu, 0xb05e3394u, 0x3461b379u, 0xb1ceb2a3u,
+ 0x2484b002u, 0x3101335au, 0xb23d2ffeu, 0xa2a2ad77u, 0xb05ab2c5u, 0xb328af5cu, 0x2fe027b8u, 0xa1cdb421u,
+ 0x30c3316bu, 0xa7f3320du, 0xb0fa3228u, 0xb237ad58u, 0xac25a4ffu, 0x30d6b0f9u, 0xb05daf26u, 0xa8b3b316u,
+ 0xad05af91u, 0x310db112u, 0x95153421u, 0x330d2d7eu, 0x2c5c34cau, 0x33d8337cu, 0xa6003199u, 0xb4a6b405u,
+ 0x9afca301u, 0x293cab1du, 0x32772802u, 0xb147b384u, 0x32ae31a0u, 0x332934c0u, 0xaf21337cu, 0xb074ab8du,
+ 0x34362c15u, 0x2861adf7u, 0xb257afd2u, 0xb43eb1dau, 0xab753450u, 0x2ef7326au, 0xb23bb1eau, 0xb08431fdu,
+ 0xb2e933efu, 0x320a2ed3u, 0xb04cb115u, 0xb0853476u, 0xa92fa961u, 0x314cb0beu, 0xacac3138u, 0xb182326fu,
+ 0x2977b4c2u, 0xb46e34beu, 0x21d0b19bu, 0xb3e431f3u, 0x30beac06u, 0xb1f9acefu, 0xaae9b1d9u, 0x32f2a253u,
+ 0x2f502d76u, 0xaac534a3u, 0x3410ab6cu, 0x306ab0e1u, 0xb3b930e3u, 0xb1c0a4e1u, 0xa58eb20au, 0x31afb00bu,
+ 0x31d033d3u, 0xb056b07eu, 0xb302b0b6u, 0xab863443u, 0x32a8b149u, 0xb476abbeu, 0x2d59b27eu, 0xb18d316fu,
+ 0x32bd3402u, 0xb13b32b0u, 0x330eb451u, 0x2c91af75u, 0xb32330feu, 0x34363164u, 0xa8843063u, 0x2438333bu,
+ 0x322730b3u, 0xb01330cau, 0xb106b356u, 0x1a7c1711u, 0xac072a59u, 0x33ef3182u, 0x308bb0ddu, 0x23182ed6u,
+ 0x31abb1fbu, 0x3042346cu, 0x34a63403u, 0x2d4fadaeu, 0x34aca401u, 0xaabbb3cdu, 0xb0253336u, 0xac803187u,
+ 0x335d349cu, 0x3130afa1u, 0x2c16ae44u, 0x32542e16u, 0xb15fb491u, 0x28e13012u, 0xb4459967u, 0x3303add4u,
+ 0xabf4b481u, 0xb4503446u, 0xb0a730cbu, 0x30b9330au, 0xb1743368u, 0x315d24ceu, 0xb12bb0e2u, 0x3467b48au,
+ 0x2d353008u, 0x24db30afu, 0xa7c0a355u, 0xb1bb2d65u, 0x31fc3373u, 0x2a0aadc7u, 0x3489b2e4u, 0x340d33bcu,
+ 0xb25aae72u, 0xb3dfb189u, 0xb41eb34bu, 0xb41734b4u, 0x3249b30eu, 0x31452a2au, 0x33562d07u, 0x22cc2d51u,
+ 0xa80a3016u, 0xa4f8ade9u, 0x31b2b257u, 0xb2a7b098u, 0x3291b0dfu, 0x2f25ae85u, 0xb40e32d8u, 0xaf633117u,
+ 0x2f41b171u, 0xafdf33deu, 0xb0092949u, 0x2cae282bu, 0x2cbf2c3du, 0x2f6fb302u, 0x2f032eb6u, 0xa815b417u,
+ 0xb3efaafcu, 0xa889b155u, 0xb10c30aau, 0x27093255u, 0xa4373100u, 0x2c998fecu, 0x9d08341cu, 0x2bd52467u,
+ 0xb41c308fu, 0xa1a820d2u, 0x9bfb340cu, 0x3470a4deu, 0xb221a797u, 0xb3c6b16fu, 0xa93ab4cbu, 0x307c312bu,
+ 0x2960b47eu, 0x343cb2a3u, 0x2cddb263u, 0x2d70aeacu, 0x30fcaabdu, 0x33cbb12cu, 0x30eb081fu, 0x34ca32b7u,
+ 0xb017b14du, 0xb08eb0cbu, 0x34bd2a55u, 0x241d34b5u, 0x32e92d57u, 0xb1b03428u, 0x32642462u, 0xa89faf8fu,
+ 0x34a834a9u, 0x31bb33ccu, 0xaee1b06fu, 0x28ab2743u, 0x2d401ac3u, 0x30c9ab0au, 0xae81ae64u, 0xb1fab328u,
+ 0xa82032bbu, 0x30fbb13au, 0xb4992825u, 0xb234af0du, 0xb0522dfcu, 0xb446b42du, 0xb4972a47u, 0xb29e32b2u,
+ 0xa83f2c18u, 0xb41ca864u, 0x338c31d0u, 0xb22cb4b2u, 0x279a33c1u, 0xb1b5b2b8u, 0x30512e25u, 0x345a2ba3u,
+ 0xafab9b4bu, 0xad64a2feu, 0xb45cb14bu, 0x300fadadu, 0xa8acb49fu, 0x2c3d2d88u, 0x31f63150u, 0xb3a03011u,
+ 0x2bf1a3acu, 0xb464b0e3u, 0xa6eeb14fu, 0xb235aa9cu, 0x3416323bu, 0x3420b1bcu, 0x3414b4a1u, 0xb4af3457u,
+ 0x3484310du, 0x348533cbu, 0xb40d27bbu, 0x2c5f32b7u, 0xaa5b2c68u, 0xb2a72984u,
+};
+
+// 256 uint16 values (raw f16 bits)
+static const uint16_t kCnnV3ExpectedEnc0U16[256] = {
+ 0x0000u, 0x0000u, 0x350cu, 0x3b3cu, 0x19bcu, 0x0000u, 0x0000u, 0x3d10u,
+ 0x31e9u, 0x0000u, 0x35d0u, 0x39c3u, 0x0000u, 0x0000u, 0x2c6fu, 0x35fbu,
+ 0x39b9u, 0x0000u, 0x0000u, 0x3538u, 0x2ebbu, 0x0000u, 0x34f8u, 0x0000u,
+ 0x0000u, 0x0000u, 0x0000u, 0x3c96u, 0x0000u, 0x3029u, 0x0000u, 0x0000u,
+ 0x0000u, 0x0000u, 0x0000u, 0x405au, 0x0000u, 0x367eu, 0x0000u, 0x3d2fu,
+ 0x383bu, 0x0000u, 0x342cu, 0x3f97u, 0x0000u, 0x3c3cu, 0x0000u, 0x424eu,
+ 0x0000u, 0x0000u, 0x0000u, 0x3a3au, 0x0000u, 0x3d8fu, 0x0000u, 0x3fd4u,
+ 0x307du, 0x0000u, 0x0000u, 0x3f68u, 0x0000u, 0x0000u, 0x0000u, 0x3c81u,
+ 0x0000u, 0x0000u, 0x398fu, 0x3ffeu, 0x0000u, 0x0000u, 0x0000u, 0x3ec1u,
+ 0x0000u, 0x39b8u, 0x0000u, 0x3c61u, 0x0000u, 0x2e3au, 0x3699u, 0x41deu,
+ 0x0000u, 0x0000u, 0x0000u, 0x3d2cu, 0x329au, 0x0000u, 0x0000u, 0x41a9u,
+ 0x2d70u, 0x342fu, 0x0000u, 0x4066u, 0x2c77u, 0x0000u, 0x37b7u, 0x3842u,
+ 0x2b9au, 0x0000u, 0x3655u, 0x4001u, 0x340au, 0x0000u, 0x30f5u, 0x41a5u,
+ 0x0000u, 0x0000u, 0x0000u, 0x3d05u, 0x0000u, 0x0000u, 0x30a6u, 0x40a3u,
+ 0x0000u, 0x0000u, 0x0000u, 0x4263u, 0x0000u, 0x0000u, 0x0000u, 0x3e62u,
+ 0x0000u, 0x0000u, 0x0000u, 0x42d7u, 0x0000u, 0x0000u, 0x0000u, 0x3de8u,
+ 0x0000u, 0x0000u, 0x0000u, 0x3f4du, 0x0000u, 0x38d4u, 0x3a61u, 0x3fb7u,
+ 0x0000u, 0x0000u, 0x0000u, 0x404cu, 0x3811u, 0x31a4u, 0x0000u, 0x3edfu,
+ 0x0000u, 0x0000u, 0x0000u, 0x3f30u, 0x0000u, 0x0000u, 0x0000u, 0x3ec7u,
+ 0x27dau, 0x0000u, 0x0000u, 0x3efeu, 0x0000u, 0x3027u, 0x0000u, 0x39ceu,
+ 0x28e8u, 0x0000u, 0x0000u, 0x4121u, 0x0000u, 0x0000u, 0x0000u, 0x40eeu,
+ 0x3b70u, 0x3379u, 0x0000u, 0x40d3u, 0x0000u, 0x0000u, 0x0000u, 0x3d88u,
+ 0x329du, 0x0000u, 0x0000u, 0x3fafu, 0x35c0u, 0x0000u, 0x374cu, 0x40ceu,
+ 0x32b4u, 0x2c9au, 0x0000u, 0x4094u, 0x3105u, 0x31f4u, 0x34e9u, 0x3cd7u,
+ 0x0000u, 0x0000u, 0x344bu, 0x3cd1u, 0x0000u, 0x2d13u, 0x0000u, 0x3e7eu,
+ 0x0000u, 0x2eacu, 0x0000u, 0x4123u, 0x0000u, 0x36edu, 0x0000u, 0x3c69u,
+ 0x0000u, 0x0000u, 0x0000u, 0x41d5u, 0x0000u, 0x36e4u, 0x0000u, 0x4049u,
+ 0x0000u, 0x0000u, 0x0000u, 0x401du, 0x0000u, 0x38d1u, 0x333au, 0x3b08u,
+ 0x0000u, 0x0000u, 0x0000u, 0x3d12u, 0x0000u, 0x0000u, 0x0000u, 0x3e6eu,
+ 0x0000u, 0x0000u, 0x0000u, 0x4028u, 0x0000u, 0x0000u, 0x0000u, 0x3f64u,
+ 0x0000u, 0x0000u, 0x0000u, 0x3e4bu, 0x2eeau, 0x393cu, 0x0000u, 0x4007u,
+ 0x0000u, 0x267fu, 0x0000u, 0x3eabu, 0x35b4u, 0x38f9u, 0x0000u, 0x3e6bu,
+};
+
+// kCnnV3Dec1HW = (W/2) x (H/2) = 4 x 4
+// 64 uint16 values (raw f16 bits)
+static const uint16_t kCnnV3ExpectedDec1U16[64] = {
+ 0x0000u, 0x2692u, 0x3823u, 0x397eu, 0x0000u, 0x22dcu, 0x35dcu, 0x35f9u,
+ 0x0000u, 0x3936u, 0x24b5u, 0x3434u, 0x0000u, 0x3b63u, 0x0000u, 0x32fcu,
+ 0x0000u, 0x2913u, 0x3523u, 0x33d6u, 0x0000u, 0x3023u, 0x2575u, 0x0000u,
+ 0x0000u, 0x39edu, 0x0000u, 0x0000u, 0x0000u, 0x3c91u, 0x0000u, 0x0000u,
+ 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u,
+ 0x0000u, 0x3754u, 0x0000u, 0x0000u, 0x318cu, 0x3a4du, 0x0000u, 0x0000u,
+ 0x3206u, 0x32deu, 0x0000u, 0x0000u, 0x317du, 0x3437u, 0x0000u, 0x0000u,
+ 0x312au, 0x357fu, 0x0000u, 0x0000u, 0x0000u, 0x39b5u, 0x0000u, 0x0000u,
+};
+
+// 256 uint16 values (raw f16 bits)
+static const uint16_t kCnnV3ExpectedOutputU16[256] = {
+ 0x3800u, 0x3934u, 0x3800u, 0x38aau, 0x384au, 0x3800u, 0x3800u, 0x3917u,
+ 0x38d5u, 0x3800u, 0x3800u, 0x38f2u, 0x3800u, 0x38c9u, 0x3800u, 0x38d4u,
+ 0x3800u, 0x3800u, 0x3800u, 0x3800u, 0x3800u, 0x38dau, 0x3800u, 0x3800u,
+ 0x3800u, 0x383eu, 0x3800u, 0x3800u, 0x3800u, 0x3800u, 0x3800u, 0x3800u,
+ 0x396du, 0x38eeu, 0x3800u, 0x3a87u, 0x3899u, 0x3800u, 0x3800u, 0x3972u,
+ 0x3a4au, 0x3800u, 0x3800u, 0x3847u, 0x386du, 0x3800u, 0x3800u, 0x3a70u,
+ 0x3800u, 0x381fu, 0x3800u, 0x3800u, 0x3800u, 0x3945u, 0x3800u, 0x392eu,
+ 0x3800u, 0x3800u, 0x3800u, 0x3844u, 0x3800u, 0x3800u, 0x3820u, 0x3800u,
+ 0x3a6du, 0x3832u, 0x3800u, 0x3ab0u, 0x3909u, 0x3800u, 0x3800u, 0x3a12u,
+ 0x3873u, 0x3800u, 0x3800u, 0x39b8u, 0x3a9au, 0x3800u, 0x3800u, 0x3a41u,
+ 0x3800u, 0x3800u, 0x3800u, 0x38d0u, 0x3952u, 0x3800u, 0x3800u, 0x398cu,
+ 0x3800u, 0x3800u, 0x3800u, 0x3a21u, 0x3800u, 0x3800u, 0x3800u, 0x3800u,
+ 0x3950u, 0x3800u, 0x3800u, 0x3abdu, 0x39ccu, 0x3800u, 0x3800u, 0x39e0u,
+ 0x3800u, 0x3800u, 0x3800u, 0x3a62u, 0x38d7u, 0x3800u, 0x3800u, 0x3a23u,
+ 0x3858u, 0x3800u, 0x3800u, 0x39f8u, 0x3800u, 0x3800u, 0x3800u, 0x3a01u,
+ 0x38e7u, 0x3800u, 0x3800u, 0x3822u, 0x38fcu, 0x3800u, 0x3832u, 0x3800u,
+ 0x3840u, 0x383au, 0x3800u, 0x3b39u, 0x390du, 0x3800u, 0x3800u, 0x399bu,
+ 0x3800u, 0x3800u, 0x3800u, 0x39c2u, 0x3802u, 0x3800u, 0x3800u, 0x3a41u,
+ 0x398bu, 0x3800u, 0x3800u, 0x39fau, 0x3800u, 0x3800u, 0x3800u, 0x396au,
+ 0x38d3u, 0x3800u, 0x3800u, 0x3888u, 0x3909u, 0x3800u, 0x3800u, 0x3800u,
+ 0x3863u, 0x3800u, 0x3800u, 0x3ae8u, 0x3a06u, 0x3800u, 0x3800u, 0x3a7du,
+ 0x38c1u, 0x3800u, 0x3800u, 0x3a20u, 0x38cdu, 0x3800u, 0x3800u, 0x390cu,
+ 0x3820u, 0x3800u, 0x3800u, 0x39d5u, 0x3863u, 0x3800u, 0x3800u, 0x389cu,
+ 0x3800u, 0x3800u, 0x3800u, 0x38bcu, 0x3887u, 0x3800u, 0x3866u, 0x3800u,
+ 0x38bbu, 0x3800u, 0x3800u, 0x3a8du, 0x394cu, 0x3800u, 0x3800u, 0x39b9u,
+ 0x394au, 0x3800u, 0x3800u, 0x3977u, 0x3800u, 0x3800u, 0x3800u, 0x3906u,
+ 0x3800u, 0x3800u, 0x386bu, 0x3a02u, 0x38bbu, 0x3800u, 0x3800u, 0x39d7u,
+ 0x38a2u, 0x3800u, 0x3800u, 0x3800u, 0x3899u, 0x3800u, 0x3811u, 0x3800u,
+ 0x3830u, 0x3800u, 0x387au, 0x3918u, 0x386au, 0x3800u, 0x38acu, 0x39f0u,
+ 0x39c7u, 0x3800u, 0x38beu, 0x3988u, 0x38c3u, 0x3800u, 0x3930u, 0x39d5u,
+ 0x397bu, 0x3800u, 0x3918u, 0x3a09u, 0x394cu, 0x3800u, 0x3952u, 0x3961u,
+ 0x3980u, 0x3800u, 0x392eu, 0x3872u, 0x39c2u, 0x3800u, 0x3903u, 0x3800u,
+};
+
diff --git a/cnn_v3/training/gen_test_vectors.py b/cnn_v3/training/gen_test_vectors.py
new file mode 100644
index 0000000..640971c
--- /dev/null
+++ b/cnn_v3/training/gen_test_vectors.py
@@ -0,0 +1,451 @@
+#!/usr/bin/env python3
+# CNN v3 parity reference — numpy forward pass matching WGSL shaders exactly.
+# Generates test vectors for C++ GPU parity validation.
+#
+# Usage:
+# python3 cnn_v3/training/gen_test_vectors.py # self-test only
+# python3 cnn_v3/training/gen_test_vectors.py --header # emit C header to stdout
+
+import numpy as np
+import struct
+import sys
+import argparse
+
+# ---------------------------------------------------------------------------
+# Weight layout (f16 units, matching C++ cnn_v3_effect.cc constants)
+# ---------------------------------------------------------------------------
+
+ENC0_IN, ENC0_OUT = 20, 4
+ENC1_IN, ENC1_OUT = 4, 8
+BN_IN, BN_OUT = 8, 8
+DEC1_IN, DEC1_OUT = 16, 4
+DEC0_IN, DEC0_OUT = 8, 4
+
+# Per-layer f16 counts: IN * OUT * K*K conv weights followed by OUT biases.
+ENC0_WEIGHTS = ENC0_IN * ENC0_OUT * 9 + ENC0_OUT # 724
+ENC1_WEIGHTS = ENC1_IN * ENC1_OUT * 9 + ENC1_OUT # 296
+BN_WEIGHTS = BN_IN * BN_OUT * 1 + BN_OUT # 72
+DEC1_WEIGHTS = DEC1_IN * DEC1_OUT * 9 + DEC1_OUT # 580
+DEC0_WEIGHTS = DEC0_IN * DEC0_OUT * 9 + DEC0_OUT # 292
+
+# Layers are stored back to back in this order; each offset is in f16 units.
+ENC0_OFFSET = 0
+ENC1_OFFSET = ENC0_OFFSET + ENC0_WEIGHTS
+BN_OFFSET = ENC1_OFFSET + ENC1_WEIGHTS
+DEC1_OFFSET = BN_OFFSET + BN_WEIGHTS
+DEC0_OFFSET = DEC1_OFFSET + DEC1_WEIGHTS
+TOTAL_F16 = DEC0_OFFSET + DEC0_WEIGHTS # 1672 + 292 = 1964
+# Derivation of the total:
+# ENC0: 20*4*9 + 4 = 720+4 = 724
+# ENC1: 4*8*9 + 8 = 288+8 = 296
+# BN: 8*8*1 + 8 = 64+8 = 72
+# DEC1: 16*4*9 + 4 = 576+4 = 580
+# DEC0: 8*4*9 + 4 = 288+4 = 292
+# Running sum: 724+296 = 1020, +72 = 1092, +580 = 1672, +292 = 1964.
+# Within a 3x3 layer the conv weight index is o * IN * 9 + i * 9 + ki
+# (o < OUT, i < IN); the OUT biases follow the conv weights.
+# NOTE: HOWTO.md states a total of 2064, which is 100 more than the formulas
+# above give; this is possibly a typo in the doc. This script uses the value
+# derived from the formulas: 1964.
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def get_w(w_f32, base, idx):
+ """Read one f16-precision weight. Matches WGSL get_w()."""
+ return float(w_f32[base + idx])
+
+
+# ---------------------------------------------------------------------------
+# Layer forward passes — each matches the corresponding WGSL compute shader
+# ---------------------------------------------------------------------------
+
+def enc0_forward(feat0, feat1, w, gamma, beta):
+ """
+ Conv(20->4, 3x3, zero-pad) + FiLM + ReLU → rgba16float (f16 stored).
+ feat0: (H, W, 8) f32 — channels from unpack2x16float(feat_tex0)
+ feat1: (H, W, 12) f32 — channels from unpack4x8unorm(feat_tex1)
+ w: flat weight array; this layer is read starting at ENC0_OFFSET
+ gamma, beta: (ENC0_OUT,) f32 — FiLM params
+ Returns: (H, W, 4) f32 — f16 precision (rgba16float texture boundary)
+ """
+ H, W = feat0.shape[:2]
+ wo = ENC0_OFFSET
+ feat = np.concatenate([feat0, feat1], axis=2) # (H, W, 20)
+ fp = np.pad(feat, ((1, 1), (1, 1), (0, 0)), mode='constant') # zero-pad
+
+ out = np.zeros((H, W, ENC0_OUT), dtype=np.float32)
+ for o in range(ENC0_OUT):
+ # Biases are stored after all OUT*IN*9 conv weights of the layer.
+ bias = get_w(w, wo, ENC0_OUT * ENC0_IN * 9 + o)
+ s = np.full((H, W), bias, dtype=np.float32)
+ for i in range(ENC0_IN):
+ for ky in range(3):
+ for kx in range(3):
+ wv = get_w(w, wo, o * ENC0_IN * 9 + i * 9 + ky * 3 + kx)
+ # Shifted window of the padded input: tap (ky, kx) for all pixels at once.
+ s += wv * fp[ky:ky+H, kx:kx+W, i]
+ # FiLM (gamma * s + beta) followed by ReLU.
+ out[:, :, o] = np.maximum(0.0, gamma[o] * s + beta[o])
+
+ return np.float16(out).astype(np.float32) # rgba16float texture boundary
+
+
+def enc1_forward(enc0, w, gamma_lo, gamma_hi, beta_lo, beta_hi):
+ """
+ AvgPool2x2(enc0, clamp-border) + Conv(4->8, 3x3, zero-pad) + FiLM + ReLU
+ → rgba32uint (pack2x16float, f16 precision, half-res).
+ enc0: (H, W, 4) f32 — rgba16float precision
+ w: flat weight array; this layer is read starting at ENC1_OFFSET
+ gamma_lo, gamma_hi / beta_lo, beta_hi: concatenated (lo first) into the
+ per-output-channel FiLM vectors
+ Returns: (H // 2, W // 2, 8) f32 — rounded through f16
+ """
+ H, W = enc0.shape[:2]
+ hH, hW = H // 2, W // 2
+ wo = ENC1_OFFSET
+
+ # AvgPool2x2 with clamp at borders (matches load_enc0_avg in WGSL)
+ avg = np.zeros((hH, hW, ENC1_IN), dtype=np.float32)
+ for hy in range(hH):
+ for hx in range(hW):
+ s = np.zeros(ENC1_IN, dtype=np.float32)
+ for dy in range(2):
+ for dx in range(2):
+ # min() clamps the source coordinate to the last row/column.
+ fy = min(hy * 2 + dy, H - 1)
+ fx = min(hx * 2 + dx, W - 1)
+ s += enc0[fy, fx, :]
+ avg[hy, hx, :] = s * 0.25
+
+ # 3x3 conv with zero-padding at half-res borders
+ ap = np.pad(avg, ((1, 1), (1, 1), (0, 0)), mode='constant')
+ gamma = np.concatenate([gamma_lo, gamma_hi])
+ beta = np.concatenate([beta_lo, beta_hi])
+
+ out = np.zeros((hH, hW, ENC1_OUT), dtype=np.float32)
+ for o in range(ENC1_OUT):
+ # Biases are stored after all OUT*IN*9 conv weights of the layer.
+ bias = get_w(w, wo, ENC1_OUT * ENC1_IN * 9 + o)
+ s = np.full((hH, hW), bias, dtype=np.float32)
+ for i in range(ENC1_IN):
+ for ky in range(3):
+ for kx in range(3):
+ wv = get_w(w, wo, o * ENC1_IN * 9 + i * 9 + ky * 3 + kx)
+ s += wv * ap[ky:ky+hH, kx:kx+hW, i]
+ # FiLM (gamma * s + beta) followed by ReLU.
+ out[:, :, o] = np.maximum(0.0, gamma[o] * s + beta[o])
+
+ return np.float16(out).astype(np.float32) # pack2x16float boundary
+
+
+def bottleneck_forward(enc1, w):
+ """
+ AvgPool2x2(enc1, clamp-border) + Conv(8->8, 1x1) + ReLU
+ → rgba32uint (f16, quarter-res). No FiLM.
+ enc1: (hH, hW, 8) f32 — half-res
+ w: flat weight array; this layer is read starting at BN_OFFSET
+ Returns: (hH // 2, hW // 2, 8) f32 — rounded through f16
+ """
+ hH, hW = enc1.shape[:2]
+ qH, qW = hH // 2, hW // 2
+ wo = BN_OFFSET
+
+ # AvgPool2x2 with clamp (matches load_enc1_avg in WGSL)
+ avg = np.zeros((qH, qW, BN_IN), dtype=np.float32)
+ for qy in range(qH):
+ for qx in range(qW):
+ s = np.zeros(BN_IN, dtype=np.float32)
+ for dy in range(2):
+ for dx in range(2):
+ # min() clamps the source coordinate to the last row/column.
+ hy = min(qy * 2 + dy, hH - 1)
+ hx = min(qx * 2 + dx, hW - 1)
+ s += enc1[hy, hx, :]
+ avg[qy, qx, :] = s * 0.25
+
+ # 1x1 conv (no spatial loop, just channel dot-product)
+ out = np.zeros((qH, qW, BN_OUT), dtype=np.float32)
+ for o in range(BN_OUT):
+ # Biases are stored after the OUT*IN 1x1 weights.
+ bias = get_w(w, wo, BN_OUT * BN_IN + o)
+ s = np.full((qH, qW), bias, dtype=np.float32)
+ for i in range(BN_IN):
+ wv = get_w(w, wo, o * BN_IN + i)
+ s += wv * avg[:, :, i]
+ # ReLU only; this layer takes no gamma/beta.
+ out[:, :, o] = np.maximum(0.0, s)
+
+ return np.float16(out).astype(np.float32) # pack2x16float boundary
+
+
+def dec1_forward(bn, enc1, w, gamma, beta):
+ """
+ NearestUp2x(bn) + cat(enc1_skip) → Conv(16->4, 3x3, zero-pad) + FiLM + ReLU
+ → rgba16float (half-res).
+ bn: (qH, qW, 8) f32 — quarter-res bottleneck
+ enc1: (hH, hW, 8) f32 — half-res skip connection
+ w: flat weight array; this layer is read starting at DEC1_OFFSET
+ gamma, beta: indexed per output channel (DEC1_OUT entries used)
+ Returns: (hH, hW, 4) f32 — rounded through f16
+ """
+ hH, hW = enc1.shape[:2]
+ qH, qW = bn.shape[:2]
+ wo = DEC1_OFFSET
+
+ # Build 16-channel input: [nearest_up(bn), enc1_skip], zero-padded for 3x3
+ # load_dec1_concat: if OOB → zeros; otherwise nearest_up + enc1
+ fp = np.zeros((hH + 2, hW + 2, DEC1_IN), dtype=np.float32)
+ for hy in range(hH):
+ for hx in range(hW):
+ # Nearest-neighbour source texel, clamped to the last bn row/column.
+ qy = min(hy // 2, qH - 1)
+ qx = min(hx // 2, qW - 1)
+ # Channel order: 8 upsampled bottleneck channels, then 8 skip channels.
+ fp[hy + 1, hx + 1, :] = np.concatenate([bn[qy, qx, :], enc1[hy, hx, :]])
+
+ out = np.zeros((hH, hW, DEC1_OUT), dtype=np.float32)
+ for o in range(DEC1_OUT):
+ # Biases are stored after all OUT*IN*9 conv weights of the layer.
+ bias = get_w(w, wo, DEC1_OUT * DEC1_IN * 9 + o)
+ s = np.full((hH, hW), bias, dtype=np.float32)
+ for i in range(DEC1_IN):
+ for ky in range(3):
+ for kx in range(3):
+ wv = get_w(w, wo, o * DEC1_IN * 9 + i * 9 + ky * 3 + kx)
+ s += wv * fp[ky:ky+hH, kx:kx+hW, i]
+ # FiLM (gamma * s + beta) followed by ReLU.
+ out[:, :, o] = np.maximum(0.0, gamma[o] * s + beta[o])
+
+ return np.float16(out).astype(np.float32) # rgba16float boundary
+
+
+def dec0_forward(dec1, enc0, w, gamma, beta):
+ """
+ NearestUp2x(dec1) + cat(enc0_skip) → Conv(8->4, 3x3, zero-pad) + FiLM + ReLU + sigmoid
+ → rgba16float (full-res, final output).
+ dec1: (hH, hW, 4) f32 — half-res
+ enc0: (H, W, 4) f32 — full-res enc0 skip
+ w: flat weight array; this layer is read starting at DEC0_OFFSET
+ gamma, beta: indexed per output channel (DEC0_OUT entries used)
+ Returns: (H, W, 4) f32 — rounded through f16. Because sigmoid is applied
+ after ReLU, every value is >= 0.5 (before f16 rounding).
+ """
+ H, W = enc0.shape[:2]
+ hH, hW = dec1.shape[:2]
+ wo = DEC0_OFFSET
+
+ # Build 8-channel input: [nearest_up(dec1), enc0_skip], zero-padded
+ fp = np.zeros((H + 2, W + 2, DEC0_IN), dtype=np.float32)
+ for y in range(H):
+ for x in range(W):
+ # Nearest-neighbour source texel, clamped to the last dec1 row/column.
+ hy = min(y // 2, hH - 1)
+ hx = min(x // 2, hW - 1)
+ fp[y + 1, x + 1, :] = np.concatenate([dec1[hy, hx, :], enc0[y, x, :]])
+
+ out = np.zeros((H, W, DEC0_OUT), dtype=np.float32)
+ for o in range(DEC0_OUT):
+ # Biases are stored after all OUT*IN*9 conv weights of the layer.
+ bias = get_w(w, wo, DEC0_OUT * DEC0_IN * 9 + o)
+ s = np.full((H, W), bias, dtype=np.float32)
+ for i in range(DEC0_IN):
+ for ky in range(3):
+ for kx in range(3):
+ wv = get_w(w, wo, o * DEC0_IN * 9 + i * 9 + ky * 3 + kx)
+ s += wv * fp[ky:ky+H, kx:kx+W, i]
+ # FiLM + ReLU + sigmoid (matches WGSL dec0 shader)
+ v = np.maximum(0.0, gamma[o] * s + beta[o])
+ # NOTE: .astype(np.float32) binds to the denominator (1.0 + exp(-v)), not to
+ # the quotient: exp() is evaluated in float64 and the denominator is rounded
+ # to f32 before the division.
+ out[:, :, o] = 1.0 / (1.0 + np.exp(-v.astype(np.float64))).astype(np.float32)
+
+ return np.float16(out).astype(np.float32) # rgba16float boundary
+
+
+def forward_pass(feat0, feat1, w_f32, film):
+ """Full U-Net forward pass. film is a dict of gamma/beta arrays."""
+ enc0 = enc0_forward(feat0, feat1, w_f32,
+ film['enc0_gamma'], film['enc0_beta'])
+ enc1 = enc1_forward(enc0, w_f32,
+ film['enc1_gamma_lo'], film['enc1_gamma_hi'],
+ film['enc1_beta_lo'], film['enc1_beta_hi'])
+ bn = bottleneck_forward(enc1, w_f32)
+ dc1 = dec1_forward(bn, enc1, w_f32, film['dec1_gamma'], film['dec1_beta'])
+ dc0 = dec0_forward(dc1, enc0, w_f32, film['dec0_gamma'], film['dec0_beta'])
+ return dc0
+
+
+def identity_film():
+ return {
+ 'enc0_gamma': np.ones(ENC0_OUT, dtype=np.float32),
+ 'enc0_beta': np.zeros(ENC0_OUT, dtype=np.float32),
+ 'enc1_gamma_lo': np.ones(4, dtype=np.float32),
+ 'enc1_gamma_hi': np.ones(4, dtype=np.float32),
+ 'enc1_beta_lo': np.zeros(4, dtype=np.float32),
+ 'enc1_beta_hi': np.zeros(4, dtype=np.float32),
+ 'dec1_gamma': np.ones(DEC1_OUT, dtype=np.float32),
+ 'dec1_beta': np.zeros(DEC1_OUT, dtype=np.float32),
+ 'dec0_gamma': np.ones(DEC0_OUT, dtype=np.float32),
+ 'dec0_beta': np.zeros(DEC0_OUT, dtype=np.float32),
+ }
+
+
+# ---------------------------------------------------------------------------
+# Self-test: zero weights → output must be exactly 0.5
+# ---------------------------------------------------------------------------
+
+def test_zero_weights():
+ H, W = 8, 8
+ w = np.zeros(TOTAL_F16, dtype=np.float32)
+ feat0 = np.zeros((H, W, 8), dtype=np.float32)
+ feat1 = np.zeros((H, W, 12), dtype=np.float32)
+ out = forward_pass(feat0, feat1, w, identity_film())
+ max_err = float(np.max(np.abs(out - 0.5)))
+ ok = max_err < 1e-5
+ print(f"[test_zero_weights] max_err={max_err:.2e} {'OK' if ok else 'FAIL'}",
+ file=sys.stderr)
+ return ok
+
+
+# ---------------------------------------------------------------------------
+# Test vector generation and C header emission
+# ---------------------------------------------------------------------------
+
+def pack_feat0_rgba32uint(feat0_f32, H, W):
+ """Pack (H*W, 8) f16-precision values as H*W*4 u32 (pack2x16float layout)."""
+ f16 = np.float16(feat0_f32.reshape(H * W, 8))
+ u16 = f16.view(np.uint16) # (H*W, 8) u16
+ u32 = np.zeros((H * W, 4), dtype=np.uint32)
+ for j in range(4):
+ u32[:, j] = u16[:, j*2].astype(np.uint32) | (u16[:, j*2+1].astype(np.uint32) << 16)
+ return u32.flatten() # H*W*4 u32
+
+
+def pack_feat1_rgba32uint(feat1_u8, H, W):
+ """Pack (H*W, 12) u8 values as H*W*4 u32 (pack4x8unorm, 4th u32 = 0)."""
+ u8 = feat1_u8.reshape(H * W, 12)
+ u32 = np.zeros((H * W, 4), dtype=np.uint32)
+ for j in range(3):
+ for b in range(4):
+ u32[:, j] |= u8[:, j*4+b].astype(np.uint32) << (b * 8)
+ return u32.flatten() # H*W*4 u32
+
+
+def pack_weights_u32(w_f16):
+ """Pack flat f16 array as u32 pairs matching WGSL get_w() layout."""
+ # Pad to even count
+ if len(w_f16) % 2:
+ w_f16 = np.append(w_f16, np.float16(0))
+ u16 = w_f16.view(np.uint16)
+ u32 = u16[::2].astype(np.uint32) | (u16[1::2].astype(np.uint32) << 16)
+ return u32
+
+
+def generate_vectors(W=8, H=8, seed=42):
+ rng = np.random.default_rng(seed)
+
+ # Random f16 weights (small range to avoid NaN/Inf cascading)
+ w_f16 = rng.uniform(-0.3, 0.3, TOTAL_F16).astype(np.float16)
+ w_f32 = w_f16.astype(np.float32)
+
+ # Random feat0: 8 f16-precision channels
+ feat0_f16 = rng.uniform(0.0, 1.0, (H, W, 8)).astype(np.float16)
+ feat0 = feat0_f16.astype(np.float32)
+
+ # Random feat1: 12 u8 channels (unpacked as unorm [0,1])
+ feat1_u8 = rng.integers(0, 256, (H, W, 12), dtype=np.uint8)
+ feat1 = feat1_u8.astype(np.float32) / 255.0
+
+ film = identity_film()
+ enc0 = enc0_forward(feat0, feat1, w_f32,
+ film['enc0_gamma'], film['enc0_beta'])
+ enc1 = enc1_forward(enc0, w_f32,
+ film['enc1_gamma_lo'], film['enc1_gamma_hi'],
+ film['enc1_beta_lo'], film['enc1_beta_hi'])
+ bn = bottleneck_forward(enc1, w_f32)
+ dc1 = dec1_forward(bn, enc1, w_f32, film['dec1_gamma'], film['dec1_beta'])
+ out = dec0_forward(dc1, enc0, w_f32, film['dec0_gamma'], film['dec0_beta'])
+
+ feat0_u32 = pack_feat0_rgba32uint(feat0, H, W)
+ feat1_u32 = pack_feat1_rgba32uint(feat1_u8, H, W)
+ w_u32 = pack_weights_u32(w_f16)
+ enc0_u16 = np.float16(enc0.reshape(-1)).view(np.uint16)
+ # dec1 is half-res (hH x hW x 4); store as-is
+ dc1_u16 = np.float16(dc1.reshape(-1)).view(np.uint16)
+ out_u16 = np.float16(out.reshape(-1)).view(np.uint16) # raw f16 bits
+
+ return {
+ 'W': W, 'H': H, 'seed': seed,
+ 'feat0_u32': feat0_u32,
+ 'feat1_u32': feat1_u32,
+ 'w_u32': w_u32,
+ 'enc0_u16': enc0_u16,
+ 'dc1_u16': dc1_u16,
+ 'out_u16': out_u16,
+ 'out_f32': out.reshape(-1),
+ }
+
+
+def emit_c_header(v):
+ lines = []
+ lines.append("// Auto-generated by cnn_v3/training/gen_test_vectors.py")
+ lines.append(f"// Seed={v['seed']} W={v['W']} H={v['H']}")
+ lines.append("// DO NOT EDIT — regenerate with gen_test_vectors.py --header")
+ lines.append("#pragma once")
+ lines.append("#include <cstdint>")
+ lines.append("")
+ lines.append(f"static const int kCnnV3TestW = {v['W']};")
+ lines.append(f"static const int kCnnV3TestH = {v['H']};")
+ lines.append("")
+
+ def array_u32(name, data):
+ lines.append(f"// {len(data)} u32 values")
+ lines.append(f"static const uint32_t {name}[{len(data)}] = {{")
+ row = []
+ for i, x in enumerate(data):
+ row.append(f"0x{int(x):08x}u")
+ if len(row) == 8 or i == len(data) - 1:
+ lines.append(" " + ", ".join(row) + ",")
+ row = []
+ lines.append("};")
+ lines.append("")
+
+ def array_u16(name, data):
+ lines.append(f"// {len(data)} uint16 values (raw f16 bits)")
+ lines.append(f"static const uint16_t {name}[{len(data)}] = {{")
+ row = []
+ for i, x in enumerate(data):
+ row.append(f"0x{int(x):04x}u")
+ if len(row) == 8 or i == len(data) - 1:
+ lines.append(" " + ", ".join(row) + ",")
+ row = []
+ lines.append("};")
+ lines.append("")
+
+ array_u32("kCnnV3TestFeat0U32", v['feat0_u32'])
+ array_u32("kCnnV3TestFeat1U32", v['feat1_u32'])
+ array_u32("kCnnV3TestWeightsU32", v['w_u32'])
+ array_u16("kCnnV3ExpectedEnc0U16", v['enc0_u16'])
+ lines.append(f"// kCnnV3Dec1HW = (W/2) x (H/2) = {v['W']//2} x {v['H']//2}")
+ array_u16("kCnnV3ExpectedDec1U16", v['dc1_u16'])
+ array_u16("kCnnV3ExpectedOutputU16", v['out_u16'])
+ return "\n".join(lines)
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main():
+ parser = argparse.ArgumentParser(description="CNN v3 parity test vector generator")
+ parser.add_argument('--header', action='store_true',
+ help='Emit C header to stdout')
+ parser.add_argument('--W', type=int, default=8)
+ parser.add_argument('--H', type=int, default=8)
+ parser.add_argument('--seed', type=int, default=42)
+ args = parser.parse_args()
+
+ # Send self-test output to stderr so --header stdout stays clean
+ import io
+ log = sys.stderr if args.header else sys.stdout
+
+ ok = test_zero_weights()
+ if not ok:
+ sys.exit(1)
+
+ if args.header:
+ v = generate_vectors(args.W, args.H, args.seed)
+ print(emit_c_header(v)) # C header → stdout only
+ print("All checks passed.", file=log)
+ else:
+ v = generate_vectors(args.W, args.H, args.seed)
+ out = v['out_f32']
+ print(f"[gen_test_vectors] W={args.W} H={args.H} seed={args.seed}")
+ print(f" output range: [{float(out.min()):.4f}, {float(out.max()):.4f}]")
+ print(f" output mean: {float(out.mean()):.4f}")
+ print(" Run with --header to emit C header for C++ parity test.")
+ print("All checks passed.")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/doc/COMPLETED.md b/doc/COMPLETED.md
index 6868484..7b925f0 100644
--- a/doc/COMPLETED.md
+++ b/doc/COMPLETED.md
@@ -36,6 +36,8 @@ Completed task archive. See `doc/archive/` for detailed historical documents.
## March 2026
+- [x] **CNN v3 Phase 5: Parity validation** — `test_cnn_v3_parity.cc` (2 tests: zero_weights, random_weights). Root cause: intermediate nodes declared at full res instead of W/2, W/4. Fix: `NodeRegistry::default_width()/default_height()` getters + fractional resolution in `declare_nodes()`. Final max_err=4.88e-4 ✓. 36/36 tests.
+
- [x] **CNN v3 Phase 4: C++ CNNv3Effect + FiLM uniform upload** — `cnn_v3/src/cnn_v3_effect.{h,cc}`. 5 compute passes (enc0→enc1→bottleneck→dec1→dec0), shared f16 weights buffer, per-pass uniform buffers, `set_film_params()` API. Key fix: WGSL `vec3u` has align=16, so `CnnV3Params4ch`=64B and `CnnV3ParamsEnc1`=96B (not 48/80). Weight offsets as explicit formulas. FiLM γ/β identity defaults; real values await `train_cnn_v3.py`. 35/35 tests.
- [x] **NTSC post-process effect** — Fisheye distortion + NTSC scan-line simulation as `WgslEffect` thin wrappers. Common logic in `render/ntsc_common` snippet (`sample_ntsc_signal` hook). Two variants: `ntsc_rgb.wgsl` (RGB→YIQ internally, `Ntsc`) and `ntsc_yiq.wgsl` (YIQ passthrough, `NtscYiq`, for RotatingCube output). Files: `src/effects/ntsc_rgb.wgsl`, `ntsc_yiq.wgsl`, `src/shaders/render/ntsc_common.wgsl`, `ntsc_effect.h`. Tests: 36/36.
diff --git a/src/gpu/sequence.h b/src/gpu/sequence.h
index 4592082..04482fb 100644
--- a/src/gpu/sequence.h
+++ b/src/gpu/sequence.h
@@ -71,6 +71,9 @@ class NodeRegistry {
void set_external_view(const std::string& name, WGPUTextureView view);
+ int default_width() const { return default_width_; }
+ int default_height() const { return default_height_; }
+
private:
WGPUDevice device_;
int default_width_;
diff --git a/src/tests/gpu/test_cnn_v3_parity.cc b/src/tests/gpu/test_cnn_v3_parity.cc
new file mode 100644
index 0000000..608decb
--- /dev/null
+++ b/src/tests/gpu/test_cnn_v3_parity.cc
@@ -0,0 +1,370 @@
+// CNN v3 parity test: validates WGSL shaders against Python reference.
+// Two checks:
+// 1. Zero-weight test (deterministic): output must be sigmoid(0) = 0.5
+// 2. Random-weight test: output must match Python-generated test vectors
+// (within 1/255 per pixel)
+
+#include "../common/webgpu_test_fixture.h"
+#include "cnn_v3/src/cnn_v3_effect.h"
+#include "gpu/sequence.h"
+#include "../../cnn_v3/test_vectors.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <vector>
+
+// ---------------------------------------------------------------------------
+// fp16 decode (matches GPU read)
+// ---------------------------------------------------------------------------
+
+// Decodes raw IEEE 754 binary16 bits into a float.
+// Handles signed zero, subnormals, normal numbers and Inf/NaN. Every half
+// value is exactly representable as a float, so the conversion is lossless.
+static float fp16_bits_to_f32(uint16_t h) {
+  const uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
+  const uint32_t exp = (h & 0x7C00u) >> 10;
+  uint32_t mant = (h & 0x03FFu);
+  uint32_t bits;
+  if (exp == 0) {
+    if (mant == 0) {
+      bits = sign;  // ±0
+    } else {
+      // Subnormal half: value = mant * 2^-24. Shift the mantissa up until its
+      // implicit leading 1 appears (bit 10), lowering the exponent to match.
+      uint32_t e = 113;  // float exponent field for 2^-14
+      while ((mant & 0x0400u) == 0) {
+        mant <<= 1;
+        --e;
+      }
+      bits = sign | (e << 23) | ((mant & 0x03FFu) << 13);
+    }
+  } else if (exp == 31) {
+    bits = sign | 0x7F800000u | (mant << 13);  // Inf / NaN (payload kept)
+  } else {
+    bits = sign | ((exp + 112) << 23) | (mant << 13);  // rebias 15 → 127
+  }
+  float r;
+  __builtin_memcpy(&r, &bits, 4);
+  return r;
+}
+
+// ---------------------------------------------------------------------------
+// Raw RGBA16Float readback → flat array of f32 (one per channel per pixel)
+// ---------------------------------------------------------------------------
+
+// Completion record written by the wgpuBufferMapAsync callback.
+struct MapState { bool done = false; WGPUMapAsyncStatus status = {}; };
+
+// Copies a W x H region of an RGBA16Float texture into a staging buffer, maps
+// it and decodes it to floats laid out as [(y * W + x) * 4 + channel].
+// Returns W*H*4 values. On a failed readback the values are all zero and a
+// diagnostic is written to stderr, so a GPU/readback problem is not mistaken
+// for a numerical mismatch. A null texture or non-positive size yields zeros
+// (or an empty vector) without touching the GPU.
+static std::vector<float> readback_rgba16float(WGPUDevice device,
+                                               WGPUQueue queue,
+                                               WGPUTexture tex,
+                                               int W, int H) {
+  if (W <= 0 || H <= 0) {
+    fprintf(stderr, "readback_rgba16float: invalid size %dx%d\n", W, H);
+    return {};
+  }
+  std::vector<float> result((size_t)W * (size_t)H * 4, 0.0f);
+  if (!tex) {
+    fprintf(stderr, "readback_rgba16float: null texture, returning zeros\n");
+    return result;
+  }
+
+  const uint32_t bytes_per_px = 8;  // 4 × f16
+  const uint32_t unaligned_bpr = (uint32_t)W * bytes_per_px;
+  // Texture-to-buffer copies require bytesPerRow to be a multiple of 256.
+  const uint32_t aligned_bpr = ((unaligned_bpr + 255u) / 256u) * 256u;
+  const size_t buf_size = aligned_bpr * (size_t)H;
+
+  WGPUBufferDescriptor bd = {};
+  bd.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead;
+  bd.size = buf_size;
+  WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &bd);
+
+  // Record and submit the texture → staging copy, then wait for the GPU.
+  WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(device, nullptr);
+  WGPUTexelCopyTextureInfo src = {};
+  src.texture = tex;
+  WGPUTexelCopyBufferInfo dst = {};
+  dst.buffer = staging;
+  dst.layout.bytesPerRow = aligned_bpr;
+  dst.layout.rowsPerImage = (uint32_t)H;
+  WGPUExtent3D extent = { (uint32_t)W, (uint32_t)H, 1 };
+  wgpuCommandEncoderCopyTextureToBuffer(enc, &src, &dst, &extent);
+  WGPUCommandBuffer cmds = wgpuCommandEncoderFinish(enc, nullptr);
+  wgpuQueueSubmit(queue, 1, &cmds);
+  wgpuCommandBufferRelease(cmds);
+  wgpuCommandEncoderRelease(enc);
+  wgpuDevicePoll(device, true, nullptr);
+
+  // Request the mapping and poll until the callback has fired (bounded).
+  MapState ms = {};
+  WGPUBufferMapCallbackInfo mi = {};
+  mi.mode = WGPUCallbackMode_AllowProcessEvents;
+  mi.callback = [](WGPUMapAsyncStatus s, WGPUStringView, void* u, void*) {
+    auto* st = (MapState*)u;
+    st->status = s; st->done = true;
+  };
+  mi.userdata1 = &ms;
+  wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buf_size, mi);
+  for (int i = 0; i < 100 && !ms.done; ++i)
+    wgpuDevicePoll(device, true, nullptr);
+
+  const uint8_t* mapped = nullptr;
+  if (ms.done && ms.status == WGPUMapAsyncStatus_Success) {
+    mapped = (const uint8_t*)wgpuBufferGetConstMappedRange(staging, 0, buf_size);
+  }
+  if (mapped) {
+    for (int y = 0; y < H; ++y) {
+      // Rows in the staging buffer are aligned_bpr apart, not W*8.
+      const uint16_t* row =
+          (const uint16_t*)(mapped + (size_t)y * aligned_bpr);
+      for (int x = 0; x < W; ++x) {
+        for (int c = 0; c < 4; ++c) {
+          result[((size_t)y * W + x) * 4 + c] =
+              fp16_bits_to_f32(row[x * 4 + c]);
+        }
+      }
+    }
+  } else {
+    fprintf(stderr,
+            "readback_rgba16float: buffer map failed (done=%d status=%d), "
+            "returning zeros\n",
+            (int)ms.done, (int)ms.status);
+  }
+  wgpuBufferUnmap(staging);
+  wgpuBufferRelease(staging);
+  return result;
+}
+
+// ---------------------------------------------------------------------------
+// Helper: create rgba32uint texture with TextureBinding | CopyDst
+// ---------------------------------------------------------------------------
+
+// Creates a W x H single-mip, single-sample 2D RGBA32Uint texture that can be
+// bound as a texture and written via copy/upload.
+static WGPUTexture make_feat_tex(WGPUDevice dev, int W, int H) {
+  const WGPUExtent3D extent = { (uint32_t)W, (uint32_t)H, 1 };
+
+  WGPUTextureDescriptor desc = {};
+  desc.dimension = WGPUTextureDimension_2D;
+  desc.size = extent;
+  desc.format = WGPUTextureFormat_RGBA32Uint;
+  desc.mipLevelCount = 1;
+  desc.sampleCount = 1;
+  desc.usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst;
+  return wgpuDeviceCreateTexture(dev, &desc);
+}
+
+// Creates a W x H single-mip, single-sample 2D RGBA16Float texture usable as
+// a storage binding and as a copy source (for readback).
+static WGPUTexture make_output_tex(WGPUDevice dev, int W, int H) {
+  const WGPUExtent3D extent = { (uint32_t)W, (uint32_t)H, 1 };
+
+  WGPUTextureDescriptor desc = {};
+  desc.dimension = WGPUTextureDimension_2D;
+  desc.size = extent;
+  desc.format = WGPUTextureFormat_RGBA16Float;
+  desc.mipLevelCount = 1;
+  desc.sampleCount = 1;
+  desc.usage = WGPUTextureUsage_StorageBinding | WGPUTextureUsage_CopySrc;
+  return wgpuDeviceCreateTexture(dev, &desc);
+}
+
+// Creates a 2D view of `tex` in format `fmt` covering one mip level and one
+// array layer.
+static WGPUTextureView make_view(WGPUTexture tex, WGPUTextureFormat fmt) {
+  WGPUTextureViewDescriptor view_desc = {};
+  view_desc.dimension = WGPUTextureViewDimension_2D;
+  view_desc.format = fmt;
+  view_desc.arrayLayerCount = 1;
+  view_desc.mipLevelCount = 1;
+  return wgpuTextureCreateView(tex, &view_desc);
+}
+
+// ---------------------------------------------------------------------------
+// Run one CNN v3 forward pass and return output pixels
+// ---------------------------------------------------------------------------
+
+// Uploads the two packed feature textures, renders CNNv3Effect once through a
+// NodeRegistry sized W x H, and returns the "cnn3_out" texture as W*H*4 floats.
+//   feat0_u32 / feat1_u32: W*H*4 u32 each, uploaded row by row (16 bytes/pixel).
+//   weights_u32 / weights_bytes: passed to upload_weights(); when weights_u32
+//     is null the upload is skipped and the effect keeps whatever weights it
+//     was constructed with.
+//   enc0_out / dec1_out: optional; when non-null they receive readbacks of the
+//     registry textures "cnn3_out_enc0" (W x H) and "cnn3_out_dec1"
+//     (W/2 x H/2).
+static std::vector<float> run_cnn_v3(WebGPUTestFixture& fixture,
+ int W, int H,
+ const uint32_t* feat0_u32, // W*H*4
+ const uint32_t* feat1_u32, // W*H*4
+ const uint32_t* weights_u32, // (TOTAL_F16+1)/2
+ uint32_t weights_bytes,
+ std::vector<float>* enc0_out = nullptr,
+ std::vector<float>* dec1_out = nullptr) {
+ GpuContext ctx = fixture.ctx();
+
+ // Create input textures manually (with CopyDst for upload)
+ WGPUTexture feat0_tex = make_feat_tex(ctx.device, W, H);
+ WGPUTexture feat1_tex = make_feat_tex(ctx.device, W, H);
+ WGPUTexture out_tex = make_output_tex(ctx.device, W, H);
+
+ WGPUTextureView feat0_view =
+ make_view(feat0_tex, WGPUTextureFormat_RGBA32Uint);
+ WGPUTextureView feat1_view =
+ make_view(feat1_tex, WGPUTextureFormat_RGBA32Uint);
+ WGPUTextureView out_view =
+ make_view(out_tex, WGPUTextureFormat_RGBA16Float);
+
+ // Upload feat texture data
+ auto upload_tex = [&](WGPUTexture tex, const uint32_t* data) {
+ WGPUTexelCopyTextureInfo dst_tex = {};
+ dst_tex.texture = tex;
+ WGPUTexelCopyBufferLayout layout = {};
+ layout.bytesPerRow = (uint32_t)(W * 16); // 4 u32 per pixel
+ layout.rowsPerImage = (uint32_t)H;
+ WGPUExtent3D ext = { (uint32_t)W, (uint32_t)H, 1 };
+ wgpuQueueWriteTexture(ctx.queue, &dst_tex, data,
+ (size_t)(W * H * 16), &layout, &ext);
+ };
+ upload_tex(feat0_tex, feat0_u32);
+ upload_tex(feat1_tex, feat1_u32);
+
+ // Wire into NodeRegistry via external views
+ NodeRegistry registry(ctx.device, W, H);
+ registry.set_external_view("feat0", feat0_view);
+ registry.set_external_view("feat1", feat1_view);
+ registry.set_external_view("cnn3_out", out_view);
+
+ CNNv3Effect effect(ctx, {"feat0", "feat1"}, {"cnn3_out"}, 0.0f, 1000.0f);
+ effect.declare_nodes(registry);
+
+ if (weights_u32) {
+ effect.upload_weights(ctx.queue, weights_u32, weights_bytes);
+ }
+
+ // Run 5 compute passes
+ WGPUCommandEncoder enc =
+ wgpuDeviceCreateCommandEncoder(ctx.device, nullptr);
+ UniformsSequenceParams params = {};
+ params.resolution = { (float)W, (float)H };
+ params.aspect_ratio = 1.0f;
+ effect.render(enc, params, registry);
+
+ WGPUCommandBuffer cmds = wgpuCommandEncoderFinish(enc, nullptr);
+ wgpuQueueSubmit(ctx.queue, 1, &cmds);
+ wgpuCommandBufferRelease(cmds);
+ wgpuCommandEncoderRelease(enc);
+ // Block until the submitted passes have finished before reading back.
+ wgpuDevicePoll(ctx.device, true, nullptr);
+
+ // Read back output
+ auto pixels = readback_rgba16float(ctx.device, ctx.queue, out_tex, W, H);
+
+ // Optional: read back intermediate layers
+ if (enc0_out) {
+ WGPUTexture enc0_tex = registry.get_texture("cnn3_out_enc0");
+ *enc0_out = readback_rgba16float(ctx.device, ctx.queue, enc0_tex, W, H);
+ }
+ if (dec1_out) {
+ WGPUTexture dec1_tex = registry.get_texture("cnn3_out_dec1");
+ // dec1 is rgba16float written at half-res (W/2, H/2) — read only valid region
+ *dec1_out = readback_rgba16float(ctx.device, ctx.queue, dec1_tex, W / 2, H / 2);
+ }
+
+ // Cleanup
+ // Only the textures/views created in this function are released here.
+ wgpuTextureViewRelease(feat0_view);
+ wgpuTextureViewRelease(feat1_view);
+ wgpuTextureViewRelease(out_view);
+ wgpuTextureRelease(feat0_tex);
+ wgpuTextureRelease(feat1_tex);
+ wgpuTextureRelease(out_tex);
+
+ return pixels;
+}
+
+extern void InitShaderComposer();
+
+// ---------------------------------------------------------------------------
+// Test 1: zero weights → sigmoid(ReLU(0)) = 0.5 for all pixels/channels
+// ---------------------------------------------------------------------------
+
+// Runs the network without uploading weights on all-zero features and checks
+// that every output channel equals sigmoid(0) = 0.5 within 1/255.
+// Returns 1 on pass (or when WebGPU is unavailable and the test is skipped),
+// 0 on failure. NaN output and a wrongly sized result both count as failures.
+static int test_zero_weights() {
+  fprintf(stdout, " [cnn_v3_parity] test_zero_weights...\n");
+
+  WebGPUTestFixture fixture;
+  if (!fixture.init()) {
+    fprintf(stdout, " ⚠ WebGPU unavailable — skip\n");
+    return 1;
+  }
+  InitShaderComposer();
+
+  const int W = 8, H = 8;
+  std::vector<uint32_t> feat0(W * H * 4, 0u);
+  std::vector<uint32_t> feat1(W * H * 4, 0u);
+
+  auto pixels = run_cnn_v3(fixture, W, H,
+                           feat0.data(), feat1.data(),
+                           nullptr, 0);  // null = zero weights (default)
+
+  // An empty or short result would otherwise pass vacuously.
+  if (pixels.size() != (size_t)(W * H * 4)) {
+    fprintf(stderr, " ✗ zero_weights: got %zu values, expected %d\n",
+            pixels.size(), W * H * 4);
+    return 0;
+  }
+
+  // Expected: sigmoid(0) = 0.5 exactly
+  const float expected = 0.5f;
+  const float tol = 1.0f / 255.0f;
+  float max_err = 0.0f;
+  for (float v : pixels) {
+    float err = fabsf(v - expected);
+    // fmaxf() ignores a NaN operand, so a NaN pixel must be turned into an
+    // explicit out-of-tolerance error or it would go unnoticed.
+    if (std::isnan(err)) err = INFINITY;
+    max_err = fmaxf(max_err, err);
+  }
+
+  if (max_err > tol) {
+    fprintf(stderr, " ✗ zero_weights: max_err=%.5f > %.5f\n", max_err, tol);
+    return 0;
+  }
+  fprintf(stdout, " ✓ zero_weights: max_err=%.2e OK\n", max_err);
+  return 1;
+}
+
+// ---------------------------------------------------------------------------
+// Test 2: random weights — compare to Python reference test vectors
+// ---------------------------------------------------------------------------
+
+// Compares the first `n` GPU values with reference f16 bit patterns.
+// Prints a ✓ line (ok_label) to stdout when the largest absolute error is
+// within `tol`, otherwise a ✗ line (fail_label) to stderr naming the worst
+// element. NaN differences are treated as infinite error so they cannot slip
+// through the comparisons. Returns true when the layer matches.
+static bool compare_with_reference(const char* fail_label,
+                                   const char* ok_label,
+                                   const std::vector<float>& gpu,
+                                   const uint16_t* ref_bits, int n,
+                                   float tol) {
+  if (gpu.size() < (size_t)n) {
+    fprintf(stderr, " ✗ %s: got %zu values, expected %d\n",
+            fail_label, gpu.size(), n);
+    return false;
+  }
+  float max_err = 0.0f;
+  int worst = -1;
+  for (int i = 0; i < n; ++i) {
+    float err = fabsf(gpu[i] - fp16_bits_to_f32(ref_bits[i]));
+    if (std::isnan(err)) err = INFINITY;  // `NaN > x` is always false
+    if (err > max_err) { max_err = err; worst = i; }
+  }
+  if (max_err > tol) {
+    const int px = worst / 4, ch = worst % 4;
+    fprintf(stderr, " ✗ %s: max_err=%.5f > %.5f at px=%d ch=%d"
+            " gpu=%.5f ref=%.5f\n",
+            fail_label, max_err, tol, px, ch,
+            gpu[worst], fp16_bits_to_f32(ref_bits[worst]));
+    return false;
+  }
+  fprintf(stdout, " ✓ %s: max_err=%.2e OK\n", ok_label, max_err);
+  return true;
+}
+
+// Runs the network on the generated test vectors and compares the enc0, dec1
+// and final outputs against the Python reference (1/255 tolerance each).
+// Returns 1 on pass (or when WebGPU is unavailable and the test is skipped),
+// 0 if ANY of the three layers mismatches. All three layers are always
+// evaluated so a single run reports where the divergence starts.
+static int test_random_weights() {
+  fprintf(stdout, " [cnn_v3_parity] test_random_weights (seed=42)...\n");
+
+  WebGPUTestFixture fixture;
+  if (!fixture.init()) {
+    fprintf(stdout, " ⚠ WebGPU unavailable — skip\n");
+    return 1;
+  }
+  InitShaderComposer();
+
+  const int W = kCnnV3TestW, H = kCnnV3TestH;
+  const uint32_t weights_bytes =
+      (uint32_t)sizeof(kCnnV3TestWeightsU32);
+
+  std::vector<float> enc0_pixels, dec1_pixels;
+  auto pixels = run_cnn_v3(fixture, W, H,
+                           kCnnV3TestFeat0U32, kCnnV3TestFeat1U32,
+                           kCnnV3TestWeightsU32, weights_bytes,
+                           &enc0_pixels, &dec1_pixels);
+
+  const float tol = 1.0f / 255.0f;
+  const int full_n = W * H * 4;
+  const int half_n = (W / 2) * (H / 2) * 4;  // dec1 is half-res
+
+  // Separate statements (no short-circuit) so every layer gets reported.
+  const bool enc0_ok = compare_with_reference(
+      "enc0 mismatch", "enc0", enc0_pixels, kCnnV3ExpectedEnc0U16, full_n, tol);
+  const bool dec1_ok = compare_with_reference(
+      "dec1 mismatch", "dec1", dec1_pixels, kCnnV3ExpectedDec1U16, half_n, tol);
+  const bool out_ok = compare_with_reference(
+      "random_weights", "random_weights", pixels, kCnnV3ExpectedOutputU16,
+      full_n, tol);
+
+  return (enc0_ok && dec1_ok && out_ok) ? 1 : 0;
+}
+
+// ---------------------------------------------------------------------------
+// Main
+// ---------------------------------------------------------------------------
+
+// Runs every parity test in order and reports the pass count.
+// Exit status is 0 only when all tests passed.
+int main() {
+  using TestFn = int (*)();
+  const TestFn tests[] = { test_zero_weights, test_random_weights };
+
+  int pass = 0, total = 0;
+  for (TestFn run : tests) {
+    ++total;
+    pass += run();
+  }
+
+  fprintf(stdout, "\nCNN v3 parity: %d/%d passed\n", pass, total);
+  return (pass == total) ? 0 : 1;
+}