diff options
| -rw-r--r-- | PROJECT_CONTEXT.md | 6 | ||||
| -rw-r--r-- | TODO.md | 3 | ||||
| -rw-r--r-- | cmake/DemoTests.cmake | 18 | ||||
| -rw-r--r-- | cnn_v3/src/cnn_v3_effect.cc | 20 | ||||
| -rw-r--r-- | cnn_v3/src/cnn_v3_effect.h | 4 | ||||
| -rw-r--r-- | cnn_v3/test_vectors.h | 293 | ||||
| -rw-r--r-- | cnn_v3/training/gen_test_vectors.py | 451 | ||||
| -rw-r--r-- | doc/COMPLETED.md | 2 | ||||
| -rw-r--r-- | src/gpu/sequence.h | 3 | ||||
| -rw-r--r-- | src/tests/gpu/test_cnn_v3_parity.cc | 370 |
10 files changed, 1160 insertions, 10 deletions
diff --git a/PROJECT_CONTEXT.md b/PROJECT_CONTEXT.md index 4767185..6219275 100644 --- a/PROJECT_CONTEXT.md +++ b/PROJECT_CONTEXT.md @@ -36,17 +36,17 @@ - **Audio:** Sample-accurate sync. Zero heap allocations per frame. Variable tempo. OLA-IDCT synthesis (v2 .spec): Hann analysis window, rectangular synthesis, 50% overlap, click-free. V1 (raw DCT-512) preserved for generated notes. .spec files regenerated as v2. - **Shaders:** Parameterized effects (UniformHelper, .seq syntax). Beat-synchronized animation support (`beat_time`, `beat_phase`). Modular WGSL composition with ShaderComposer. 27 shared common shaders (math, render, compute). Reusable snippets: `render/scratch_lines`, `render/ntsc_common` (NTSC signal processing, RGB and YIQ input variants via `sample_ntsc_signal` hook), `math/color` (YIQ/NTSC), `math/color_c64` (C64 palette, Bayer dither, border animation). - **3D:** Hybrid SDF/rasterization with BVH. Binary scene loader. Blender pipeline. -- **Effects:** CNN post-processing: CNNEffect (v1) and CNNv2Effect operational. CNN v2: sigmoid activation, storage buffer weights (~3.2 KB), 7D static features, dynamic layers. Training stable, convergence validated. **CNN v3 Phase 4 complete:** `CNNv3Effect` C++ class (5 compute passes, FiLM uniform upload, identity γ/β defaults). `set_film_params()` modulates all layers via beat/audio. WGSL params struct alignment fix (vec3u align=16 → 64/96-byte C++ mirrors). Registered in CMake, shaders.h/cc, demo_effects.h, tests. See `cnn_v3/docs/HOWTO.md`. +- **Effects:** CNN post-processing: CNNEffect (v1) and CNNv2Effect operational. CNN v2: sigmoid activation, storage buffer weights (~3.2 KB), 7D static features, dynamic layers. Training stable, convergence validated. **CNN v3 Phases 1–5 complete:** `CNNv3Effect` C++ class (5 compute passes, FiLM uniform upload, identity γ/β defaults). Parity validated: enc0 max_err=1.95e-3, dec1 max_err=1.95e-3, final max_err=4.88e-4 (all ≤1/255). Key fix: intermediate nodes declared at fractional resolutions (W/2, W/4) via `NodeRegistry::default_width()/default_height()`. See `cnn_v3/docs/HOWTO.md`. - **Tools:** CNN test tool operational. Texture readback utility functional. Timeline editor (web-based, beat-aligned, audio playback). - **Build:** Asset dependency tracking. Size measurement. Hot-reload (debug-only). WSL (Windows 10) supported: native Linux build and cross-compile to `.exe` via `mingw-w64`. - **Sequence:** DAG-based effect routing with explicit node system. Python compiler with topological sort and ping-pong optimization. 12 effects operational (Passthrough, Placeholder, GaussianBlur, Heptagon, Particles, RotatingCube, Hybrid3D, Flash, PeakMeter, Scene1, Scene2, Scratch). Effect times are absolute (seq_compiler adds sequence start offset). See `doc/SEQUENCE.md`. -- **Testing:** **35/35 passing**. +- **Testing:** **36/36 passing**. --- ## Next Up -**Active:** CNN v3 Phase 5 (parity validation), Spectral Brush Editor +**Active:** CNN v3 training (`train_cnn_v3.py`), Spectral Brush Editor **Ongoing:** Test infrastructure maintenance (35/35 passing) **Future:** Size optimization (64k target), 3D enhancements @@ -76,7 +76,8 @@ PyTorch / HTML WebGPU / C++ WebGPU. - Params alignment fix: WGSL `vec3u` align=16 → C++ structs 64/96 bytes - Weight offsets as explicit formulas (e.g. `20*4*9+4`) - FiLM γ/β: identity defaults; real values require trained MLP (see below) -5. Parity validation (test vectors, ≤1/255 per pixel) +5. ✅ Parity validation: test vectors + `test_cnn_v3_parity.cc`. max_err=4.88e-4 (≤1/255). + - Key fix: intermediate nodes at fractional resolutions (W/2, W/4) via `NodeRegistry::default_width()/default_height()` **FiLM MLP training** (blocks meaningful Phase 4 output): - Needs `cnn_v3/training/train_cnn_v3.py` — not yet written diff --git a/cmake/DemoTests.cmake b/cmake/DemoTests.cmake index 0b7fbb7..69b9195 100644 --- a/cmake/DemoTests.cmake +++ b/cmake/DemoTests.cmake @@ -222,6 +222,24 @@ if(NOT DEMO_STRIP_ALL AND DEMO_WORKSPACE STREQUAL "main") STB_IMAGE_WRITE_IMPLEMENTATION) endif() +# CNN v3 parity test (zero-weight + random-weight vs Python reference) +if(NOT DEMO_STRIP_ALL AND DEMO_WORKSPACE STREQUAL "main") + add_demo_test(test_cnn_v3_parity CnnV3ParityTest gpu + src/tests/gpu/test_cnn_v3_parity.cc + src/tests/common/webgpu_test_fixture.cc + ${PLATFORM_SOURCES} + ${GEN_DEMO_CC}) + + target_include_directories(test_cnn_v3_parity PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/cnn_v3/src) + + target_link_libraries(test_cnn_v3_parity PRIVATE + gpu util procedural ${DEMO_LIBS}) + + demo_add_asset_deps(test_cnn_v3_parity shaders) +endif() + # GPU Composite Texture Test (Phase 4) add_demo_test(test_gpu_composite GpuCompositeTest gpu src/tests/gpu/test_gpu_composite.cc diff --git a/cnn_v3/src/cnn_v3_effect.cc b/cnn_v3/src/cnn_v3_effect.cc index d13799c..92178f7 100644 --- a/cnn_v3/src/cnn_v3_effect.cc +++ b/cnn_v3/src/cnn_v3_effect.cc @@ -187,14 +187,17 @@ CNNv3Effect::CNNv3Effect(const GpuContext& ctx, // --------------------------------------------------------------------------- void CNNv3Effect::declare_nodes(NodeRegistry& registry) { + const int W = registry.default_width(); + const int H = registry.default_height(); + // enc0_tex: rgba16float full-res - registry.declare_node(node_enc0_, NodeType::GBUF_ALBEDO, -1, -1); - // enc1_tex: rgba32uint half-res - registry.declare_node(node_enc1_, NodeType::GBUF_RGBA32UINT, -1, -1); - // bottleneck_tex: rgba32uint quarter-res — declare at 1/4 resolution - registry.declare_node(node_bottleneck_, NodeType::GBUF_RGBA32UINT, -1, -1); + registry.declare_node(node_enc0_, NodeType::GBUF_ALBEDO, W, H); + // enc1_tex: rgba32uint half-res — shaders use textureDimensions() for bounds + registry.declare_node(node_enc1_, NodeType::GBUF_RGBA32UINT, W / 2, H / 2); + // bottleneck_tex: rgba32uint quarter-res + registry.declare_node(node_bottleneck_, NodeType::GBUF_RGBA32UINT, W / 4, H / 4); // dec1_tex: rgba16float half-res - registry.declare_node(node_dec1_, NodeType::GBUF_ALBEDO, -1, -1); + registry.declare_node(node_dec1_, NodeType::GBUF_ALBEDO, W / 2, H / 2); // output_tex: rgba16float full-res (the declared output_nodes_[0]) } @@ -202,6 +205,11 @@ void CNNv3Effect::declare_nodes(NodeRegistry& registry) { // set_film_params — simple linear mapping, no MLP yet // --------------------------------------------------------------------------- +void CNNv3Effect::upload_weights(WGPUQueue queue, const void* data, + uint32_t size_bytes) { + wgpuQueueWriteBuffer(queue, weights_buf_.buffer, 0, data, size_bytes); +} + void CNNv3Effect::set_film_params(const CNNv3FiLMParams& fp) { // Identity + audio/beat modulation. // Replace with FiLM MLP output once training is done. diff --git a/cnn_v3/src/cnn_v3_effect.h b/cnn_v3/src/cnn_v3_effect.h index c358990..36e2797 100644 --- a/cnn_v3/src/cnn_v3_effect.h +++ b/cnn_v3/src/cnn_v3_effect.h @@ -89,6 +89,10 @@ class CNNv3Effect : public Effect { // Update FiLM conditioning; call before render() each frame. void set_film_params(const CNNv3FiLMParams& fp); + // Upload packed-f16 weights (kWeightsBufBytes bytes of u32 pairs). + // Used for testing and inference from trained .bin files. + void upload_weights(WGPUQueue queue, const void* data, uint32_t size_bytes); + private: // Intermediate node names (prefixed from output[0]) std::string node_enc0_; diff --git a/cnn_v3/test_vectors.h b/cnn_v3/test_vectors.h new file mode 100644 index 0000000..6d1abc5 --- /dev/null +++ b/cnn_v3/test_vectors.h @@ -0,0 +1,293 @@ +// Auto-generated by cnn_v3/training/gen_test_vectors.py +// Seed=42 W=8 H=8 +// DO NOT EDIT — regenerate with gen_test_vectors.py --header +#pragma once +#include <cstdint> + +static const int kCnnV3TestW = 8; +static const int kCnnV3TestH = 8; + +// 256 u32 values +static const uint32_t kCnnV3TestFeat0U32[256] = { + 0x2ccd39ebu, 0x3acb39d7u, 0x3814378fu, 0x3bc134ffu, 0x35e739ddu, 0x33073198u, 0x3b8a376cu, 0x32e339e0u, + 0x360d3ae0u, 0x33bc3ad0u, 0x3b6a38f3u, 0x398b3420u, 0x30d23be6u, 0x39652da8u, 0x2c8f3570u, 0x3a08379bu, + 0x355c3490u, 0x38293ac4u, 0x37243abeu, 0x39353ba0u, 0x3b152f6bu, 0x308837d4u, 0x398030e3u, 0x34962a10u, + 0x370f3079u, 0x382d36a1u, 0x281a3479u, 0x35fb38eau, 0x2ef43936u, 0x33f2230eu, 0x364e374eu, 0x360c3a7bu, + 0x38c0383eu, 0x381f2597u, 0x36be3584u, 0x3a432e6bu, 0x25b33b8au, 0x3a1c38d0u, 0x3a4d348fu, 0x3b6f390fu, + 0x296c3bd9u, 0x3860371eu, 0x356130b2u, 0x24283be7u, 0x3abe373du, 0x37ad352fu, 0x37993bd3u, 0x2a9f3031u, + 0x34413b90u, 0x2dce3808u, 0x3b7136c7u, 0x3bc53805u, 0x38093424u, 0x372c3ae0u, 0x3ad83479u, 0x383f363du, + 0x31f83bd0u, 0x27f434d3u, 0x32683645u, 0x31cd3971u, 0x34373966u, 0x359535afu, 0x377739bcu, 0x3ad235c8u, + 0x32d83893u, 0x357b3b33u, 0x37ea28fdu, 0x33a22fefu, 0x302f39fau, 0x3b7f3a75u, 0x39af38dau, 0x3bf139b5u, + 0x31363577u, 0x38443827u, 0x38e831b1u, 0x3b6c233bu, 0x2910343cu, 0x33b02eeeu, 0x28333462u, 0x322d3478u, + 0x362a360fu, 0x353f356du, 0x26742dbeu, 0x3a0e3278u, 0x3b6e3bedu, 0x38413809u, 0x3a313509u, 0x3ac13a1eu, + 0x36f33b2au, 0x3a743a23u, 0x3b6f34efu, 0x3bf42e0au, 0x2df83a29u, 0x28603940u, 0x3a653a29u, 0x3adb38d2u, + 0x346a2e44u, 0x296f36a0u, 0x343e372cu, 0x36cd3649u, 0x34533b09u, 0x36d13b26u, 0x3805353fu, 0x341e36afu, + 0x30dc3805u, 0x388735a2u, 0x3a97369du, 0x3bc2341cu, 0x3bbe3a47u, 0x308c3ab5u, 0x31703836u, 0x38ac3a8cu, + 0x3b703437u, 0x38832f5fu, 0x2b8839c5u, 0x3a8738c8u, 0x38192c52u, 0x394e3423u, 0x3b7f2f98u, 0x31f43b28u, + 0x38b3352cu, 0x371539bfu, 0x2eaa3100u, 0x37493c00u, 0x37b83afbu, 0x2d9e3b61u, 0x3b702f4cu, 0x35093b94u, + 0x373d35afu, 0x321536a9u, 0x340e3b30u, 0x2c4c39a4u, 0x393b28f6u, 0x393e356du, 0x3b992e04u, 0x3b0339fdu, + 0x351f305eu, 0x384c35e5u, 0x2bc334c0u, 0x341335e7u, 0x324d362du, 0x39043431u, 0x35873636u, 0x3a2d3845u, + 0x38b33610u, 0x382d3bbbu, 0x3a593b47u, 0x36de2b84u, 0x3be53996u, 0x2df03756u, 0x300d387fu, 0x38103a03u, + 0x3af439cau, 0x38e63908u, 0x3abd3a09u, 0x28aa3af4u, 0x32ec3873u, 0x39303ae2u, 0x320536b9u, 0x39a1356du, + 0x2dfd328au, 0x3a1d3b1bu, 0x34ad3265u, 0x39aa3bc7u, 0x34ec38e2u, 0x290f34c9u, 0x298739d4u, 0x39d61cf9u, + 0x3a0d3b97u, 0x37c7378cu, 0x353236fau, 0x36e6382cu, 0x3b2f38c9u, 0x2d0a3bf6u, 0x31c83628u, 0x349935a2u, + 0x3a1d3196u, 0x3b5b37f1u, 0x2c49282cu, 0x2d233674u, 0x3be33434u, 0x325732b0u, 0x37f83897u, 0x360738a5u, + 0x306f3a9du, 0x398536dbu, 0x35ea3af2u, 0x2c6d388bu, 0x2c6d3173u, 0x349c39d3u, 0x2c4039cau, 0x3aaf3ae6u, + 0x26152db1u, 0x3ad42b34u, 0x38633383u, 0x3a5d36d2u, 0x380137d3u, 0x30ce3beau, 0x2aa03aa5u, 0x3b1737a4u, + 0x397b3952u, 0x36b23437u, 0x382c35deu, 0x353b3765u, 0x340334e3u, 0x30cc35d7u, 0x38d13afau, 0x398d3048u, + 0x339a3ac8u, 0x206930d2u, 0x3a192a0cu, 0x29bf3be6u, 0x2c9939fcu, 0x3a0c38bdu, 0x219935bfu, 0x3bee38c3u, + 0x3210341fu, 0x38712feeu, 0x3a5738c6u, 0x3b243a06u, 0x33ea3a72u, 0x34c23872u, 0x3b753547u, 0x3bcc3975u, + 0x384d36acu, 0x2ede37cbu, 0x38393393u, 0x3b742c50u, 0x32562fedu, 0x2e343a1fu, 0x39ce3b34u, 0x39892c64u, + 0x3a0f390eu, 0x39bf3aa0u, 0x352938b0u, 0x3ba83994u, 0x395138b3u, 0x3a0d36feu, 0x31223bfbu, 0x3851327au, + 0x389337b4u, 0x36782a48u, 0x38ae38aau, 0x39c33942u, 0x3a523922u, 0x384d3900u, 0x2e7a38d9u, 0x3838345fu, + 0x396f3afcu, 0x38bd2dc9u, 0x39df3318u, 0x38bf3a9fu, 0x356b38bbu, 0x3aea3724u, 0x382839c9u, 0x2a7335e4u, +}; + +// 256 u32 values +static const uint32_t kCnnV3TestFeat1U32[256] = { + 0xc863b415u, 0x249c220fu, 0x603452c6u, 0x00000000u, 0x316a194cu, 0x291db2cbu, 0x5f96105bu, 0x00000000u, + 0xeb343d39u, 0xf1b365e6u, 0x61b71b05u, 0x00000000u, 0x8151bb9eu, 0xfc56bec5u, 0x3c1e7c24u, 0x00000000u, + 0xf1d859a5u, 0x1b1270e5u, 0x39d19474u, 0x00000000u, 0x569b30dcu, 0x097e59b6u, 0xd0d3b912u, 0x00000000u, + 0xdafc8a80u, 0x6222c0d8u, 0xd61d6364u, 0x00000000u, 0xc5c2f0c4u, 0xcd28e9d7u, 0xcd7e12c4u, 0x00000000u, + 0x92cfbc01u, 0x1c5ebffdu, 0xec699bb5u, 0x00000000u, 0x9bd12023u, 0xe6b94175u, 0xf58751d1u, 0x00000000u, + 0x2fe9e259u, 0x66f28558u, 0x314748e3u, 0x00000000u, 0x0d0aabfcu, 0xf7666903u, 0xec5d90aau, 0x00000000u, + 0xee86a635u, 0xe237f413u, 0xa61606fcu, 0x00000000u, 0x85ab0fd7u, 0xfdd13bdbu, 0x8d6075e2u, 0x00000000u, + 0xa476623cu, 0x3634aa37u, 0xbf284477u, 0x00000000u, 0xd1c78653u, 0xadb3feedu, 0x7fa4408au, 0x00000000u, + 0x32a77b6au, 0x08ac3716u, 0xa0976732u, 0x00000000u, 0xaeda1174u, 0xc5ca1e59u, 0xf353b939u, 0x00000000u, + 0x7f53105cu, 0xd44334dfu, 0xb75edbe4u, 0x00000000u, 0x46f67512u, 0xd859d32du, 0x0da6b677u, 0x00000000u, + 0x9950dc38u, 0xf0badec3u, 0xa8b1d193u, 0x00000000u, 0xefe357bdu, 0x0e606587u, 0x884c5ed2u, 0x00000000u, + 0xc7d63411u, 0xa46ee9f4u, 0xe16ad66fu, 0x00000000u, 0x766cf523u, 0xaebf1396u, 0x6b75be3bu, 0x00000000u, + 0xdf433db5u, 0x1e942c35u, 0x410dffe5u, 0x00000000u, 0x18c4cc46u, 0xb3bcd975u, 0x3b94557eu, 0x00000000u, + 0x512fefb1u, 0xd62e1684u, 0x5c34ef2bu, 0x00000000u, 0x25554402u, 0x055e5375u, 0x3a08ec40u, 0x00000000u, + 0xea28d1a6u, 0x8c71f892u, 0xfead5d3du, 0x00000000u, 0x3712d6e9u, 0x59fa8772u, 0x29c7e9cdu, 0x00000000u, + 0x65fc32ecu, 0x90357e43u, 0xcee18a15u, 0x00000000u, 0x5e3b5c50u, 0xc583129du, 0xa04bf996u, 0x00000000u, + 0x4ab43782u, 0xe9864a08u, 0x6f2ab1c6u, 0x00000000u, 0x26a77c61u, 0xf673703cu, 0xe9d6c9cfu, 0x00000000u, + 0x0caebeeeu, 0xe709951fu, 0xf2875771u, 0x00000000u, 0xd43f1577u, 0x41477617u, 0xa19bf431u, 0x00000000u, + 0x89ca27c9u, 0x9ec1ee6cu, 0x9dcf44adu, 0x00000000u, 0xa3a370ddu, 0x83958e74u, 0xb0c45102u, 0x00000000u, + 0x86cfafcau, 0x04382d70u, 0x09083cf1u, 0x00000000u, 0xf5458e26u, 0xe8c4a35bu, 0x95ea20cbu, 0x00000000u, + 0x2cb1e624u, 0xc80e252fu, 0x24aeadb9u, 0x00000000u, 0x60958ae8u, 0x5471b135u, 0x032c76bcu, 0x00000000u, + 0xce983976u, 0x827df87du, 0x50f5f0adu, 0x00000000u, 0x81d7362fu, 0x00000e99u, 0x6fde87aeu, 0x00000000u, + 0x85033eb4u, 0x56f7b265u, 0xd493d37cu, 0x00000000u, 0x3ff49a3cu, 0x23487a39u, 0x870d2e4fu, 0x00000000u, + 0xe3249135u, 0x60123a68u, 0x0befa03du, 0x00000000u, 0xf84d74b5u, 0x71bd7da9u, 0x2c44f6cbu, 0x00000000u, + 0x9d98f068u, 0x51d59a46u, 0xf0131dceu, 0x00000000u, 0x4b40fe50u, 0x8cd5b0fbu, 0x8b164f67u, 0x00000000u, + 0x3e10a2d3u, 0x7fd0d4b7u, 0x1bec231fu, 0x00000000u, 0xa4cc2cd6u, 0xc22121ffu, 0xf33350e7u, 0x00000000u, + 0x536659b7u, 0x49043fc2u, 0x8c7ec0d7u, 0x00000000u, 0xb1597a41u, 0xfe1228f2u, 0x066908e4u, 0x00000000u, + 0x3d0194e7u, 0x432be415u, 0x4160b66fu, 0x00000000u, 0x76b6560au, 0xdf770ab8u, 0x07ef4642u, 0x00000000u, + 0xd0dafe5cu, 0x9e1f95f4u, 0x9d7dbecdu, 0x00000000u, 0xada5c397u, 0x1d8b6a84u, 0xbf29cf46u, 0x00000000u, + 0x3f858ef0u, 0x843e3a0cu, 0xad47e23fu, 0x00000000u, 0x9a9c1e18u, 0x52b851a8u, 0x65648845u, 0x00000000u, + 0x79fca3a8u, 0x0a8f8f09u, 0xb9dde8cbu, 0x00000000u, 0x199671dfu, 0x7565be28u, 0xa7add019u, 0x00000000u, + 0x14948e21u, 0xfedcb64du, 0x6091bc31u, 0x00000000u, 0x040bae5bu, 0xa89c3b59u, 0x8ebdcac3u, 0x00000000u, +}; + +// 982 u32 values +static const uint32_t kCnnV3TestWeightsU32[982] = { + 0xa8b23143u, 0x2f9432e3u, 0x3491b3cbu, 0x317e3104u, 0xa79fb324u, 0x3419acf6u, 0x32322d86u, 0xb13da859u, + 0xb4302831u, 0x2d0e324au, 0xad9630f5u, 0x338c3485u, 0xb1dd3158u, 0xb461a51du, 0x2f07b2a3u, 0x347d30b3u, + 0xacf9aeb0u, 0xb1f6a4adu, 0xa377b31bu, 0x2e85b13eu, 0x3263a8d4u, 0xaf352fb1u, 0x31da3261u, 0xb010ac52u, + 0xb2eb2f02u, 0xb4bbb1c3u, 0x2e553182u, 0x31642fe1u, 0x2948a64fu, 0xb367b2eau, 0xa4712e77u, 0x31172903u, + 0x281d2d2cu, 0xaf87288cu, 0xa8dcb481u, 0xab06b17bu, 0xb11c32c9u, 0xb033b43eu, 0x2e38afedu, 0x31732861u, + 0xab312e4fu, 0xb2653207u, 0xb3dfb495u, 0xa5db3045u, 0x1123b281u, 0x2f8ab2adu, 0xac92a823u, 0x2d01af9fu, + 0xb3ebad4eu, 0x346fb356u, 0x2fab33d8u, 0x3481b07fu, 0x302a315au, 0xb05fa7c7u, 0x33bbb3c0u, 0xb1b7a6cbu, + 0x2a16af74u, 0x32d9b235u, 0x303730f7u, 0x2ce3a937u, 0x2dc12a75u, 0xaa77b3fbu, 0x9b62b467u, 0xb2d3ae89u, + 0x2abbb39du, 0x3415b253u, 0xade12a3au, 0xb4952afbu, 0xa1703467u, 0xb401316eu, 0x9db6a019u, 0x29823434u, + 0xb079a412u, 0x225aae78u, 0xb498a8b1u, 0x339b3244u, 0x2826b2e8u, 0x2e9db384u, 0x2e1fb033u, 0x3128305cu, + 0x33fdb388u, 0xb471b12eu, 0xacf52836u, 0x31eb3255u, 0x3459af06u, 0x20a0b004u, 0x3430b0afu, 0xb45eb271u, + 0x34baa8fcu, 0x30c63385u, 0x338e3381u, 0xaf1121cbu, 0x2e353139u, 0xb3c9acdau, 0xb09030bdu, 0xb0f93432u, + 0x325bb33eu, 0xb228b2a8u, 0x33312ba2u, 0xaf49b1d4u, 0x34883154u, 0xb2d60f49u, 0xb131b4abu, 0x2ed2b312u, + 0x1bc7b343u, 0x2a3b2f76u, 0x31d7b1c4u, 0x30973023u, 0xb339b315u, 0xabde341bu, 0x9f04afa5u, 0x34602e41u, + 0x3414b01au, 0x283db490u, 0xb3912d25u, 0xaa36b2e8u, 0x2b60347au, 0x31d83428u, 0x3178a503u, 0xb381b4a1u, + 0x31b33253u, 0x24bab122u, 0x33102c12u, 0xaab72bebu, 0xa9b1acd5u, 0x330e2dd6u, 0xb0d7a715u, 0x30b9b10eu, + 0xb3943214u, 0x2b41b429u, 0x323cb2cbu, 0xb2d6af48u, 0xb26c340bu, 0xb2a7b022u, 0xb499b362u, 0xb23fb445u, + 0x2b00b44au, 0xac162ef0u, 0x1990aefdu, 0x32be3333u, 0xb21db462u, 0xb0d0b10eu, 0xaa6e2978u, 0xacdab454u, + 0xb3a6234cu, 0xb44d3267u, 0xb3b23414u, 0x33bb3299u, 0x31cd349bu, 0x2d79315eu, 0xb304315bu, 0x205f258au, + 0xa5b732deu, 0x2d5cac6au, 0xb2ebb07cu, 0xaa62a2ccu, 0xad16b122u, 0xaea0ad21u, 0x2f22aca1u, 0x344fafcdu, + 0xa1dd33feu, 0x2571ae97u, 0x2ddc32b1u, 0x250731d8u, 0xb0112d1bu, 0xb1b73083u, 0x32ed2f7bu, 0x2c64b310u, + 0x3055b3c6u, 0x342fb3fau, 0x3468b2f6u, 0x2b3231c7u, 0x31ab316du, 0xb0bc3448u, 0xb3c62aebu, 0xb2502c76u, + 0x299028fdu, 0x22f4a53au, 0x31bf3111u, 0x2ba69cd2u, 0xb34d3424u, 0xb3eab35au, 0xaa402e10u, 0x2e933144u, + 0x33a6ae63u, 0xb068310au, 0xaf20ad37u, 0xb2c3b293u, 0xa8c53430u, 0x3069ac7bu, 0x34302812u, 0xa2563162u, + 0x34acacbfu, 0x3455302eu, 0x32bbb353u, 0xb3422d43u, 0x2f252ac7u, 0xa704b4afu, 0xafdc323fu, 0xa86ea65eu, + 0x3404af9bu, 0xb37a3167u, 0x334834c6u, 0x3278b026u, 0x34cbb38fu, 0x2dc42e5du, 0x339fb3ddu, 0xb0fab486u, + 0x3150b2dbu, 0x33e2b1cbu, 0xb4742e00u, 0xb44eb4bfu, 0x31ca2c11u, 0x32b5b105u, 0x31c7b440u, 0x3139341bu, + 0x327d2f9cu, 0xb1bab46au, 0x1991b334u, 0x2cfe30b5u, 0xb29f32beu, 0xb1e53081u, 0x3008b067u, 0x2c49349cu, + 0x2c77b447u, 0x3360b465u, 0xb2473006u, 0xb213b3d7u, 0xa65d349cu, 0x2d3d3174u, 0xb2d02990u, 0xafa13448u, + 0x2fac29feu, 0x343b2dbbu, 0x1d22b2c0u, 0xa3efab5fu, 0xb306b350u, 0xaf80b043u, 0x2c43a989u, 0xaac62d2bu, + 0xb16cab01u, 0xaf072ac8u, 0xaa44b474u, 0xb145a3f2u, 0x290d2991u, 0x2dae2fc2u, 0xaf0f2ddau, 0x278c3185u, + 0x2cd7a944u, 0x1fd4ad5au, 0x336b308bu, 0x1877340bu, 0x31c2223au, 0x327aaf20u, 0xb3609b33u, 0x3291b41cu, + 0xb036b444u, 0xb247ae5fu, 0x30a9af26u, 0x3248b4a9u, 0xace832d9u, 0x2bbfb2a7u, 0xad30b34du, 0x34c23467u, + 0xaf423139u, 0x2fe32f35u, 0x2d69ac4fu, 0xb196b4b2u, 0xb27523b5u, 0x3275b26au, 0x284c34b2u, 0x34b53283u, + 0xa7f3b2e2u, 0xb408ac20u, 0xa91630e7u, 0xb2b5a4b6u, 0x33d1b220u, 0xb121b45fu, 0x9e06affcu, 0x9c1f2aa4u, + 0xb0ecb3fcu, 0x2d493299u, 0x2e892dbau, 0xb43e310cu, 0x2612ad1fu, 0x329dae34u, 0x3128a15bu, 0x19e332c2u, + 0x2ab133ddu, 0xae1f32bau, 0x24d391d1u, 0xabcbb396u, 0x2d063402u, 0xae30b231u, 0xb490b1ecu, 0xa7f5341bu, + 0x2b90af64u, 0xb043b4bbu, 0x2d232fccu, 0x2c9f34a0u, 0x3105a2e9u, 0x303d33beu, 0x316a3472u, 0xb369330bu, + 0xa89a3076u, 0x2deb2814u, 0x34a73483u, 0x307db011u, 0xade530cdu, 0xb468b339u, 0x9e543153u, 0xa56134a9u, + 0xaaca3497u, 0xb3f931a4u, 0x31cd2842u, 0x32323414u, 0xace3b472u, 0xb380b455u, 0x30182ebbu, 0x33043141u, + 0x31c73099u, 0xb119b454u, 0x32e02caeu, 0x207eb4c2u, 0xb4842ecfu, 0x3399ab93u, 0xb1092e97u, 0xadd632c6u, + 0xafb832c9u, 0xabea2af0u, 0x336cb053u, 0xb3f9b200u, 0x302eae12u, 0x34ca31e7u, 0xab12afd2u, 0x29c0b2f9u, + 0x2fb734c7u, 0xac222b50u, 0x979433f9u, 0xad2bb305u, 0xb1b9b428u, 0xa72db4a1u, 0xae042d2bu, 0x3469aa1du, + 0x264730d6u, 0x339fb023u, 0xaeb5b116u, 0x248a33dbu, 0x2af830a7u, 0xafb42de4u, 0xaed1b0f7u, 0x3330b29eu, + 0x28b9b029u, 0x3173319bu, 0xa34ba8bbu, 0x2eb434c0u, 0x33bb320bu, 0xb20b3186u, 0xb3a528c6u, 0x345f2ddfu, + 0xa9261fd3u, 0x346ab475u, 0xb468b39fu, 0xb42cb0e0u, 0x20f1a6e5u, 0xb450af33u, 0xac6fb375u, 0x2f9cb438u, + 0xaf9ab1a0u, 0xaa68ac11u, 0xb373b4c9u, 0xb4ca32f7u, 0x9e731d05u, 0xa946ae69u, 0x328d3163u, 0xaed1b09au, + 0xa230b0f2u, 0xb1382f0au, 0x3422ae80u, 0xa607b455u, 0xb2b63010u, 0xb2f2b458u, 0xb4b63405u, 0xb480b1fcu, + 0x2c9db37au, 0x2951b0f6u, 0x32b62aedu, 0x32c9b4c1u, 0xb27a2c93u, 0x32d3313eu, 0x3405b0b8u, 0x2bf1a6ffu, + 0xad5134a7u, 0xaef93203u, 0x2bbd31bfu, 0xaa9ab172u, 0xb40daf01u, 0xade5b483u, 0xb26cb49eu, 0x2ffe3053u, + 0xaf053095u, 0x2b35337du, 0xb2d7b32eu, 0xb2482f6au, 0x34b91c7bu, 0xb4a4b4c3u, 0x2a8034bcu, 0x33a1b32au, + 0x258f334eu, 0xb05b2cadu, 0x2b43b451u, 0x2e48afe2u, 0xb4a03275u, 0xb1292b5au, 0xb0bb332eu, 0x281d2c41u, + 0x2ed2abf9u, 0x29243056u, 0x34a430f6u, 0x207baa33u, 0x31afb4aeu, 0xab122237u, 0x337bb3cbu, 0x2f03ac08u, + 0x346eb2bbu, 0xb1c5b22du, 0x33ec32e4u, 0xa4a3b187u, 0x3344307cu, 0x213aaca5u, 0x307030a4u, 0x295f316fu, + 0x33c2b397u, 0x31b93305u, 0xb1adb3afu, 0xb49430adu, 0xacb8349au, 0x33713036u, 0xaef2ac0cu, 0x2a382c2du, + 0x2bd1aafau, 0xa4f8342eu, 0xacb7b1d3u, 0xb315ac11u, 0x2f16b279u, 0x345eae2au, 0xb3b3b0ecu, 0x335130deu, + 0xb1b8b043u, 0x22c3b209u, 0xb09ea4dau, 0xa1b0b45eu, 0x2ddb3469u, 0xb37a9986u, 0xafe1b0c1u, 0x333c3116u, + 0x34a733b6u, 0x345934a1u, 0xb2f4b41cu, 0x2810af82u, 0x32a4b3bdu, 0x26822c7cu, 0xb0bdb26du, 0x32c2b286u, + 0x30842a78u, 0xacf2afd5u, 0x30feab4fu, 0xb1a1313bu, 0xb349343du, 0xb3ac339bu, 0x32a7b085u, 0xaaa3b227u, + 0xb0e4a7b4u, 0x32bf3009u, 0xae2c3331u, 0xb0d524bdu, 0xb281b0e6u, 0x33733439u, 0x216d3153u, 0x24929dc5u, + 0xa907259eu, 0xb330b312u, 0xa1853457u, 0xb276345au, 0xb19d282eu, 0xb483b0bdu, 0x3400b351u, 0x2c27aedau, + 0xaba5a560u, 0xb20124e6u, 0x321d34b1u, 0xa4cc30a4u, 0x340ab2aau, 0xb452ae17u, 0x31a7ae0du, 0x30d12cb6u, + 0xb18831a3u, 0xa8c33411u, 0xb4c72d57u, 0xb03534bdu, 0xa669b434u, 0xb2adb31fu, 0xac1d2d14u, 0xaef2340du, + 0xa5fa3058u, 0x2ba82e24u, 0x3452a42eu, 0xb232ae0au, 0x32a52ed7u, 0xa7bdb46au, 0x30cb3389u, 0x24d334b9u, + 0xaf962e25u, 0xad22344cu, 0xab703094u, 0x303828bfu, 0x33d11d7du, 0x2da6aa33u, 0xab0eae0eu, 0xb32fa89bu, + 0x2e6eb3d7u, 0x2e412df9u, 0xaea9b49cu, 0x3157b1cdu, 0xb0dd32edu, 0xb31e2e72u, 0xb2f0b051u, 0x2eb6b028u, + 0xb1b633e1u, 0x2fbc2677u, 0x2c3a3459u, 0x2cd2b0b4u, 0x3492aee5u, 0x2f1fabb4u, 0xabadb494u, 0x2c3c3334u, + 0x342fa84eu, 0xaed432ccu, 0x9e4126d8u, 0xae5f2d14u, 0x33ecb0e9u, 0x32983412u, 0x30b43497u, 0x310b3115u, + 0xa6b0a1f5u, 0x2d90b0fdu, 0xb0a6b00cu, 0xaaea2a9au, 0x3211b166u, 0xb26132ecu, 0xae4bb4bau, 0x328331b6u, + 0x3144b2f2u, 0xac39b361u, 0xb1e734c9u, 0xa9eeb2a5u, 0x34282c98u, 0xb422349eu, 0x3195b2f3u, 0x3364b067u, + 0xb3742e47u, 0xb2373262u, 0x2802a9e0u, 0x2f260f88u, 0x34b92dddu, 0x210e34cbu, 0x3060b3c6u, 0xaefa3493u, + 0xa9c7a5f5u, 0x2e9db44eu, 0x3185ada2u, 0x322b3260u, 0xaedaaa66u, 0x31c930c7u, 0x338c9d5fu, 0x3347b2d6u, + 0xb2a7b3c1u, 0xb4272533u, 0xb4ccb44bu, 0x3146a8e4u, 0x2dd9b477u, 0xb2603234u, 0x32edb2c8u, 0x25fb3234u, + 0xb3d23221u, 0x3091ac0du, 0x30d8b0a8u, 0x30ce1922u, 0xad13a56du, 0xb199b164u, 0xb35130d4u, 0xb237b3f3u, + 0x3234b23cu, 0x34b5abcbu, 0x2de5b2f7u, 0xac1da7e0u, 0x34913334u, 0xb1e83329u, 0x2dfeb15cu, 0x3082b00du, + 0x27d2291au, 0x300b324fu, 0xb453b48bu, 0xa03e2bcdu, 0xaa3fb09bu, 0x324530f4u, 0xac6728b4u, 0x22bcb067u, + 0x28ecaf1du, 0xb42a2ec9u, 0xb186b4cau, 0x2d9e3393u, 0x337eb24cu, 0xa358a49du, 0xb43b342eu, 0x2a96b178u, + 0x2ee5b1d9u, 0xb3b2b17du, 0xa9efb1a0u, 0xb304b238u, 0xaee632ebu, 0xb445ad59u, 0xb084ad7au, 0xb1a82c3eu, + 0x34203375u, 0xb38ab3afu, 0x2c97b363u, 0x32c4b342u, 0x322430e2u, 0x34c52401u, 0xab95b460u, 0x344faec3u, + 0x31e129e7u, 0x3498b254u, 0x9bc792b0u, 0xac943485u, 0x2a82abdfu, 0xaedfb324u, 0xb39eb1e4u, 0x2adc3308u, + 0xab04ad6du, 0x2d1da942u, 0x34283419u, 0xae8eac2bu, 0x2938a785u, 0x2e19a5ddu, 0xaee2282eu, 0xaca9adedu, + 0xb269b3c8u, 0xac423049u, 0x28d0b17bu, 0xb0a830e8u, 0x341b3246u, 0x2e7d2b7bu, 0x3447b44cu, 0x3412ac21u, + 0xb4c12a11u, 0x2eeeb46eu, 0xb48c28d0u, 0x32c330adu, 0xa5429867u, 0x3181b4bfu, 0x3346ae80u, 0x28fface0u, + 0xb284b069u, 0x98313147u, 0x3478259bu, 0x32d43473u, 0x2b3db1ffu, 0xacd23343u, 0x31f8b3a9u, 0x28f2a19au, + 0x2c1e34a1u, 0xacfea9c3u, 0xb354aa54u, 0x327c31a9u, 0xaf89b024u, 0xb4923330u, 0xa5942249u, 0xb2762f4du, + 0xacbc2e77u, 0xb46a2c8fu, 0x22d6a5e1u, 0xb2ebb17cu, 0xadfcb1fbu, 0x344bb3beu, 0x31efab4cu, 0xb406348eu, + 0x33222dd3u, 0xa9732d28u, 0xb475a94bu, 0x34bd2551u, 0xb46a3467u, 0xaf9d2fe7u, 0xb45e33bdu, 0x327b342fu, + 0xb43ab358u, 0xb29c326du, 0x33fb255cu, 0xabe4292eu, 0xb432a839u, 0xaf453480u, 0xb01cb1f7u, 0xb3ca3475u, + 0x2ffb330du, 0x34923256u, 0x34423292u, 0x31aeb32au, 0x26ae2704u, 0x348833a9u, 0xa13e2fdeu, 0xac5da6b2u, + 0xac2bad51u, 0xb40eb3e0u, 0xa8243160u, 0x2c4f3422u, 0xb293b476u, 0x3491b1a6u, 0x20d7b167u, 0x34953430u, + 0xb454b110u, 0xb431b2a7u, 0x2944b1dbu, 0x25b833ebu, 0xb1b92cd7u, 0x2c8631c5u, 0x344f32feu, 0xb3dcadd7u, + 0xa8c6b143u, 0xacac3447u, 0x26383133u, 0xaee2b217u, 0x2ea32de2u, 0xb136a903u, 0xb25c3043u, 0xb1fe3431u, + 0x95c8b38au, 0x1d092201u, 0x34c0a8ddu, 0xa2c6a037u, 0xb429aa02u, 0xb1392b24u, 0xb4502d49u, 0xa4c3349fu, + 0xa382339eu, 0x3230b439u, 0x31692db1u, 0x2d4ca984u, 0x2d0832d7u, 0x2e3eadd9u, 0x346c2e99u, 0xa9c1acf5u, + 0x1b1531feu, 0xa630308bu, 0x30b5b176u, 0xb1cab315u, 0x30bf2cdfu, 0xb05e3394u, 0x3461b379u, 0xb1ceb2a3u, + 0x2484b002u, 0x3101335au, 0xb23d2ffeu, 0xa2a2ad77u, 0xb05ab2c5u, 0xb328af5cu, 0x2fe027b8u, 0xa1cdb421u, + 0x30c3316bu, 0xa7f3320du, 0xb0fa3228u, 0xb237ad58u, 0xac25a4ffu, 0x30d6b0f9u, 0xb05daf26u, 0xa8b3b316u, + 0xad05af91u, 0x310db112u, 0x95153421u, 0x330d2d7eu, 0x2c5c34cau, 0x33d8337cu, 0xa6003199u, 0xb4a6b405u, + 0x9afca301u, 0x293cab1du, 0x32772802u, 0xb147b384u, 0x32ae31a0u, 0x332934c0u, 0xaf21337cu, 0xb074ab8du, + 0x34362c15u, 0x2861adf7u, 0xb257afd2u, 0xb43eb1dau, 0xab753450u, 0x2ef7326au, 0xb23bb1eau, 0xb08431fdu, + 0xb2e933efu, 0x320a2ed3u, 0xb04cb115u, 0xb0853476u, 0xa92fa961u, 0x314cb0beu, 0xacac3138u, 0xb182326fu, + 0x2977b4c2u, 0xb46e34beu, 0x21d0b19bu, 0xb3e431f3u, 0x30beac06u, 0xb1f9acefu, 0xaae9b1d9u, 0x32f2a253u, + 0x2f502d76u, 0xaac534a3u, 0x3410ab6cu, 0x306ab0e1u, 0xb3b930e3u, 0xb1c0a4e1u, 0xa58eb20au, 0x31afb00bu, + 0x31d033d3u, 0xb056b07eu, 0xb302b0b6u, 0xab863443u, 0x32a8b149u, 0xb476abbeu, 0x2d59b27eu, 0xb18d316fu, + 0x32bd3402u, 0xb13b32b0u, 0x330eb451u, 0x2c91af75u, 0xb32330feu, 0x34363164u, 0xa8843063u, 0x2438333bu, + 0x322730b3u, 0xb01330cau, 0xb106b356u, 0x1a7c1711u, 0xac072a59u, 0x33ef3182u, 0x308bb0ddu, 0x23182ed6u, + 0x31abb1fbu, 0x3042346cu, 0x34a63403u, 0x2d4fadaeu, 0x34aca401u, 0xaabbb3cdu, 0xb0253336u, 0xac803187u, + 0x335d349cu, 0x3130afa1u, 0x2c16ae44u, 0x32542e16u, 0xb15fb491u, 0x28e13012u, 0xb4459967u, 0x3303add4u, + 0xabf4b481u, 0xb4503446u, 0xb0a730cbu, 0x30b9330au, 0xb1743368u, 0x315d24ceu, 0xb12bb0e2u, 0x3467b48au, + 0x2d353008u, 0x24db30afu, 0xa7c0a355u, 0xb1bb2d65u, 0x31fc3373u, 0x2a0aadc7u, 0x3489b2e4u, 0x340d33bcu, + 0xb25aae72u, 0xb3dfb189u, 0xb41eb34bu, 0xb41734b4u, 0x3249b30eu, 0x31452a2au, 0x33562d07u, 0x22cc2d51u, + 0xa80a3016u, 0xa4f8ade9u, 0x31b2b257u, 0xb2a7b098u, 0x3291b0dfu, 0x2f25ae85u, 0xb40e32d8u, 0xaf633117u, + 0x2f41b171u, 0xafdf33deu, 0xb0092949u, 0x2cae282bu, 0x2cbf2c3du, 0x2f6fb302u, 0x2f032eb6u, 0xa815b417u, + 0xb3efaafcu, 0xa889b155u, 0xb10c30aau, 0x27093255u, 0xa4373100u, 0x2c998fecu, 0x9d08341cu, 0x2bd52467u, + 0xb41c308fu, 0xa1a820d2u, 0x9bfb340cu, 0x3470a4deu, 0xb221a797u, 0xb3c6b16fu, 0xa93ab4cbu, 0x307c312bu, + 0x2960b47eu, 0x343cb2a3u, 0x2cddb263u, 0x2d70aeacu, 0x30fcaabdu, 0x33cbb12cu, 0x30eb081fu, 0x34ca32b7u, + 0xb017b14du, 0xb08eb0cbu, 0x34bd2a55u, 0x241d34b5u, 0x32e92d57u, 0xb1b03428u, 0x32642462u, 0xa89faf8fu, + 0x34a834a9u, 0x31bb33ccu, 0xaee1b06fu, 0x28ab2743u, 0x2d401ac3u, 0x30c9ab0au, 0xae81ae64u, 0xb1fab328u, + 0xa82032bbu, 0x30fbb13au, 0xb4992825u, 0xb234af0du, 0xb0522dfcu, 0xb446b42du, 0xb4972a47u, 0xb29e32b2u, + 0xa83f2c18u, 0xb41ca864u, 0x338c31d0u, 0xb22cb4b2u, 0x279a33c1u, 0xb1b5b2b8u, 0x30512e25u, 0x345a2ba3u, + 0xafab9b4bu, 0xad64a2feu, 0xb45cb14bu, 0x300fadadu, 0xa8acb49fu, 0x2c3d2d88u, 0x31f63150u, 0xb3a03011u, + 0x2bf1a3acu, 0xb464b0e3u, 0xa6eeb14fu, 0xb235aa9cu, 0x3416323bu, 0x3420b1bcu, 0x3414b4a1u, 0xb4af3457u, + 0x3484310du, 0x348533cbu, 0xb40d27bbu, 0x2c5f32b7u, 0xaa5b2c68u, 0xb2a72984u, +}; + +// 256 uint16 values (raw f16 bits) +static const uint16_t kCnnV3ExpectedEnc0U16[256] = { + 0x0000u, 0x0000u, 0x350cu, 0x3b3cu, 0x19bcu, 0x0000u, 0x0000u, 0x3d10u, + 0x31e9u, 0x0000u, 0x35d0u, 0x39c3u, 0x0000u, 0x0000u, 0x2c6fu, 0x35fbu, + 0x39b9u, 0x0000u, 0x0000u, 0x3538u, 0x2ebbu, 0x0000u, 0x34f8u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x3c96u, 0x0000u, 0x3029u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x405au, 0x0000u, 0x367eu, 0x0000u, 0x3d2fu, + 0x383bu, 0x0000u, 0x342cu, 0x3f97u, 0x0000u, 0x3c3cu, 0x0000u, 0x424eu, + 0x0000u, 0x0000u, 0x0000u, 0x3a3au, 0x0000u, 0x3d8fu, 0x0000u, 0x3fd4u, + 0x307du, 0x0000u, 0x0000u, 0x3f68u, 0x0000u, 0x0000u, 0x0000u, 0x3c81u, + 0x0000u, 0x0000u, 0x398fu, 0x3ffeu, 0x0000u, 0x0000u, 0x0000u, 0x3ec1u, + 0x0000u, 0x39b8u, 0x0000u, 0x3c61u, 0x0000u, 0x2e3au, 0x3699u, 0x41deu, + 0x0000u, 0x0000u, 0x0000u, 0x3d2cu, 0x329au, 0x0000u, 0x0000u, 0x41a9u, + 0x2d70u, 0x342fu, 0x0000u, 0x4066u, 0x2c77u, 0x0000u, 0x37b7u, 0x3842u, + 0x2b9au, 0x0000u, 0x3655u, 0x4001u, 0x340au, 0x0000u, 0x30f5u, 0x41a5u, + 0x0000u, 0x0000u, 0x0000u, 0x3d05u, 0x0000u, 0x0000u, 0x30a6u, 0x40a3u, + 0x0000u, 0x0000u, 0x0000u, 0x4263u, 0x0000u, 0x0000u, 0x0000u, 0x3e62u, + 0x0000u, 0x0000u, 0x0000u, 0x42d7u, 0x0000u, 0x0000u, 0x0000u, 0x3de8u, + 0x0000u, 0x0000u, 0x0000u, 0x3f4du, 0x0000u, 0x38d4u, 0x3a61u, 0x3fb7u, + 0x0000u, 0x0000u, 0x0000u, 0x404cu, 0x3811u, 0x31a4u, 0x0000u, 0x3edfu, + 0x0000u, 0x0000u, 0x0000u, 0x3f30u, 0x0000u, 0x0000u, 0x0000u, 0x3ec7u, + 0x27dau, 0x0000u, 0x0000u, 0x3efeu, 0x0000u, 0x3027u, 0x0000u, 0x39ceu, + 0x28e8u, 0x0000u, 0x0000u, 0x4121u, 0x0000u, 0x0000u, 0x0000u, 0x40eeu, + 0x3b70u, 0x3379u, 0x0000u, 0x40d3u, 0x0000u, 0x0000u, 0x0000u, 0x3d88u, + 0x329du, 0x0000u, 0x0000u, 0x3fafu, 0x35c0u, 0x0000u, 0x374cu, 0x40ceu, + 0x32b4u, 0x2c9au, 0x0000u, 0x4094u, 0x3105u, 0x31f4u, 0x34e9u, 0x3cd7u, + 0x0000u, 0x0000u, 0x344bu, 0x3cd1u, 0x0000u, 0x2d13u, 0x0000u, 0x3e7eu, + 0x0000u, 0x2eacu, 0x0000u, 0x4123u, 0x0000u, 0x36edu, 0x0000u, 0x3c69u, + 0x0000u, 0x0000u, 0x0000u, 0x41d5u, 0x0000u, 0x36e4u, 0x0000u, 0x4049u, + 0x0000u, 0x0000u, 0x0000u, 0x401du, 0x0000u, 0x38d1u, 0x333au, 0x3b08u, + 0x0000u, 0x0000u, 0x0000u, 0x3d12u, 0x0000u, 0x0000u, 0x0000u, 0x3e6eu, + 0x0000u, 0x0000u, 0x0000u, 0x4028u, 0x0000u, 0x0000u, 0x0000u, 0x3f64u, + 0x0000u, 0x0000u, 0x0000u, 0x3e4bu, 0x2eeau, 0x393cu, 0x0000u, 0x4007u, + 0x0000u, 0x267fu, 0x0000u, 0x3eabu, 0x35b4u, 0x38f9u, 0x0000u, 0x3e6bu, +}; + +// kCnnV3Dec1HW = (W/2) x (H/2) = 4 x 4 +// 64 uint16 values (raw f16 bits) +static const uint16_t kCnnV3ExpectedDec1U16[64] = { + 0x0000u, 0x2692u, 0x3823u, 0x397eu, 0x0000u, 0x22dcu, 0x35dcu, 0x35f9u, + 0x0000u, 0x3936u, 0x24b5u, 0x3434u, 0x0000u, 0x3b63u, 0x0000u, 0x32fcu, + 0x0000u, 0x2913u, 0x3523u, 0x33d6u, 0x0000u, 0x3023u, 0x2575u, 0x0000u, + 0x0000u, 0x39edu, 0x0000u, 0x0000u, 0x0000u, 0x3c91u, 0x0000u, 0x0000u, + 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, 0x0000u, + 0x0000u, 0x3754u, 0x0000u, 0x0000u, 0x318cu, 0x3a4du, 0x0000u, 0x0000u, + 0x3206u, 0x32deu, 0x0000u, 0x0000u, 0x317du, 0x3437u, 0x0000u, 0x0000u, + 0x312au, 0x357fu, 0x0000u, 0x0000u, 0x0000u, 0x39b5u, 0x0000u, 0x0000u, +}; + +// 256 uint16 values (raw f16 bits) +static const uint16_t kCnnV3ExpectedOutputU16[256] = { + 0x3800u, 0x3934u, 0x3800u, 0x38aau, 0x384au, 0x3800u, 0x3800u, 0x3917u, + 0x38d5u, 0x3800u, 0x3800u, 0x38f2u, 0x3800u, 0x38c9u, 0x3800u, 0x38d4u, + 0x3800u, 0x3800u, 0x3800u, 0x3800u, 0x3800u, 0x38dau, 0x3800u, 0x3800u, + 0x3800u, 0x383eu, 0x3800u, 0x3800u, 0x3800u, 0x3800u, 0x3800u, 0x3800u, + 0x396du, 0x38eeu, 0x3800u, 0x3a87u, 0x3899u, 0x3800u, 0x3800u, 0x3972u, + 0x3a4au, 0x3800u, 0x3800u, 0x3847u, 0x386du, 0x3800u, 0x3800u, 0x3a70u, + 0x3800u, 0x381fu, 0x3800u, 0x3800u, 0x3800u, 0x3945u, 0x3800u, 0x392eu, + 0x3800u, 0x3800u, 0x3800u, 0x3844u, 0x3800u, 0x3800u, 0x3820u, 0x3800u, + 0x3a6du, 0x3832u, 0x3800u, 0x3ab0u, 0x3909u, 0x3800u, 0x3800u, 0x3a12u, + 0x3873u, 0x3800u, 0x3800u, 0x39b8u, 0x3a9au, 0x3800u, 0x3800u, 0x3a41u, + 0x3800u, 0x3800u, 0x3800u, 0x38d0u, 0x3952u, 0x3800u, 0x3800u, 0x398cu, + 0x3800u, 0x3800u, 0x3800u, 0x3a21u, 0x3800u, 0x3800u, 0x3800u, 0x3800u, + 0x3950u, 0x3800u, 0x3800u, 0x3abdu, 0x39ccu, 0x3800u, 0x3800u, 0x39e0u, + 0x3800u, 0x3800u, 0x3800u, 0x3a62u, 0x38d7u, 0x3800u, 0x3800u, 0x3a23u, + 0x3858u, 0x3800u, 0x3800u, 0x39f8u, 0x3800u, 0x3800u, 0x3800u, 0x3a01u, + 0x38e7u, 0x3800u, 0x3800u, 0x3822u, 0x38fcu, 0x3800u, 0x3832u, 0x3800u, + 0x3840u, 0x383au, 0x3800u, 0x3b39u, 0x390du, 0x3800u, 0x3800u, 0x399bu, + 0x3800u, 0x3800u, 0x3800u, 0x39c2u, 0x3802u, 0x3800u, 0x3800u, 0x3a41u, + 0x398bu, 0x3800u, 0x3800u, 0x39fau, 0x3800u, 0x3800u, 0x3800u, 0x396au, + 0x38d3u, 0x3800u, 0x3800u, 0x3888u, 0x3909u, 0x3800u, 0x3800u, 0x3800u, + 0x3863u, 0x3800u, 0x3800u, 0x3ae8u, 0x3a06u, 0x3800u, 0x3800u, 0x3a7du, + 0x38c1u, 0x3800u, 0x3800u, 0x3a20u, 0x38cdu, 0x3800u, 0x3800u, 0x390cu, + 0x3820u, 0x3800u, 0x3800u, 0x39d5u, 0x3863u, 0x3800u, 0x3800u, 0x389cu, + 0x3800u, 0x3800u, 0x3800u, 0x38bcu, 0x3887u, 0x3800u, 0x3866u, 0x3800u, + 0x38bbu, 0x3800u, 0x3800u, 0x3a8du, 0x394cu, 0x3800u, 0x3800u, 0x39b9u, + 0x394au, 0x3800u, 0x3800u, 0x3977u, 0x3800u, 0x3800u, 0x3800u, 0x3906u, + 0x3800u, 0x3800u, 0x386bu, 0x3a02u, 0x38bbu, 0x3800u, 0x3800u, 0x39d7u, + 0x38a2u, 0x3800u, 0x3800u, 0x3800u, 0x3899u, 0x3800u, 0x3811u, 0x3800u, + 0x3830u, 0x3800u, 0x387au, 0x3918u, 0x386au, 0x3800u, 0x38acu, 0x39f0u, + 0x39c7u, 0x3800u, 0x38beu, 0x3988u, 0x38c3u, 0x3800u, 0x3930u, 0x39d5u, + 0x397bu, 0x3800u, 0x3918u, 0x3a09u, 0x394cu, 0x3800u, 0x3952u, 0x3961u, + 0x3980u, 0x3800u, 0x392eu, 0x3872u, 0x39c2u, 0x3800u, 0x3903u, 0x3800u, +}; + diff --git a/cnn_v3/training/gen_test_vectors.py b/cnn_v3/training/gen_test_vectors.py new file mode 100644 index 0000000..640971c --- /dev/null +++ b/cnn_v3/training/gen_test_vectors.py @@ -0,0 +1,451 @@ +#!/usr/bin/env python3 +# CNN v3 parity reference — numpy forward pass matching WGSL shaders exactly. +# Generates test vectors for C++ GPU parity validation. +# +# Usage: +# python3 cnn_v3/training/gen_test_vectors.py # self-test only +# python3 cnn_v3/training/gen_test_vectors.py --header # emit C header to stdout + +import numpy as np +import struct +import sys +import argparse + +# --------------------------------------------------------------------------- +# Weight layout (f16 units, matching C++ cnn_v3_effect.cc constants) +# --------------------------------------------------------------------------- + +ENC0_IN, ENC0_OUT = 20, 4 +ENC1_IN, ENC1_OUT = 4, 8 +BN_IN, BN_OUT = 8, 8 +DEC1_IN, DEC1_OUT = 16, 4 +DEC0_IN, DEC0_OUT = 8, 4 + +ENC0_WEIGHTS = ENC0_IN * ENC0_OUT * 9 + ENC0_OUT # 724 +ENC1_WEIGHTS = ENC1_IN * ENC1_OUT * 9 + ENC1_OUT # 296 +BN_WEIGHTS = BN_IN * BN_OUT * 1 + BN_OUT # 72 +DEC1_WEIGHTS = DEC1_IN * DEC1_OUT * 9 + DEC1_OUT # 580 +DEC0_WEIGHTS = DEC0_IN * DEC0_OUT * 9 + DEC0_OUT # 292 + +ENC0_OFFSET = 0 +ENC1_OFFSET = ENC0_OFFSET + ENC0_WEIGHTS +BN_OFFSET = ENC1_OFFSET + ENC1_WEIGHTS +DEC1_OFFSET = BN_OFFSET + BN_WEIGHTS +DEC0_OFFSET = DEC1_OFFSET + DEC1_WEIGHTS +TOTAL_F16 = DEC0_OFFSET + DEC0_WEIGHTS # 1964 + 292 = 2256? let me check +# 724 + 296 + 72 + 580 + 292 = 1964 ... actually let me recount +# ENC0: 20*4*9 + 4 = 720+4 = 724 +# ENC1: 4*8*9 + 8 = 288+8 = 296 +# BN: 8*8*1 + 8 = 64+8 = 72 +# DEC1: 16*4*9 + 4 = 576+4 = 580 +# DEC0: 8*4*9 + 4 = 288+4 = 292 +# Total = 724+296+72+580+292 = 1964 ... but HOWTO.md says 2064. Let me recheck. +# DEC1: 16*4*9 = 576 ... but the shader says Conv(16->4) which is IN=16, OUT=4 +# weight idx: o * DEC1_IN * 9 + i * 9 + ki where o<DEC1_OUT, i<DEC1_IN +# So total conv weights = DEC1_OUT * DEC1_IN * 9 = 4*16*9 = 576, bias = 4 +# Total DEC1 = 580. OK that's right. +# Let me add: 724+296+72+580+292 = 1964. But HOWTO says 2064? +# DEC1: Conv(16->4) = OUT*IN*K^2 = 4*16*9 = 576 + bias 4 = 580. HOWTO says 576+4=580 OK. +# Total = 724+296+72+580+292 = let me sum: 724+296=1020, +72=1092, +580=1672, +292=1964. +# Hmm, HOWTO.md says 2064. Let me recheck HOWTO weight table: +# enc0: 20*4*9=720 +4 = 724 +# enc1: 4*8*9=288 +8 = 296 +# bottleneck: 8*8*1=64 +8 = 72 +# dec1: 16*4*9=576 +4 = 580 +# dec0: 8*4*9=288 +4 = 292 +# Total = 724+296+72+580+292 = 1964 +# The HOWTO says 2064 but I get 1964... 100 difference. Possible typo in doc. +# I'll use the correct value derived from the formulas: 1964. + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def get_w(w_f32, base, idx): + """Read one f16-precision weight. Matches WGSL get_w().""" + return float(w_f32[base + idx]) + + +# --------------------------------------------------------------------------- +# Layer forward passes — each matches the corresponding WGSL compute shader +# --------------------------------------------------------------------------- + +def enc0_forward(feat0, feat1, w, gamma, beta): + """ + Conv(20->4, 3x3, zero-pad) + FiLM + ReLU → rgba16float (f16 stored). + feat0: (H, W, 8) f32 — channels from unpack2x16float(feat_tex0) + feat1: (H, W, 12) f32 — channels from unpack4x8unorm(feat_tex1) + gamma, beta: (ENC0_OUT,) f32 — FiLM params + Returns: (H, W, 4) f32 — f16 precision (rgba16float texture boundary) + """ + H, W = feat0.shape[:2] + wo = ENC0_OFFSET + feat = np.concatenate([feat0, feat1], axis=2) # (H, W, 20) + fp = np.pad(feat, ((1, 1), (1, 1), (0, 0)), mode='constant') # zero-pad + + out = np.zeros((H, W, ENC0_OUT), dtype=np.float32) + for o in range(ENC0_OUT): + bias = get_w(w, wo, ENC0_OUT * ENC0_IN * 9 + o) + s = np.full((H, W), bias, dtype=np.float32) + for i in range(ENC0_IN): + for ky in range(3): + for kx in range(3): + wv = get_w(w, wo, o * ENC0_IN * 9 + i * 9 + ky * 3 + kx) + s += wv * fp[ky:ky+H, kx:kx+W, i] + out[:, :, o] = np.maximum(0.0, gamma[o] * s + beta[o]) + + return np.float16(out).astype(np.float32) # rgba16float texture boundary + + +def enc1_forward(enc0, w, gamma_lo, gamma_hi, beta_lo, beta_hi): + """ + AvgPool2x2(enc0, clamp-border) + Conv(4->8, 3x3, zero-pad) + FiLM + ReLU + → rgba32uint (pack2x16float, f16 precision, half-res). + enc0: (H, W, 4) f32 — rgba16float precision + """ + H, W = enc0.shape[:2] + hH, hW = H // 2, W // 2 + wo = ENC1_OFFSET + + # AvgPool2x2 with clamp at borders (matches load_enc0_avg in WGSL) + avg = np.zeros((hH, hW, ENC1_IN), dtype=np.float32) + for hy in range(hH): + for hx in range(hW): + s = np.zeros(ENC1_IN, dtype=np.float32) + for dy in range(2): + for dx in range(2): + fy = min(hy * 2 + dy, H - 1) + fx = min(hx * 2 + dx, W - 1) + s += enc0[fy, fx, :] + avg[hy, hx, :] = s * 0.25 + + # 3x3 conv with zero-padding at half-res borders + ap = np.pad(avg, ((1, 1), (1, 1), (0, 0)), mode='constant') + gamma = np.concatenate([gamma_lo, gamma_hi]) + beta = np.concatenate([beta_lo, beta_hi]) + + out = np.zeros((hH, hW, ENC1_OUT), dtype=np.float32) + for o in range(ENC1_OUT): + bias = get_w(w, wo, ENC1_OUT * ENC1_IN * 9 + o) + s = np.full((hH, hW), bias, dtype=np.float32) + for i in range(ENC1_IN): + for ky in range(3): + for kx in range(3): + wv = get_w(w, wo, o * ENC1_IN * 9 + i * 9 + ky * 3 + kx) + s += wv * ap[ky:ky+hH, kx:kx+hW, i] + out[:, :, o] = np.maximum(0.0, gamma[o] * s + beta[o]) + + return np.float16(out).astype(np.float32) # pack2x16float boundary + + +def bottleneck_forward(enc1, w): + """ + AvgPool2x2(enc1, clamp-border) + Conv(8->8, 1x1) + ReLU + → rgba32uint (f16, quarter-res). No FiLM. + enc1: (hH, hW, 8) f32 — half-res + """ + hH, hW = enc1.shape[:2] + qH, qW = hH // 2, hW // 2 + wo = BN_OFFSET + + # AvgPool2x2 with clamp (matches load_enc1_avg in WGSL) + avg = np.zeros((qH, qW, BN_IN), dtype=np.float32) + for qy in range(qH): + for qx in range(qW): + s = np.zeros(BN_IN, dtype=np.float32) + for dy in range(2): + for dx in range(2): + hy = min(qy * 2 + dy, hH - 1) + hx = min(qx * 2 + dx, hW - 1) + s += enc1[hy, hx, :] + avg[qy, qx, :] = s * 0.25 + + # 1x1 conv (no spatial loop, just channel dot-product) + out = np.zeros((qH, qW, BN_OUT), dtype=np.float32) + for o in range(BN_OUT): + bias = get_w(w, wo, BN_OUT * BN_IN + o) + s = np.full((qH, qW), bias, dtype=np.float32) + for i in range(BN_IN): + wv = get_w(w, wo, o * BN_IN + i) + s += wv * avg[:, :, i] + out[:, :, o] = np.maximum(0.0, s) + + return np.float16(out).astype(np.float32) # pack2x16float boundary + + +def dec1_forward(bn, enc1, w, gamma, beta): + """ + NearestUp2x(bn) + cat(enc1_skip) → Conv(16->4, 3x3, zero-pad) + FiLM + ReLU + → rgba16float (half-res). + bn: (qH, qW, 8) f32 — quarter-res bottleneck + enc1: (hH, hW, 8) f32 — half-res skip connection + """ + hH, hW = enc1.shape[:2] + qH, qW = bn.shape[:2] + wo = DEC1_OFFSET + + # Build 16-channel input: [nearest_up(bn), enc1_skip], zero-padded for 3x3 + # load_dec1_concat: if OOB → zeros; otherwise nearest_up + enc1 + fp = np.zeros((hH + 2, hW + 2, DEC1_IN), dtype=np.float32) + for hy in range(hH): + for hx in range(hW): + qy = min(hy // 2, qH - 1) + qx = min(hx // 2, qW - 1) + fp[hy + 1, hx + 1, :] = np.concatenate([bn[qy, qx, :], enc1[hy, hx, :]]) + + out = np.zeros((hH, hW, DEC1_OUT), dtype=np.float32) + for o in range(DEC1_OUT): + bias = get_w(w, wo, DEC1_OUT * DEC1_IN * 9 + o) + s = np.full((hH, hW), bias, dtype=np.float32) + for i in range(DEC1_IN): + for ky in range(3): + for kx in range(3): + wv = get_w(w, wo, o * DEC1_IN * 9 + i * 9 + ky * 3 + kx) + s += wv * fp[ky:ky+hH, kx:kx+hW, i] + out[:, :, o] = np.maximum(0.0, gamma[o] * s + beta[o]) + + return np.float16(out).astype(np.float32) # rgba16float boundary + + +def dec0_forward(dec1, enc0, w, gamma, beta): + """ + NearestUp2x(dec1) + cat(enc0_skip) → Conv(8->4, 3x3, zero-pad) + FiLM + ReLU + sigmoid + → rgba16float (full-res, final output). + dec1: (hH, hW, 4) f32 — half-res + enc0: (H, W, 4) f32 — full-res enc0 skip + """ + H, W = enc0.shape[:2] + hH, hW = dec1.shape[:2] + wo = DEC0_OFFSET + + # Build 8-channel input: [nearest_up(dec1), enc0_skip], zero-padded + fp = np.zeros((H + 2, W + 2, DEC0_IN), dtype=np.float32) + for y in range(H): + for x in range(W): + hy = min(y // 2, hH - 1) + hx = min(x // 2, hW - 1) + fp[y + 1, x + 1, :] = np.concatenate([dec1[hy, hx, :], enc0[y, x, :]]) + + out = np.zeros((H, W, DEC0_OUT), dtype=np.float32) + for o in range(DEC0_OUT): + bias = get_w(w, wo, DEC0_OUT * DEC0_IN * 9 + o) + s = np.full((H, W), bias, dtype=np.float32) + for i in range(DEC0_IN): + for ky in range(3): + for kx in range(3): + wv = get_w(w, wo, o * DEC0_IN * 9 + i * 9 + ky * 3 + kx) + s += wv * fp[ky:ky+H, kx:kx+W, i] + # FiLM + ReLU + sigmoid (matches WGSL dec0 shader) + v = np.maximum(0.0, gamma[o] * s + beta[o]) + out[:, :, o] = 1.0 / (1.0 + np.exp(-v.astype(np.float64))).astype(np.float32) + + return np.float16(out).astype(np.float32) # rgba16float boundary + + +def forward_pass(feat0, feat1, w_f32, film): + """Full U-Net forward pass. film is a dict of gamma/beta arrays.""" + enc0 = enc0_forward(feat0, feat1, w_f32, + film['enc0_gamma'], film['enc0_beta']) + enc1 = enc1_forward(enc0, w_f32, + film['enc1_gamma_lo'], film['enc1_gamma_hi'], + film['enc1_beta_lo'], film['enc1_beta_hi']) + bn = bottleneck_forward(enc1, w_f32) + dc1 = dec1_forward(bn, enc1, w_f32, film['dec1_gamma'], film['dec1_beta']) + dc0 = dec0_forward(dc1, enc0, w_f32, film['dec0_gamma'], film['dec0_beta']) + return dc0 + + +def identity_film(): + return { + 'enc0_gamma': np.ones(ENC0_OUT, dtype=np.float32), + 'enc0_beta': np.zeros(ENC0_OUT, dtype=np.float32), + 'enc1_gamma_lo': np.ones(4, dtype=np.float32), + 'enc1_gamma_hi': np.ones(4, dtype=np.float32), + 'enc1_beta_lo': np.zeros(4, dtype=np.float32), + 'enc1_beta_hi': np.zeros(4, dtype=np.float32), + 'dec1_gamma': np.ones(DEC1_OUT, dtype=np.float32), + 'dec1_beta': np.zeros(DEC1_OUT, dtype=np.float32), + 'dec0_gamma': np.ones(DEC0_OUT, dtype=np.float32), + 'dec0_beta': np.zeros(DEC0_OUT, dtype=np.float32), + } + + +# --------------------------------------------------------------------------- +# Self-test: zero weights → output must be exactly 0.5 +# --------------------------------------------------------------------------- + +def test_zero_weights(): + H, W = 8, 8 + w = np.zeros(TOTAL_F16, dtype=np.float32) + feat0 = np.zeros((H, W, 8), dtype=np.float32) + feat1 = np.zeros((H, W, 12), dtype=np.float32) + out = forward_pass(feat0, feat1, w, identity_film()) + max_err = float(np.max(np.abs(out - 0.5))) + ok = max_err < 1e-5 + print(f"[test_zero_weights] max_err={max_err:.2e} {'OK' if ok else 'FAIL'}", + file=sys.stderr) + return ok + + +# --------------------------------------------------------------------------- +# Test vector generation and C header emission +# --------------------------------------------------------------------------- + +def pack_feat0_rgba32uint(feat0_f32, H, W): + """Pack (H*W, 8) f16-precision values as H*W*4 u32 (pack2x16float layout).""" + f16 = np.float16(feat0_f32.reshape(H * W, 8)) + u16 = f16.view(np.uint16) # (H*W, 8) u16 + u32 = np.zeros((H * W, 4), dtype=np.uint32) + for j in range(4): + u32[:, j] = u16[:, j*2].astype(np.uint32) | (u16[:, j*2+1].astype(np.uint32) << 16) + return u32.flatten() # H*W*4 u32 + + +def pack_feat1_rgba32uint(feat1_u8, H, W): + """Pack (H*W, 12) u8 values as H*W*4 u32 (pack4x8unorm, 4th u32 = 0).""" + u8 = feat1_u8.reshape(H * W, 12) + u32 = np.zeros((H * W, 4), dtype=np.uint32) + for j in range(3): + for b in range(4): + u32[:, j] |= u8[:, j*4+b].astype(np.uint32) << (b * 8) + return u32.flatten() # H*W*4 u32 + + +def pack_weights_u32(w_f16): + """Pack flat f16 array as u32 pairs matching WGSL get_w() layout.""" + # Pad to even count + if len(w_f16) % 2: + w_f16 = np.append(w_f16, np.float16(0)) + u16 = w_f16.view(np.uint16) + u32 = u16[::2].astype(np.uint32) | (u16[1::2].astype(np.uint32) << 16) + return u32 + + +def generate_vectors(W=8, H=8, seed=42): + rng = np.random.default_rng(seed) + + # Random f16 weights (small range to avoid NaN/Inf cascading) + w_f16 = rng.uniform(-0.3, 0.3, TOTAL_F16).astype(np.float16) + w_f32 = w_f16.astype(np.float32) + + # Random feat0: 8 f16-precision channels + feat0_f16 = rng.uniform(0.0, 1.0, (H, W, 8)).astype(np.float16) + feat0 = feat0_f16.astype(np.float32) + + # Random feat1: 12 u8 channels (unpacked as unorm [0,1]) + feat1_u8 = rng.integers(0, 256, (H, W, 12), dtype=np.uint8) + feat1 = feat1_u8.astype(np.float32) / 255.0 + + film = identity_film() + enc0 = enc0_forward(feat0, feat1, w_f32, + film['enc0_gamma'], film['enc0_beta']) + enc1 = enc1_forward(enc0, w_f32, + film['enc1_gamma_lo'], film['enc1_gamma_hi'], + film['enc1_beta_lo'], film['enc1_beta_hi']) + bn = bottleneck_forward(enc1, w_f32) + dc1 = dec1_forward(bn, enc1, w_f32, film['dec1_gamma'], film['dec1_beta']) + out = dec0_forward(dc1, enc0, w_f32, film['dec0_gamma'], film['dec0_beta']) + + feat0_u32 = pack_feat0_rgba32uint(feat0, H, W) + feat1_u32 = pack_feat1_rgba32uint(feat1_u8, H, W) + w_u32 = pack_weights_u32(w_f16) + enc0_u16 = np.float16(enc0.reshape(-1)).view(np.uint16) + # dec1 is half-res (hH x hW x 4); store as-is + dc1_u16 = np.float16(dc1.reshape(-1)).view(np.uint16) + out_u16 = np.float16(out.reshape(-1)).view(np.uint16) # raw f16 bits + + return { + 'W': W, 'H': H, 'seed': seed, + 'feat0_u32': feat0_u32, + 'feat1_u32': feat1_u32, + 'w_u32': w_u32, + 'enc0_u16': enc0_u16, + 'dc1_u16': dc1_u16, + 'out_u16': out_u16, + 'out_f32': out.reshape(-1), + } + + +def emit_c_header(v): + lines = [] + lines.append("// Auto-generated by cnn_v3/training/gen_test_vectors.py") + lines.append(f"// Seed={v['seed']} W={v['W']} H={v['H']}") + lines.append("// DO NOT EDIT — regenerate with gen_test_vectors.py --header") + lines.append("#pragma once") + lines.append("#include <cstdint>") + lines.append("") + lines.append(f"static const int kCnnV3TestW = {v['W']};") + lines.append(f"static const int kCnnV3TestH = {v['H']};") + lines.append("") + + def array_u32(name, data): + lines.append(f"// {len(data)} u32 values") + lines.append(f"static const uint32_t {name}[{len(data)}] = {{") + row = [] + for i, x in enumerate(data): + row.append(f"0x{int(x):08x}u") + if len(row) == 8 or i == len(data) - 1: + lines.append(" " + ", ".join(row) + ",") + row = [] + lines.append("};") + lines.append("") + + def array_u16(name, data): + lines.append(f"// {len(data)} uint16 values (raw f16 bits)") + lines.append(f"static const uint16_t {name}[{len(data)}] = {{") + row = [] + for i, x in enumerate(data): + row.append(f"0x{int(x):04x}u") + if len(row) == 8 or i == len(data) - 1: + lines.append(" " + ", ".join(row) + ",") + row = [] + lines.append("};") + lines.append("") + + array_u32("kCnnV3TestFeat0U32", v['feat0_u32']) + array_u32("kCnnV3TestFeat1U32", v['feat1_u32']) + array_u32("kCnnV3TestWeightsU32", v['w_u32']) + array_u16("kCnnV3ExpectedEnc0U16", v['enc0_u16']) + lines.append(f"// kCnnV3Dec1HW = (W/2) x (H/2) = {v['W']//2} x {v['H']//2}") + array_u16("kCnnV3ExpectedDec1U16", v['dc1_u16']) + array_u16("kCnnV3ExpectedOutputU16", v['out_u16']) + return "\n".join(lines) + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main(): + parser = argparse.ArgumentParser(description="CNN v3 parity test vector generator") + parser.add_argument('--header', action='store_true', + help='Emit C header to stdout') + parser.add_argument('--W', type=int, default=8) + parser.add_argument('--H', type=int, default=8) + parser.add_argument('--seed', type=int, default=42) + args = parser.parse_args() + + # Send self-test output to stderr so --header stdout stays clean + import io + log = sys.stderr if args.header else sys.stdout + + ok = test_zero_weights() + if not ok: + sys.exit(1) + + if args.header: + v = generate_vectors(args.W, args.H, args.seed) + print(emit_c_header(v)) # C header → stdout only + print("All checks passed.", file=log) + else: + v = generate_vectors(args.W, args.H, args.seed) + out = v['out_f32'] + print(f"[gen_test_vectors] W={args.W} H={args.H} seed={args.seed}") + print(f" output range: [{float(out.min()):.4f}, {float(out.max()):.4f}]") + print(f" output mean: {float(out.mean()):.4f}") + print(" Run with --header to emit C header for C++ parity test.") + print("All checks passed.") + + +if __name__ == '__main__': + main() diff --git a/doc/COMPLETED.md b/doc/COMPLETED.md index 6868484..7b925f0 100644 --- a/doc/COMPLETED.md +++ b/doc/COMPLETED.md @@ -36,6 +36,8 @@ Completed task archive. See `doc/archive/` for detailed historical documents. ## March 2026 +- [x] **CNN v3 Phase 5: Parity validation** — `test_cnn_v3_parity.cc` (2 tests: zero_weights, random_weights). Root cause: intermediate nodes declared at full res instead of W/2, W/4. Fix: `NodeRegistry::default_width()/default_height()` getters + fractional resolution in `declare_nodes()`. Final max_err=4.88e-4 ✓. 36/36 tests. + - [x] **CNN v3 Phase 4: C++ CNNv3Effect + FiLM uniform upload** — `cnn_v3/src/cnn_v3_effect.{h,cc}`. 5 compute passes (enc0→enc1→bottleneck→dec1→dec0), shared f16 weights buffer, per-pass uniform buffers, `set_film_params()` API. Key fix: WGSL `vec3u` has align=16, so `CnnV3Params4ch`=64B and `CnnV3ParamsEnc1`=96B (not 48/80). Weight offsets as explicit formulas. FiLM γ/β identity defaults; real values await `train_cnn_v3.py`. 35/35 tests. - [x] **NTSC post-process effect** — Fisheye distortion + NTSC scan-line simulation as `WgslEffect` thin wrappers. Common logic in `render/ntsc_common` snippet (`sample_ntsc_signal` hook). Two variants: `ntsc_rgb.wgsl` (RGB→YIQ internally, `Ntsc`) and `ntsc_yiq.wgsl` (YIQ passthrough, `NtscYiq`, for RotatingCube output). Files: `src/effects/ntsc_rgb.wgsl`, `ntsc_yiq.wgsl`, `src/shaders/render/ntsc_common.wgsl`, `ntsc_effect.h`. Tests: 36/36. diff --git a/src/gpu/sequence.h b/src/gpu/sequence.h index 4592082..04482fb 100644 --- a/src/gpu/sequence.h +++ b/src/gpu/sequence.h @@ -71,6 +71,9 @@ class NodeRegistry { void set_external_view(const std::string& name, WGPUTextureView view); + int default_width() const { return default_width_; } + int default_height() const { return default_height_; } + private: WGPUDevice device_; int default_width_; diff --git a/src/tests/gpu/test_cnn_v3_parity.cc b/src/tests/gpu/test_cnn_v3_parity.cc new file mode 100644 index 0000000..608decb --- /dev/null +++ b/src/tests/gpu/test_cnn_v3_parity.cc @@ -0,0 +1,370 @@ +// CNN v3 parity test: validates WGSL shaders against Python reference. +// Two checks: +// 1. Zero-weight test (deterministic): output must be sigmoid(0) = 0.5 +// 2. Random-weight test: output must match Python-generated test vectors +// (within 1/255 per pixel) + +#include "../common/webgpu_test_fixture.h" +#include "cnn_v3/src/cnn_v3_effect.h" +#include "gpu/sequence.h" +#include "../../cnn_v3/test_vectors.h" + +#include <cassert> +#include <cmath> +#include <cstdio> +#include <vector> + +// --------------------------------------------------------------------------- +// fp16 decode (matches GPU read) +// --------------------------------------------------------------------------- + +static float fp16_bits_to_f32(uint16_t h) { + uint32_t sign = (h & 0x8000u) << 16; + uint32_t exp = (h & 0x7C00u) >> 10; + uint32_t mant = (h & 0x03FFu); + if (exp == 0 && mant == 0) { + float r; uint32_t b = sign; __builtin_memcpy(&r, &b, 4); return r; + } + if (exp == 31) { + uint32_t b = sign | 0x7F800000u | (mant << 13); + float r; __builtin_memcpy(&r, &b, 4); return r; + } + uint32_t b = sign | ((exp + 112) << 23) | (mant << 13); + float r; __builtin_memcpy(&r, &b, 4); return r; +} + +// --------------------------------------------------------------------------- +// Raw RGBA16Float readback → flat array of f32 (one per channel per pixel) +// --------------------------------------------------------------------------- + +struct MapState { bool done = false; WGPUMapAsyncStatus status; }; + +static std::vector<float> readback_rgba16float(WGPUDevice device, + WGPUQueue queue, + WGPUTexture tex, + int W, int H) { + const uint32_t bytes_per_px = 8; // 4 × f16 + const uint32_t unaligned_bpr = (uint32_t)(W * bytes_per_px); + const uint32_t aligned_bpr = ((unaligned_bpr + 255u) / 256u) * 256u; + const size_t buf_size = aligned_bpr * (size_t)H; + + WGPUBufferDescriptor bd = {}; + bd.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead; + bd.size = buf_size; + WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &bd); + + WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(device, nullptr); + WGPUTexelCopyTextureInfo src = {}; + src.texture = tex; + WGPUTexelCopyBufferInfo dst = {}; + dst.buffer = staging; + dst.layout.bytesPerRow = aligned_bpr; + dst.layout.rowsPerImage = (uint32_t)H; + WGPUExtent3D extent = { (uint32_t)W, (uint32_t)H, 1 }; + wgpuCommandEncoderCopyTextureToBuffer(enc, &src, &dst, &extent); + WGPUCommandBuffer cmds = wgpuCommandEncoderFinish(enc, nullptr); + wgpuQueueSubmit(queue, 1, &cmds); + wgpuCommandBufferRelease(cmds); + wgpuCommandEncoderRelease(enc); + wgpuDevicePoll(device, true, nullptr); + + MapState ms = {}; + WGPUBufferMapCallbackInfo mi = {}; + mi.mode = WGPUCallbackMode_AllowProcessEvents; + mi.callback = [](WGPUMapAsyncStatus s, WGPUStringView, void* u, void*) { + auto* st = (MapState*)u; + st->status = s; st->done = true; + }; + mi.userdata1 = &ms; + wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buf_size, mi); + for (int i = 0; i < 100 && !ms.done; ++i) + wgpuDevicePoll(device, true, nullptr); + + std::vector<float> result(W * H * 4, 0.0f); + if (ms.done && ms.status == WGPUMapAsyncStatus_Success) { + const uint8_t* mapped = (const uint8_t*)wgpuBufferGetConstMappedRange( + staging, 0, buf_size); + if (mapped) { + for (int y = 0; y < H; ++y) { + const uint16_t* row = + (const uint16_t*)(mapped + (size_t)y * aligned_bpr); + for (int x = 0; x < W; ++x) { + for (int c = 0; c < 4; ++c) { + result[(y * W + x) * 4 + c] = + fp16_bits_to_f32(row[x * 4 + c]); + } + } + } + } + } + wgpuBufferUnmap(staging); + wgpuBufferRelease(staging); + return result; +} + +// --------------------------------------------------------------------------- +// Helper: create rgba32uint texture with TextureBinding | CopyDst +// --------------------------------------------------------------------------- + +static WGPUTexture make_feat_tex(WGPUDevice dev, int W, int H) { + WGPUTextureDescriptor d = {}; + d.format = WGPUTextureFormat_RGBA32Uint; + d.usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst; + d.dimension = WGPUTextureDimension_2D; + d.size = { (uint32_t)W, (uint32_t)H, 1 }; + d.mipLevelCount = 1; + d.sampleCount = 1; + return wgpuDeviceCreateTexture(dev, &d); +} + +static WGPUTexture make_output_tex(WGPUDevice dev, int W, int H) { + WGPUTextureDescriptor d = {}; + d.format = WGPUTextureFormat_RGBA16Float; + d.usage = WGPUTextureUsage_StorageBinding | WGPUTextureUsage_CopySrc; + d.dimension = WGPUTextureDimension_2D; + d.size = { (uint32_t)W, (uint32_t)H, 1 }; + d.mipLevelCount = 1; + d.sampleCount = 1; + return wgpuDeviceCreateTexture(dev, &d); +} + +static WGPUTextureView make_view(WGPUTexture tex, WGPUTextureFormat fmt) { + WGPUTextureViewDescriptor d = {}; + d.format = fmt; + d.dimension = WGPUTextureViewDimension_2D; + d.mipLevelCount = 1; + d.arrayLayerCount = 1; + return wgpuTextureCreateView(tex, &d); +} + +// --------------------------------------------------------------------------- +// Run one CNN v3 forward pass and return output pixels +// --------------------------------------------------------------------------- + +static std::vector<float> run_cnn_v3(WebGPUTestFixture& fixture, + int W, int H, + const uint32_t* feat0_u32, // W*H*4 + const uint32_t* feat1_u32, // W*H*4 + const uint32_t* weights_u32, // (TOTAL_F16+1)/2 + uint32_t weights_bytes, + std::vector<float>* enc0_out = nullptr, + std::vector<float>* dec1_out = nullptr) { + GpuContext ctx = fixture.ctx(); + + // Create input textures manually (with CopyDst for upload) + WGPUTexture feat0_tex = make_feat_tex(ctx.device, W, H); + WGPUTexture feat1_tex = make_feat_tex(ctx.device, W, H); + WGPUTexture out_tex = make_output_tex(ctx.device, W, H); + + WGPUTextureView feat0_view = + make_view(feat0_tex, WGPUTextureFormat_RGBA32Uint); + WGPUTextureView feat1_view = + make_view(feat1_tex, WGPUTextureFormat_RGBA32Uint); + WGPUTextureView out_view = + make_view(out_tex, WGPUTextureFormat_RGBA16Float); + + // Upload feat texture data + auto upload_tex = [&](WGPUTexture tex, const uint32_t* data) { + WGPUTexelCopyTextureInfo dst_tex = {}; + dst_tex.texture = tex; + WGPUTexelCopyBufferLayout layout = {}; + layout.bytesPerRow = (uint32_t)(W * 16); // 4 u32 per pixel + layout.rowsPerImage = (uint32_t)H; + WGPUExtent3D ext = { (uint32_t)W, (uint32_t)H, 1 }; + wgpuQueueWriteTexture(ctx.queue, &dst_tex, data, + (size_t)(W * H * 16), &layout, &ext); + }; + upload_tex(feat0_tex, feat0_u32); + upload_tex(feat1_tex, feat1_u32); + + // Wire into NodeRegistry via external views + NodeRegistry registry(ctx.device, W, H); + registry.set_external_view("feat0", feat0_view); + registry.set_external_view("feat1", feat1_view); + registry.set_external_view("cnn3_out", out_view); + + CNNv3Effect effect(ctx, {"feat0", "feat1"}, {"cnn3_out"}, 0.0f, 1000.0f); + effect.declare_nodes(registry); + + if (weights_u32) { + effect.upload_weights(ctx.queue, weights_u32, weights_bytes); + } + + // Run 5 compute passes + WGPUCommandEncoder enc = + wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); + UniformsSequenceParams params = {}; + params.resolution = { (float)W, (float)H }; + params.aspect_ratio = 1.0f; + effect.render(enc, params, registry); + + WGPUCommandBuffer cmds = wgpuCommandEncoderFinish(enc, nullptr); + wgpuQueueSubmit(ctx.queue, 1, &cmds); + wgpuCommandBufferRelease(cmds); + wgpuCommandEncoderRelease(enc); + wgpuDevicePoll(ctx.device, true, nullptr); + + // Read back output + auto pixels = readback_rgba16float(ctx.device, ctx.queue, out_tex, W, H); + + // Optional: read back intermediate layers + if (enc0_out) { + WGPUTexture enc0_tex = registry.get_texture("cnn3_out_enc0"); + *enc0_out = readback_rgba16float(ctx.device, ctx.queue, enc0_tex, W, H); + } + if (dec1_out) { + WGPUTexture dec1_tex = registry.get_texture("cnn3_out_dec1"); + // dec1 is rgba16float written at half-res (W/2, H/2) — read only valid region + *dec1_out = readback_rgba16float(ctx.device, ctx.queue, dec1_tex, W / 2, H / 2); + } + + // Cleanup + wgpuTextureViewRelease(feat0_view); + wgpuTextureViewRelease(feat1_view); + wgpuTextureViewRelease(out_view); + wgpuTextureRelease(feat0_tex); + wgpuTextureRelease(feat1_tex); + wgpuTextureRelease(out_tex); + + return pixels; +} + +extern void InitShaderComposer(); + +// --------------------------------------------------------------------------- +// Test 1: zero weights → sigmoid(ReLU(0)) = 0.5 for all pixels/channels +// --------------------------------------------------------------------------- + +static int test_zero_weights() { + fprintf(stdout, " [cnn_v3_parity] test_zero_weights...\n"); + + WebGPUTestFixture fixture; + if (!fixture.init()) { + fprintf(stdout, " ⚠ WebGPU unavailable — skip\n"); + return 1; + } + InitShaderComposer(); + + const int W = 8, H = 8; + std::vector<uint32_t> feat0(W * H * 4, 0u); + std::vector<uint32_t> feat1(W * H * 4, 0u); + + auto pixels = run_cnn_v3(fixture, W, H, + feat0.data(), feat1.data(), + nullptr, 0); // null = zero weights (default) + + // Expected: sigmoid(0) = 0.5 exactly + const float expected = 0.5f; + const float tol = 1.0f / 255.0f; + float max_err = 0.0f; + for (float v : pixels) + max_err = fmaxf(max_err, fabsf(v - expected)); + + if (max_err > tol) { + fprintf(stderr, " ✗ zero_weights: max_err=%.5f > %.5f\n", max_err, tol); + return 0; + } + fprintf(stdout, " ✓ zero_weights: max_err=%.2e OK\n", max_err); + return 1; +} + +// --------------------------------------------------------------------------- +// Test 2: random weights — compare to Python reference test vectors +// --------------------------------------------------------------------------- + +static int test_random_weights() { + fprintf(stdout, " [cnn_v3_parity] test_random_weights (seed=42)...\n"); + + WebGPUTestFixture fixture; + if (!fixture.init()) { + fprintf(stdout, " ⚠ WebGPU unavailable — skip\n"); + return 1; + } + InitShaderComposer(); + + const int W = kCnnV3TestW, H = kCnnV3TestH; + const uint32_t weights_bytes = + (uint32_t)sizeof(kCnnV3TestWeightsU32); + + std::vector<float> enc0_pixels, dec1_pixels; + auto pixels = run_cnn_v3(fixture, W, H, + kCnnV3TestFeat0U32, kCnnV3TestFeat1U32, + kCnnV3TestWeightsU32, weights_bytes, + &enc0_pixels, &dec1_pixels); + + // Check enc0 layer first + const float tol = 1.0f / 255.0f; + float enc0_max_err = 0.0f; + int enc0_worst = -1; + for (int i = 0; i < W * H * 4; ++i) { + float ref = fp16_bits_to_f32(kCnnV3ExpectedEnc0U16[i]); + float err = fabsf(enc0_pixels[i] - ref); + if (err > enc0_max_err) { enc0_max_err = err; enc0_worst = i; } + } + if (enc0_max_err > tol) { + int px = enc0_worst / 4, ch = enc0_worst % 4; + fprintf(stderr, " ✗ enc0 mismatch: max_err=%.5f > %.5f at px=%d ch=%d" + " gpu=%.5f ref=%.5f\n", + enc0_max_err, tol, px, ch, + enc0_pixels[enc0_worst], + fp16_bits_to_f32(kCnnV3ExpectedEnc0U16[enc0_worst])); + } else { + fprintf(stdout, " ✓ enc0: max_err=%.2e OK\n", enc0_max_err); + } + + // Check dec1 layer (half-res: W/2 x H/2 x 4) + float dec1_max_err = 0.0f; + int dec1_worst = -1; + int dec1_n = (W / 2) * (H / 2) * 4; + for (int i = 0; i < dec1_n; ++i) { + float ref = fp16_bits_to_f32(kCnnV3ExpectedDec1U16[i]); + float err = fabsf(dec1_pixels[i] - ref); + if (err > dec1_max_err) { dec1_max_err = err; dec1_worst = i; } + } + if (dec1_max_err > tol) { + int px = dec1_worst / 4, ch = dec1_worst % 4; + fprintf(stderr, " ✗ dec1 mismatch: max_err=%.5f > %.5f at px=%d ch=%d" + " gpu=%.5f ref=%.5f\n", + dec1_max_err, tol, px, ch, + dec1_pixels[dec1_worst], + fp16_bits_to_f32(kCnnV3ExpectedDec1U16[dec1_worst])); + } else { + fprintf(stdout, " ✓ dec1: max_err=%.2e OK\n", dec1_max_err); + } + + // Compare final output with Python reference (1/255 tolerance) + float max_err = 0.0f; + int worst = -1; + int n = W * H * 4; + for (int i = 0; i < n; ++i) { + float ref = fp16_bits_to_f32(kCnnV3ExpectedOutputU16[i]); + float err = fabsf(pixels[i] - ref); + if (err > max_err) { max_err = err; worst = i; } + } + + if (max_err > tol) { + int px = worst / 4, ch = worst % 4; + fprintf(stderr, " ✗ random_weights: max_err=%.5f > %.5f at px=%d ch=%d" + " gpu=%.5f ref=%.5f\n", + max_err, tol, px, ch, + pixels[worst], + fp16_bits_to_f32(kCnnV3ExpectedOutputU16[worst])); + return 0; + } + fprintf(stdout, " ✓ random_weights: max_err=%.2e OK\n", max_err); + return 1; +} + +// --------------------------------------------------------------------------- +// Main +// --------------------------------------------------------------------------- + +int main() { + int pass = 0, total = 0; + + ++total; pass += test_zero_weights(); + ++total; pass += test_random_weights(); + + fprintf(stdout, "\nCNN v3 parity: %d/%d passed\n", pass, total); + return (pass == total) ? 0 : 1; +} |
