// CNN v3 parity test: validates WGSL shaders against Python reference. // Two checks: // 1. Zero-weight test (deterministic): output must be sigmoid(0) = 0.5 // 2. Random-weight test: output must match Python-generated test vectors // (within 1/255 per pixel) #include "../common/webgpu_test_fixture.h" #include "cnn_v3/src/cnn_v3_effect.h" #include "gpu/sequence.h" #include "../../cnn_v3/test_vectors.h" #include #include #include #include // --------------------------------------------------------------------------- // fp16 decode (matches GPU read) // --------------------------------------------------------------------------- static float fp16_bits_to_f32(uint16_t h) { uint32_t sign = (h & 0x8000u) << 16; uint32_t exp = (h & 0x7C00u) >> 10; uint32_t mant = (h & 0x03FFu); if (exp == 0 && mant == 0) { float r; uint32_t b = sign; __builtin_memcpy(&r, &b, 4); return r; } if (exp == 31) { uint32_t b = sign | 0x7F800000u | (mant << 13); float r; __builtin_memcpy(&r, &b, 4); return r; } uint32_t b = sign | ((exp + 112) << 23) | (mant << 13); float r; __builtin_memcpy(&r, &b, 4); return r; } // --------------------------------------------------------------------------- // Raw RGBA16Float readback → flat array of f32 (one per channel per pixel) // --------------------------------------------------------------------------- struct MapState { bool done = false; WGPUMapAsyncStatus status; }; static std::vector readback_rgba16float(WGPUDevice device, WGPUQueue queue, WGPUTexture tex, int W, int H) { const uint32_t bytes_per_px = 8; // 4 × f16 const uint32_t unaligned_bpr = (uint32_t)(W * bytes_per_px); const uint32_t aligned_bpr = ((unaligned_bpr + 255u) / 256u) * 256u; const size_t buf_size = aligned_bpr * (size_t)H; WGPUBufferDescriptor bd = {}; bd.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead; bd.size = buf_size; WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &bd); WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(device, nullptr); WGPUTexelCopyTextureInfo src = {}; src.texture = tex; WGPUTexelCopyBufferInfo dst = {}; dst.buffer = staging; dst.layout.bytesPerRow = aligned_bpr; dst.layout.rowsPerImage = (uint32_t)H; WGPUExtent3D extent = { (uint32_t)W, (uint32_t)H, 1 }; wgpuCommandEncoderCopyTextureToBuffer(enc, &src, &dst, &extent); WGPUCommandBuffer cmds = wgpuCommandEncoderFinish(enc, nullptr); wgpuQueueSubmit(queue, 1, &cmds); wgpuCommandBufferRelease(cmds); wgpuCommandEncoderRelease(enc); wgpuDevicePoll(device, true, nullptr); MapState ms = {}; WGPUBufferMapCallbackInfo mi = {}; mi.mode = WGPUCallbackMode_AllowProcessEvents; mi.callback = [](WGPUMapAsyncStatus s, WGPUStringView, void* u, void*) { auto* st = (MapState*)u; st->status = s; st->done = true; }; mi.userdata1 = &ms; wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buf_size, mi); for (int i = 0; i < 100 && !ms.done; ++i) wgpuDevicePoll(device, true, nullptr); std::vector result(W * H * 4, 0.0f); if (ms.done && ms.status == WGPUMapAsyncStatus_Success) { const uint8_t* mapped = (const uint8_t*)wgpuBufferGetConstMappedRange( staging, 0, buf_size); if (mapped) { for (int y = 0; y < H; ++y) { const uint16_t* row = (const uint16_t*)(mapped + (size_t)y * aligned_bpr); for (int x = 0; x < W; ++x) { for (int c = 0; c < 4; ++c) { result[(y * W + x) * 4 + c] = fp16_bits_to_f32(row[x * 4 + c]); } } } } } wgpuBufferUnmap(staging); wgpuBufferRelease(staging); return result; } // --------------------------------------------------------------------------- // Helper: create rgba32uint texture with TextureBinding | CopyDst // --------------------------------------------------------------------------- static WGPUTexture make_feat_tex(WGPUDevice dev, int W, int H) { WGPUTextureDescriptor d = {}; d.format = WGPUTextureFormat_RGBA32Uint; d.usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst; d.dimension = WGPUTextureDimension_2D; d.size = { (uint32_t)W, (uint32_t)H, 1 }; d.mipLevelCount = 1; d.sampleCount = 1; return wgpuDeviceCreateTexture(dev, &d); } static WGPUTexture make_output_tex(WGPUDevice dev, int W, int H) { WGPUTextureDescriptor d = {}; d.format = WGPUTextureFormat_RGBA16Float; d.usage = WGPUTextureUsage_StorageBinding | WGPUTextureUsage_CopySrc; d.dimension = WGPUTextureDimension_2D; d.size = { (uint32_t)W, (uint32_t)H, 1 }; d.mipLevelCount = 1; d.sampleCount = 1; return wgpuDeviceCreateTexture(dev, &d); } static WGPUTextureView make_view(WGPUTexture tex, WGPUTextureFormat fmt) { WGPUTextureViewDescriptor d = {}; d.format = fmt; d.dimension = WGPUTextureViewDimension_2D; d.mipLevelCount = 1; d.arrayLayerCount = 1; return wgpuTextureCreateView(tex, &d); } // --------------------------------------------------------------------------- // Run one CNN v3 forward pass and return output pixels // --------------------------------------------------------------------------- static std::vector run_cnn_v3(WebGPUTestFixture& fixture, int W, int H, const uint32_t* feat0_u32, // W*H*4 const uint32_t* feat1_u32, // W*H*4 const uint32_t* weights_u32, // (TOTAL_F16+1)/2 uint32_t weights_bytes, std::vector* enc0_out = nullptr, std::vector* dec1_out = nullptr) { GpuContext ctx = fixture.ctx(); // Create input textures manually (with CopyDst for upload) WGPUTexture feat0_tex = make_feat_tex(ctx.device, W, H); WGPUTexture feat1_tex = make_feat_tex(ctx.device, W, H); WGPUTexture out_tex = make_output_tex(ctx.device, W, H); WGPUTextureView feat0_view = make_view(feat0_tex, WGPUTextureFormat_RGBA32Uint); WGPUTextureView feat1_view = make_view(feat1_tex, WGPUTextureFormat_RGBA32Uint); WGPUTextureView out_view = make_view(out_tex, WGPUTextureFormat_RGBA16Float); // Upload feat texture data auto upload_tex = [&](WGPUTexture tex, const uint32_t* data) { WGPUTexelCopyTextureInfo dst_tex = {}; dst_tex.texture = tex; WGPUTexelCopyBufferLayout layout = {}; layout.bytesPerRow = (uint32_t)(W * 16); // 4 u32 per pixel layout.rowsPerImage = (uint32_t)H; WGPUExtent3D ext = { (uint32_t)W, (uint32_t)H, 1 }; wgpuQueueWriteTexture(ctx.queue, &dst_tex, data, (size_t)(W * H * 16), &layout, &ext); }; upload_tex(feat0_tex, feat0_u32); upload_tex(feat1_tex, feat1_u32); // Wire into NodeRegistry via external views NodeRegistry registry(ctx.device, W, H); registry.set_external_view("feat0", feat0_view); registry.set_external_view("feat1", feat1_view); registry.set_external_view("cnn3_out", out_view); CNNv3Effect effect(ctx, {"feat0", "feat1"}, {"cnn3_out"}, 0.0f, 1000.0f); effect.declare_nodes(registry); if (weights_u32) { effect.upload_weights(ctx.queue, weights_u32, weights_bytes); } // Run 5 compute passes WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); UniformsSequenceParams params = {}; params.resolution = { (float)W, (float)H }; params.aspect_ratio = 1.0f; effect.render(enc, params, registry); WGPUCommandBuffer cmds = wgpuCommandEncoderFinish(enc, nullptr); wgpuQueueSubmit(ctx.queue, 1, &cmds); wgpuCommandBufferRelease(cmds); wgpuCommandEncoderRelease(enc); wgpuDevicePoll(ctx.device, true, nullptr); // Read back output auto pixels = readback_rgba16float(ctx.device, ctx.queue, out_tex, W, H); // Optional: read back intermediate layers if (enc0_out) { WGPUTexture enc0_tex = registry.get_texture("cnn3_out_enc0"); *enc0_out = readback_rgba16float(ctx.device, ctx.queue, enc0_tex, W, H); } if (dec1_out) { WGPUTexture dec1_tex = registry.get_texture("cnn3_out_dec1"); // dec1 is rgba16float written at half-res (W/2, H/2) — read only valid region *dec1_out = readback_rgba16float(ctx.device, ctx.queue, dec1_tex, W / 2, H / 2); } // Cleanup wgpuTextureViewRelease(feat0_view); wgpuTextureViewRelease(feat1_view); wgpuTextureViewRelease(out_view); wgpuTextureRelease(feat0_tex); wgpuTextureRelease(feat1_tex); wgpuTextureRelease(out_tex); return pixels; } extern void InitShaderComposer(); // --------------------------------------------------------------------------- // Test 1: zero weights → sigmoid(ReLU(0)) = 0.5 for all pixels/channels // --------------------------------------------------------------------------- static int test_zero_weights() { fprintf(stdout, " [cnn_v3_parity] test_zero_weights...\n"); WebGPUTestFixture fixture; if (!fixture.init()) { fprintf(stdout, " ⚠ WebGPU unavailable — skip\n"); return 1; } InitShaderComposer(); const int W = 8, H = 8; std::vector feat0(W * H * 4, 0u); std::vector feat1(W * H * 4, 0u); auto pixels = run_cnn_v3(fixture, W, H, feat0.data(), feat1.data(), nullptr, 0); // null = zero weights (default) // Expected: sigmoid(0) = 0.5 exactly const float expected = 0.5f; const float tol = 1.0f / 255.0f; float max_err = 0.0f; for (float v : pixels) max_err = fmaxf(max_err, fabsf(v - expected)); if (max_err > tol) { fprintf(stderr, " ✗ zero_weights: max_err=%.5f > %.5f\n", max_err, tol); return 0; } fprintf(stdout, " ✓ zero_weights: max_err=%.2e OK\n", max_err); return 1; } // --------------------------------------------------------------------------- // Test 2: random weights — compare to Python reference test vectors // --------------------------------------------------------------------------- static int test_random_weights() { fprintf(stdout, " [cnn_v3_parity] test_random_weights (seed=42)...\n"); WebGPUTestFixture fixture; if (!fixture.init()) { fprintf(stdout, " ⚠ WebGPU unavailable — skip\n"); return 1; } InitShaderComposer(); const int W = kCnnV3TestW, H = kCnnV3TestH; const uint32_t weights_bytes = (uint32_t)sizeof(kCnnV3TestWeightsU32); std::vector enc0_pixels, dec1_pixels; auto pixels = run_cnn_v3(fixture, W, H, kCnnV3TestFeat0U32, kCnnV3TestFeat1U32, kCnnV3TestWeightsU32, weights_bytes, &enc0_pixels, &dec1_pixels); // Check enc0 layer first const float tol = 1.0f / 255.0f; float enc0_max_err = 0.0f; int enc0_worst = -1; for (int i = 0; i < W * H * 4; ++i) { float ref = fp16_bits_to_f32(kCnnV3ExpectedEnc0U16[i]); float err = fabsf(enc0_pixels[i] - ref); if (err > enc0_max_err) { enc0_max_err = err; enc0_worst = i; } } bool enc0_ok = (enc0_max_err <= tol); if (!enc0_ok) { int px = enc0_worst / 4, ch = enc0_worst % 4; fprintf(stderr, " ✗ enc0 mismatch: max_err=%.5f > %.5f at px=%d ch=%d" " gpu=%.5f ref=%.5f\n", enc0_max_err, tol, px, ch, enc0_pixels[enc0_worst], fp16_bits_to_f32(kCnnV3ExpectedEnc0U16[enc0_worst])); } else { fprintf(stdout, " ✓ enc0: max_err=%.2e OK\n", enc0_max_err); } // Check dec1 layer (half-res: W/2 x H/2 x 4) float dec1_max_err = 0.0f; int dec1_worst = -1; int dec1_n = (W / 2) * (H / 2) * 4; for (int i = 0; i < dec1_n; ++i) { float ref = fp16_bits_to_f32(kCnnV3ExpectedDec1U16[i]); float err = fabsf(dec1_pixels[i] - ref); if (err > dec1_max_err) { dec1_max_err = err; dec1_worst = i; } } bool dec1_ok = (dec1_max_err <= tol); if (!dec1_ok) { int px = dec1_worst / 4, ch = dec1_worst % 4; fprintf(stderr, " ✗ dec1 mismatch: max_err=%.5f > %.5f at px=%d ch=%d" " gpu=%.5f ref=%.5f\n", dec1_max_err, tol, px, ch, dec1_pixels[dec1_worst], fp16_bits_to_f32(kCnnV3ExpectedDec1U16[dec1_worst])); } else { fprintf(stdout, " ✓ dec1: max_err=%.2e OK\n", dec1_max_err); } // Compare final output with Python reference (1/255 tolerance) float max_err = 0.0f; int worst = -1; int n = W * H * 4; for (int i = 0; i < n; ++i) { float ref = fp16_bits_to_f32(kCnnV3ExpectedOutputU16[i]); float err = fabsf(pixels[i] - ref); if (err > max_err) { max_err = err; worst = i; } } bool out_ok = (max_err <= tol); if (!out_ok) { int px = worst / 4, ch = worst % 4; fprintf(stderr, " ✗ random_weights: max_err=%.5f > %.5f at px=%d ch=%d" " gpu=%.5f ref=%.5f\n", max_err, tol, px, ch, pixels[worst], fp16_bits_to_f32(kCnnV3ExpectedOutputU16[worst])); } else { fprintf(stdout, " ✓ random_weights: max_err=%.2e OK\n", max_err); } return (enc0_ok && dec1_ok && out_ok) ? 1 : 0; } // --------------------------------------------------------------------------- // Main // --------------------------------------------------------------------------- int main() { int pass = 0, total = 0; ++total; pass += test_zero_weights(); ++total; pass += test_random_weights(); fprintf(stdout, "\nCNN v3 parity: %d/%d passed\n", pass, total); return (pass == total) ? 0 : 1; }