diff options
Diffstat (limited to 'src/tests/gpu/test_cnn_v3_parity.cc')
| -rw-r--r-- | src/tests/gpu/test_cnn_v3_parity.cc | 370 |
1 files changed, 370 insertions, 0 deletions
diff --git a/src/tests/gpu/test_cnn_v3_parity.cc b/src/tests/gpu/test_cnn_v3_parity.cc new file mode 100644 index 0000000..608decb --- /dev/null +++ b/src/tests/gpu/test_cnn_v3_parity.cc @@ -0,0 +1,370 @@ +// CNN v3 parity test: validates WGSL shaders against Python reference. +// Two checks: +// 1. Zero-weight test (deterministic): output must be sigmoid(0) = 0.5 +// 2. Random-weight test: output must match Python-generated test vectors +// (within 1/255 per pixel) + +#include "../common/webgpu_test_fixture.h" +#include "cnn_v3/src/cnn_v3_effect.h" +#include "gpu/sequence.h" +#include "../../cnn_v3/test_vectors.h" + +#include <cassert> +#include <cmath> +#include <cstdio> +#include <vector> + +// --------------------------------------------------------------------------- +// fp16 decode (matches GPU read) +// --------------------------------------------------------------------------- + +static float fp16_bits_to_f32(uint16_t h) { + uint32_t sign = (h & 0x8000u) << 16; + uint32_t exp = (h & 0x7C00u) >> 10; + uint32_t mant = (h & 0x03FFu); + if (exp == 0 && mant == 0) { + float r; uint32_t b = sign; __builtin_memcpy(&r, &b, 4); return r; + } + if (exp == 31) { + uint32_t b = sign | 0x7F800000u | (mant << 13); + float r; __builtin_memcpy(&r, &b, 4); return r; + } + uint32_t b = sign | ((exp + 112) << 23) | (mant << 13); + float r; __builtin_memcpy(&r, &b, 4); return r; +} + +// --------------------------------------------------------------------------- +// Raw RGBA16Float readback → flat array of f32 (one per channel per pixel) +// --------------------------------------------------------------------------- + +struct MapState { bool done = false; WGPUMapAsyncStatus status; }; + +static std::vector<float> readback_rgba16float(WGPUDevice device, + WGPUQueue queue, + WGPUTexture tex, + int W, int H) { + const uint32_t bytes_per_px = 8; // 4 × f16 + const uint32_t unaligned_bpr = (uint32_t)(W * bytes_per_px); + const uint32_t aligned_bpr = ((unaligned_bpr + 255u) / 256u) * 256u; + const size_t buf_size = aligned_bpr * (size_t)H; + + WGPUBufferDescriptor bd = {}; + bd.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead; + bd.size = buf_size; + WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &bd); + + WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(device, nullptr); + WGPUTexelCopyTextureInfo src = {}; + src.texture = tex; + WGPUTexelCopyBufferInfo dst = {}; + dst.buffer = staging; + dst.layout.bytesPerRow = aligned_bpr; + dst.layout.rowsPerImage = (uint32_t)H; + WGPUExtent3D extent = { (uint32_t)W, (uint32_t)H, 1 }; + wgpuCommandEncoderCopyTextureToBuffer(enc, &src, &dst, &extent); + WGPUCommandBuffer cmds = wgpuCommandEncoderFinish(enc, nullptr); + wgpuQueueSubmit(queue, 1, &cmds); + wgpuCommandBufferRelease(cmds); + wgpuCommandEncoderRelease(enc); + wgpuDevicePoll(device, true, nullptr); + + MapState ms = {}; + WGPUBufferMapCallbackInfo mi = {}; + mi.mode = WGPUCallbackMode_AllowProcessEvents; + mi.callback = [](WGPUMapAsyncStatus s, WGPUStringView, void* u, void*) { + auto* st = (MapState*)u; + st->status = s; st->done = true; + }; + mi.userdata1 = &ms; + wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buf_size, mi); + for (int i = 0; i < 100 && !ms.done; ++i) + wgpuDevicePoll(device, true, nullptr); + + std::vector<float> result(W * H * 4, 0.0f); + if (ms.done && ms.status == WGPUMapAsyncStatus_Success) { + const uint8_t* mapped = (const uint8_t*)wgpuBufferGetConstMappedRange( + staging, 0, buf_size); + if (mapped) { + for (int y = 0; y < H; ++y) { + const uint16_t* row = + (const uint16_t*)(mapped + (size_t)y * aligned_bpr); + for (int x = 0; x < W; ++x) { + for (int c = 0; c < 4; ++c) { + result[(y * W + x) * 4 + c] = + fp16_bits_to_f32(row[x * 4 + c]); + } + } + } + } + } + wgpuBufferUnmap(staging); + wgpuBufferRelease(staging); + return result; +} + +// --------------------------------------------------------------------------- +// Helper: create rgba32uint texture with TextureBinding | CopyDst +// --------------------------------------------------------------------------- + +static WGPUTexture make_feat_tex(WGPUDevice dev, int W, int H) { + WGPUTextureDescriptor d = {}; + d.format = WGPUTextureFormat_RGBA32Uint; + d.usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst; + d.dimension = WGPUTextureDimension_2D; + d.size = { (uint32_t)W, (uint32_t)H, 1 }; + d.mipLevelCount = 1; + d.sampleCount = 1; + return wgpuDeviceCreateTexture(dev, &d); +} + +static WGPUTexture make_output_tex(WGPUDevice dev, int W, int H) { + WGPUTextureDescriptor d = {}; + d.format = WGPUTextureFormat_RGBA16Float; + d.usage = WGPUTextureUsage_StorageBinding | WGPUTextureUsage_CopySrc; + d.dimension = WGPUTextureDimension_2D; + d.size = { (uint32_t)W, (uint32_t)H, 1 }; + d.mipLevelCount = 1; + d.sampleCount = 1; + return wgpuDeviceCreateTexture(dev, &d); +} + +static WGPUTextureView make_view(WGPUTexture tex, WGPUTextureFormat fmt) { + WGPUTextureViewDescriptor d = {}; + d.format = fmt; + d.dimension = WGPUTextureViewDimension_2D; + d.mipLevelCount = 1; + d.arrayLayerCount = 1; + return wgpuTextureCreateView(tex, &d); +} + +// --------------------------------------------------------------------------- +// Run one CNN v3 forward pass and return output pixels +// --------------------------------------------------------------------------- + +static std::vector<float> run_cnn_v3(WebGPUTestFixture& fixture, + int W, int H, + const uint32_t* feat0_u32, // W*H*4 + const uint32_t* feat1_u32, // W*H*4 + const uint32_t* weights_u32, // (TOTAL_F16+1)/2 + uint32_t weights_bytes, + std::vector<float>* enc0_out = nullptr, + std::vector<float>* dec1_out = nullptr) { + GpuContext ctx = fixture.ctx(); + + // Create input textures manually (with CopyDst for upload) + WGPUTexture feat0_tex = make_feat_tex(ctx.device, W, H); + WGPUTexture feat1_tex = make_feat_tex(ctx.device, W, H); + WGPUTexture out_tex = make_output_tex(ctx.device, W, H); + + WGPUTextureView feat0_view = + make_view(feat0_tex, WGPUTextureFormat_RGBA32Uint); + WGPUTextureView feat1_view = + make_view(feat1_tex, WGPUTextureFormat_RGBA32Uint); + WGPUTextureView out_view = + make_view(out_tex, WGPUTextureFormat_RGBA16Float); + + // Upload feat texture data + auto upload_tex = [&](WGPUTexture tex, const uint32_t* data) { + WGPUTexelCopyTextureInfo dst_tex = {}; + dst_tex.texture = tex; + WGPUTexelCopyBufferLayout layout = {}; + layout.bytesPerRow = (uint32_t)(W * 16); // 4 u32 per pixel + layout.rowsPerImage = (uint32_t)H; + WGPUExtent3D ext = { (uint32_t)W, (uint32_t)H, 1 }; + wgpuQueueWriteTexture(ctx.queue, &dst_tex, data, + (size_t)(W * H * 16), &layout, &ext); + }; + upload_tex(feat0_tex, feat0_u32); + upload_tex(feat1_tex, feat1_u32); + + // Wire into NodeRegistry via external views + NodeRegistry registry(ctx.device, W, H); + registry.set_external_view("feat0", feat0_view); + registry.set_external_view("feat1", feat1_view); + registry.set_external_view("cnn3_out", out_view); + + CNNv3Effect effect(ctx, {"feat0", "feat1"}, {"cnn3_out"}, 0.0f, 1000.0f); + effect.declare_nodes(registry); + + if (weights_u32) { + effect.upload_weights(ctx.queue, weights_u32, weights_bytes); + } + + // Run 5 compute passes + WGPUCommandEncoder enc = + wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); + UniformsSequenceParams params = {}; + params.resolution = { (float)W, (float)H }; + params.aspect_ratio = 1.0f; + effect.render(enc, params, registry); + + WGPUCommandBuffer cmds = wgpuCommandEncoderFinish(enc, nullptr); + wgpuQueueSubmit(ctx.queue, 1, &cmds); + wgpuCommandBufferRelease(cmds); + wgpuCommandEncoderRelease(enc); + wgpuDevicePoll(ctx.device, true, nullptr); + + // Read back output + auto pixels = readback_rgba16float(ctx.device, ctx.queue, out_tex, W, H); + + // Optional: read back intermediate layers + if (enc0_out) { + WGPUTexture enc0_tex = registry.get_texture("cnn3_out_enc0"); + *enc0_out = readback_rgba16float(ctx.device, ctx.queue, enc0_tex, W, H); + } + if (dec1_out) { + WGPUTexture dec1_tex = registry.get_texture("cnn3_out_dec1"); + // dec1 is rgba16float written at half-res (W/2, H/2) — read only valid region + *dec1_out = readback_rgba16float(ctx.device, ctx.queue, dec1_tex, W / 2, H / 2); + } + + // Cleanup + wgpuTextureViewRelease(feat0_view); + wgpuTextureViewRelease(feat1_view); + wgpuTextureViewRelease(out_view); + wgpuTextureRelease(feat0_tex); + wgpuTextureRelease(feat1_tex); + wgpuTextureRelease(out_tex); + + return pixels; +} + +extern void InitShaderComposer(); + +// --------------------------------------------------------------------------- +// Test 1: zero weights → sigmoid(ReLU(0)) = 0.5 for all pixels/channels +// --------------------------------------------------------------------------- + +static int test_zero_weights() { + fprintf(stdout, " [cnn_v3_parity] test_zero_weights...\n"); + + WebGPUTestFixture fixture; + if (!fixture.init()) { + fprintf(stdout, " ⚠ WebGPU unavailable — skip\n"); + return 1; + } + InitShaderComposer(); + + const int W = 8, H = 8; + std::vector<uint32_t> feat0(W * H * 4, 0u); + std::vector<uint32_t> feat1(W * H * 4, 0u); + + auto pixels = run_cnn_v3(fixture, W, H, + feat0.data(), feat1.data(), + nullptr, 0); // null = zero weights (default) + + // Expected: sigmoid(0) = 0.5 exactly + const float expected = 0.5f; + const float tol = 1.0f / 255.0f; + float max_err = 0.0f; + for (float v : pixels) + max_err = fmaxf(max_err, fabsf(v - expected)); + + if (max_err > tol) { + fprintf(stderr, " ✗ zero_weights: max_err=%.5f > %.5f\n", max_err, tol); + return 0; + } + fprintf(stdout, " ✓ zero_weights: max_err=%.2e OK\n", max_err); + return 1; +} + +// --------------------------------------------------------------------------- +// Test 2: random weights — compare to Python reference test vectors +// --------------------------------------------------------------------------- + +static int test_random_weights() { + fprintf(stdout, " [cnn_v3_parity] test_random_weights (seed=42)...\n"); + + WebGPUTestFixture fixture; + if (!fixture.init()) { + fprintf(stdout, " ⚠ WebGPU unavailable — skip\n"); + return 1; + } + InitShaderComposer(); + + const int W = kCnnV3TestW, H = kCnnV3TestH; + const uint32_t weights_bytes = + (uint32_t)sizeof(kCnnV3TestWeightsU32); + + std::vector<float> enc0_pixels, dec1_pixels; + auto pixels = run_cnn_v3(fixture, W, H, + kCnnV3TestFeat0U32, kCnnV3TestFeat1U32, + kCnnV3TestWeightsU32, weights_bytes, + &enc0_pixels, &dec1_pixels); + + // Check enc0 layer first + const float tol = 1.0f / 255.0f; + float enc0_max_err = 0.0f; + int enc0_worst = -1; + for (int i = 0; i < W * H * 4; ++i) { + float ref = fp16_bits_to_f32(kCnnV3ExpectedEnc0U16[i]); + float err = fabsf(enc0_pixels[i] - ref); + if (err > enc0_max_err) { enc0_max_err = err; enc0_worst = i; } + } + if (enc0_max_err > tol) { + int px = enc0_worst / 4, ch = enc0_worst % 4; + fprintf(stderr, " ✗ enc0 mismatch: max_err=%.5f > %.5f at px=%d ch=%d" + " gpu=%.5f ref=%.5f\n", + enc0_max_err, tol, px, ch, + enc0_pixels[enc0_worst], + fp16_bits_to_f32(kCnnV3ExpectedEnc0U16[enc0_worst])); + } else { + fprintf(stdout, " ✓ enc0: max_err=%.2e OK\n", enc0_max_err); + } + + // Check dec1 layer (half-res: W/2 x H/2 x 4) + float dec1_max_err = 0.0f; + int dec1_worst = -1; + int dec1_n = (W / 2) * (H / 2) * 4; + for (int i = 0; i < dec1_n; ++i) { + float ref = fp16_bits_to_f32(kCnnV3ExpectedDec1U16[i]); + float err = fabsf(dec1_pixels[i] - ref); + if (err > dec1_max_err) { dec1_max_err = err; dec1_worst = i; } + } + if (dec1_max_err > tol) { + int px = dec1_worst / 4, ch = dec1_worst % 4; + fprintf(stderr, " ✗ dec1 mismatch: max_err=%.5f > %.5f at px=%d ch=%d" + " gpu=%.5f ref=%.5f\n", + dec1_max_err, tol, px, ch, + dec1_pixels[dec1_worst], + fp16_bits_to_f32(kCnnV3ExpectedDec1U16[dec1_worst])); + } else { + fprintf(stdout, " ✓ dec1: max_err=%.2e OK\n", dec1_max_err); + } + + // Compare final output with Python reference (1/255 tolerance) + float max_err = 0.0f; + int worst = -1; + int n = W * H * 4; + for (int i = 0; i < n; ++i) { + float ref = fp16_bits_to_f32(kCnnV3ExpectedOutputU16[i]); + float err = fabsf(pixels[i] - ref); + if (err > max_err) { max_err = err; worst = i; } + } + + if (max_err > tol) { + int px = worst / 4, ch = worst % 4; + fprintf(stderr, " ✗ random_weights: max_err=%.5f > %.5f at px=%d ch=%d" + " gpu=%.5f ref=%.5f\n", + max_err, tol, px, ch, + pixels[worst], + fp16_bits_to_f32(kCnnV3ExpectedOutputU16[worst])); + return 0; + } + fprintf(stdout, " ✓ random_weights: max_err=%.2e OK\n", max_err); + return 1; +} + +// --------------------------------------------------------------------------- +// Main +// --------------------------------------------------------------------------- + +int main() { + int pass = 0, total = 0; + + ++total; pass += test_zero_weights(); + ++total; pass += test_random_weights(); + + fprintf(stdout, "\nCNN v3 parity: %d/%d passed\n", pass, total); + return (pass == total) ? 0 : 1; +} |
