summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--  src/gpu/sequence.h                   |   3
-rw-r--r--  src/tests/gpu/test_cnn_v3_parity.cc  | 370
2 files changed, 373 insertions, 0 deletions
diff --git a/src/gpu/sequence.h b/src/gpu/sequence.h
index 4592082..04482fb 100644
--- a/src/gpu/sequence.h
+++ b/src/gpu/sequence.h
@@ -71,6 +71,9 @@ class NodeRegistry {
void set_external_view(const std::string& name, WGPUTextureView view);
+ int default_width() const { return default_width_; }
+ int default_height() const { return default_height_; }
+
private:
WGPUDevice device_;
int default_width_;
diff --git a/src/tests/gpu/test_cnn_v3_parity.cc b/src/tests/gpu/test_cnn_v3_parity.cc
new file mode 100644
index 0000000..608decb
--- /dev/null
+++ b/src/tests/gpu/test_cnn_v3_parity.cc
@@ -0,0 +1,370 @@
+// CNN v3 parity test: validates WGSL shaders against Python reference.
+// Two checks:
+// 1. Zero-weight test (deterministic): output must be sigmoid(0) = 0.5
+// 2. Random-weight test: output must match Python-generated test vectors
+// (within 1/255 per pixel)
+
+#include "../common/webgpu_test_fixture.h"
+#include "cnn_v3/src/cnn_v3_effect.h"
+#include "gpu/sequence.h"
+#include "../../cnn_v3/test_vectors.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <vector>
+
+// ---------------------------------------------------------------------------
+// fp16 decode (matches GPU read)
+// ---------------------------------------------------------------------------
+
// Decodes an IEEE-754 binary16 bit pattern to float (matches GPU read).
// Handles signed zero, subnormals, normals, and Inf/NaN.
static float fp16_bits_to_f32(uint16_t h) {
  const uint32_t sign = (h & 0x8000u) << 16;
  const uint32_t exp = (h & 0x7C00u) >> 10;
  const uint32_t mant = (h & 0x03FFu);
  if (exp == 0) {
    // Zero and subnormals: value = mant * 2^-24 (no implicit leading bit).
    // Previously subnormals (mant != 0) fell through to the normal-number
    // path, which re-biased the exponent as if a leading 1 were present
    // and produced values roughly 512x too large.
    float r = std::ldexp((float)mant, -24);
    return sign ? -r : r; // preserves -0.0f when mant == 0
  }
  if (exp == 31) {
    // Inf (mant == 0) or NaN (mant != 0): widen payload into f32 slot.
    uint32_t b = sign | 0x7F800000u | (mant << 13);
    float r; __builtin_memcpy(&r, &b, 4); return r;
  }
  // Normal number: re-bias exponent from 15 to 127 (+112), widen mantissa.
  uint32_t b = sign | ((exp + 112) << 23) | (mant << 13);
  float r; __builtin_memcpy(&r, &b, 4); return r;
}
+
+// ---------------------------------------------------------------------------
+// Raw RGBA16Float readback → flat array of f32 (one per channel per pixel)
+// ---------------------------------------------------------------------------
+
// Completion flag + result for the async buffer-map callback.
// `status` is only read after the callback has set `done`.
struct MapState { bool done = false; WGPUMapAsyncStatus status; };
+
+static std::vector<float> readback_rgba16float(WGPUDevice device,
+ WGPUQueue queue,
+ WGPUTexture tex,
+ int W, int H) {
+ const uint32_t bytes_per_px = 8; // 4 × f16
+ const uint32_t unaligned_bpr = (uint32_t)(W * bytes_per_px);
+ const uint32_t aligned_bpr = ((unaligned_bpr + 255u) / 256u) * 256u;
+ const size_t buf_size = aligned_bpr * (size_t)H;
+
+ WGPUBufferDescriptor bd = {};
+ bd.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead;
+ bd.size = buf_size;
+ WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &bd);
+
+ WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(device, nullptr);
+ WGPUTexelCopyTextureInfo src = {};
+ src.texture = tex;
+ WGPUTexelCopyBufferInfo dst = {};
+ dst.buffer = staging;
+ dst.layout.bytesPerRow = aligned_bpr;
+ dst.layout.rowsPerImage = (uint32_t)H;
+ WGPUExtent3D extent = { (uint32_t)W, (uint32_t)H, 1 };
+ wgpuCommandEncoderCopyTextureToBuffer(enc, &src, &dst, &extent);
+ WGPUCommandBuffer cmds = wgpuCommandEncoderFinish(enc, nullptr);
+ wgpuQueueSubmit(queue, 1, &cmds);
+ wgpuCommandBufferRelease(cmds);
+ wgpuCommandEncoderRelease(enc);
+ wgpuDevicePoll(device, true, nullptr);
+
+ MapState ms = {};
+ WGPUBufferMapCallbackInfo mi = {};
+ mi.mode = WGPUCallbackMode_AllowProcessEvents;
+ mi.callback = [](WGPUMapAsyncStatus s, WGPUStringView, void* u, void*) {
+ auto* st = (MapState*)u;
+ st->status = s; st->done = true;
+ };
+ mi.userdata1 = &ms;
+ wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buf_size, mi);
+ for (int i = 0; i < 100 && !ms.done; ++i)
+ wgpuDevicePoll(device, true, nullptr);
+
+ std::vector<float> result(W * H * 4, 0.0f);
+ if (ms.done && ms.status == WGPUMapAsyncStatus_Success) {
+ const uint8_t* mapped = (const uint8_t*)wgpuBufferGetConstMappedRange(
+ staging, 0, buf_size);
+ if (mapped) {
+ for (int y = 0; y < H; ++y) {
+ const uint16_t* row =
+ (const uint16_t*)(mapped + (size_t)y * aligned_bpr);
+ for (int x = 0; x < W; ++x) {
+ for (int c = 0; c < 4; ++c) {
+ result[(y * W + x) * 4 + c] =
+ fp16_bits_to_f32(row[x * 4 + c]);
+ }
+ }
+ }
+ }
+ }
+ wgpuBufferUnmap(staging);
+ wgpuBufferRelease(staging);
+ return result;
+}
+
+// ---------------------------------------------------------------------------
+// Helper: create rgba32uint texture with TextureBinding | CopyDst
+// ---------------------------------------------------------------------------
+
+static WGPUTexture make_feat_tex(WGPUDevice dev, int W, int H) {
+ WGPUTextureDescriptor d = {};
+ d.format = WGPUTextureFormat_RGBA32Uint;
+ d.usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst;
+ d.dimension = WGPUTextureDimension_2D;
+ d.size = { (uint32_t)W, (uint32_t)H, 1 };
+ d.mipLevelCount = 1;
+ d.sampleCount = 1;
+ return wgpuDeviceCreateTexture(dev, &d);
+}
+
+static WGPUTexture make_output_tex(WGPUDevice dev, int W, int H) {
+ WGPUTextureDescriptor d = {};
+ d.format = WGPUTextureFormat_RGBA16Float;
+ d.usage = WGPUTextureUsage_StorageBinding | WGPUTextureUsage_CopySrc;
+ d.dimension = WGPUTextureDimension_2D;
+ d.size = { (uint32_t)W, (uint32_t)H, 1 };
+ d.mipLevelCount = 1;
+ d.sampleCount = 1;
+ return wgpuDeviceCreateTexture(dev, &d);
+}
+
+static WGPUTextureView make_view(WGPUTexture tex, WGPUTextureFormat fmt) {
+ WGPUTextureViewDescriptor d = {};
+ d.format = fmt;
+ d.dimension = WGPUTextureViewDimension_2D;
+ d.mipLevelCount = 1;
+ d.arrayLayerCount = 1;
+ return wgpuTextureCreateView(tex, &d);
+}
+
+// ---------------------------------------------------------------------------
+// Run one CNN v3 forward pass and return output pixels
+// ---------------------------------------------------------------------------
+
// Runs one CNN v3 forward pass on the GPU and returns the final
// rgba16float output decoded to W*H*4 floats (RGBA interleaved).
//
// feat0_u32 / feat1_u32: packed rgba32uint input planes, W*H*4 u32 each.
// weights_u32 / weights_bytes: packed f16 weight blob; pass nullptr to
//   keep the effect's default weights (the zero-weight test relies on this).
// enc0_out / dec1_out: optional readback of intermediate layers; dec1 is
//   read at half resolution (W/2 x H/2).
static std::vector<float> run_cnn_v3(WebGPUTestFixture& fixture,
                                     int W, int H,
                                     const uint32_t* feat0_u32, // W*H*4
                                     const uint32_t* feat1_u32, // W*H*4
                                     const uint32_t* weights_u32, // (TOTAL_F16+1)/2
                                     uint32_t weights_bytes,
                                     std::vector<float>* enc0_out = nullptr,
                                     std::vector<float>* dec1_out = nullptr) {
  GpuContext ctx = fixture.ctx();

  // Create input textures manually (with CopyDst for upload)
  WGPUTexture feat0_tex = make_feat_tex(ctx.device, W, H);
  WGPUTexture feat1_tex = make_feat_tex(ctx.device, W, H);
  WGPUTexture out_tex = make_output_tex(ctx.device, W, H);

  WGPUTextureView feat0_view =
      make_view(feat0_tex, WGPUTextureFormat_RGBA32Uint);
  WGPUTextureView feat1_view =
      make_view(feat1_tex, WGPUTextureFormat_RGBA32Uint);
  WGPUTextureView out_view =
      make_view(out_tex, WGPUTextureFormat_RGBA16Float);

  // Upload feat texture data (4 u32 = 16 bytes per pixel)
  auto upload_tex = [&](WGPUTexture tex, const uint32_t* data) {
    WGPUTexelCopyTextureInfo dst_tex = {};
    dst_tex.texture = tex;
    WGPUTexelCopyBufferLayout layout = {};
    layout.bytesPerRow = (uint32_t)(W * 16); // 4 u32 per pixel
    layout.rowsPerImage = (uint32_t)H;
    WGPUExtent3D ext = { (uint32_t)W, (uint32_t)H, 1 };
    wgpuQueueWriteTexture(ctx.queue, &dst_tex, data,
                          (size_t)(W * H * 16), &layout, &ext);
  };
  upload_tex(feat0_tex, feat0_u32);
  upload_tex(feat1_tex, feat1_u32);

  // Wire into NodeRegistry via external views; registered before
  // declare_nodes so the effect uses our textures.
  NodeRegistry registry(ctx.device, W, H);
  registry.set_external_view("feat0", feat0_view);
  registry.set_external_view("feat1", feat1_view);
  registry.set_external_view("cnn3_out", out_view);

  CNNv3Effect effect(ctx, {"feat0", "feat1"}, {"cnn3_out"}, 0.0f, 1000.0f);
  effect.declare_nodes(registry);

  if (weights_u32) {
    effect.upload_weights(ctx.queue, weights_u32, weights_bytes);
  }

  // Run 5 compute passes
  WGPUCommandEncoder enc =
      wgpuDeviceCreateCommandEncoder(ctx.device, nullptr);
  UniformsSequenceParams params = {};
  params.resolution = { (float)W, (float)H };
  params.aspect_ratio = 1.0f;
  effect.render(enc, params, registry);

  WGPUCommandBuffer cmds = wgpuCommandEncoderFinish(enc, nullptr);
  wgpuQueueSubmit(ctx.queue, 1, &cmds);
  wgpuCommandBufferRelease(cmds);
  wgpuCommandEncoderRelease(enc);
  wgpuDevicePoll(ctx.device, true, nullptr);

  // Read back output
  auto pixels = readback_rgba16float(ctx.device, ctx.queue, out_tex, W, H);

  // Optional: read back intermediate layers. NOTE(review): these textures
  // are not released here — presumably owned by the registry; confirm.
  if (enc0_out) {
    WGPUTexture enc0_tex = registry.get_texture("cnn3_out_enc0");
    *enc0_out = readback_rgba16float(ctx.device, ctx.queue, enc0_tex, W, H);
  }
  if (dec1_out) {
    WGPUTexture dec1_tex = registry.get_texture("cnn3_out_dec1");
    // dec1 is rgba16float written at half-res (W/2, H/2) — read only valid region
    *dec1_out = readback_rgba16float(ctx.device, ctx.queue, dec1_tex, W / 2, H / 2);
  }

  // Cleanup
  wgpuTextureViewRelease(feat0_view);
  wgpuTextureViewRelease(feat1_view);
  wgpuTextureViewRelease(out_view);
  wgpuTextureRelease(feat0_tex);
  wgpuTextureRelease(feat1_tex);
  wgpuTextureRelease(out_tex);

  return pixels;
}
+
+extern void InitShaderComposer();
+
+// ---------------------------------------------------------------------------
+// Test 1: zero weights → sigmoid(ReLU(0)) = 0.5 for all pixels/channels
+// ---------------------------------------------------------------------------
+
+static int test_zero_weights() {
+ fprintf(stdout, " [cnn_v3_parity] test_zero_weights...\n");
+
+ WebGPUTestFixture fixture;
+ if (!fixture.init()) {
+ fprintf(stdout, " ⚠ WebGPU unavailable — skip\n");
+ return 1;
+ }
+ InitShaderComposer();
+
+ const int W = 8, H = 8;
+ std::vector<uint32_t> feat0(W * H * 4, 0u);
+ std::vector<uint32_t> feat1(W * H * 4, 0u);
+
+ auto pixels = run_cnn_v3(fixture, W, H,
+ feat0.data(), feat1.data(),
+ nullptr, 0); // null = zero weights (default)
+
+ // Expected: sigmoid(0) = 0.5 exactly
+ const float expected = 0.5f;
+ const float tol = 1.0f / 255.0f;
+ float max_err = 0.0f;
+ for (float v : pixels)
+ max_err = fmaxf(max_err, fabsf(v - expected));
+
+ if (max_err > tol) {
+ fprintf(stderr, " ✗ zero_weights: max_err=%.5f > %.5f\n", max_err, tol);
+ return 0;
+ }
+ fprintf(stdout, " ✓ zero_weights: max_err=%.2e OK\n", max_err);
+ return 1;
+}
+
+// ---------------------------------------------------------------------------
+// Test 2: random weights — compare to Python reference test vectors
+// ---------------------------------------------------------------------------
+
+static int test_random_weights() {
+ fprintf(stdout, " [cnn_v3_parity] test_random_weights (seed=42)...\n");
+
+ WebGPUTestFixture fixture;
+ if (!fixture.init()) {
+ fprintf(stdout, " ⚠ WebGPU unavailable — skip\n");
+ return 1;
+ }
+ InitShaderComposer();
+
+ const int W = kCnnV3TestW, H = kCnnV3TestH;
+ const uint32_t weights_bytes =
+ (uint32_t)sizeof(kCnnV3TestWeightsU32);
+
+ std::vector<float> enc0_pixels, dec1_pixels;
+ auto pixels = run_cnn_v3(fixture, W, H,
+ kCnnV3TestFeat0U32, kCnnV3TestFeat1U32,
+ kCnnV3TestWeightsU32, weights_bytes,
+ &enc0_pixels, &dec1_pixels);
+
+ // Check enc0 layer first
+ const float tol = 1.0f / 255.0f;
+ float enc0_max_err = 0.0f;
+ int enc0_worst = -1;
+ for (int i = 0; i < W * H * 4; ++i) {
+ float ref = fp16_bits_to_f32(kCnnV3ExpectedEnc0U16[i]);
+ float err = fabsf(enc0_pixels[i] - ref);
+ if (err > enc0_max_err) { enc0_max_err = err; enc0_worst = i; }
+ }
+ if (enc0_max_err > tol) {
+ int px = enc0_worst / 4, ch = enc0_worst % 4;
+ fprintf(stderr, " ✗ enc0 mismatch: max_err=%.5f > %.5f at px=%d ch=%d"
+ " gpu=%.5f ref=%.5f\n",
+ enc0_max_err, tol, px, ch,
+ enc0_pixels[enc0_worst],
+ fp16_bits_to_f32(kCnnV3ExpectedEnc0U16[enc0_worst]));
+ } else {
+ fprintf(stdout, " ✓ enc0: max_err=%.2e OK\n", enc0_max_err);
+ }
+
+ // Check dec1 layer (half-res: W/2 x H/2 x 4)
+ float dec1_max_err = 0.0f;
+ int dec1_worst = -1;
+ int dec1_n = (W / 2) * (H / 2) * 4;
+ for (int i = 0; i < dec1_n; ++i) {
+ float ref = fp16_bits_to_f32(kCnnV3ExpectedDec1U16[i]);
+ float err = fabsf(dec1_pixels[i] - ref);
+ if (err > dec1_max_err) { dec1_max_err = err; dec1_worst = i; }
+ }
+ if (dec1_max_err > tol) {
+ int px = dec1_worst / 4, ch = dec1_worst % 4;
+ fprintf(stderr, " ✗ dec1 mismatch: max_err=%.5f > %.5f at px=%d ch=%d"
+ " gpu=%.5f ref=%.5f\n",
+ dec1_max_err, tol, px, ch,
+ dec1_pixels[dec1_worst],
+ fp16_bits_to_f32(kCnnV3ExpectedDec1U16[dec1_worst]));
+ } else {
+ fprintf(stdout, " ✓ dec1: max_err=%.2e OK\n", dec1_max_err);
+ }
+
+ // Compare final output with Python reference (1/255 tolerance)
+ float max_err = 0.0f;
+ int worst = -1;
+ int n = W * H * 4;
+ for (int i = 0; i < n; ++i) {
+ float ref = fp16_bits_to_f32(kCnnV3ExpectedOutputU16[i]);
+ float err = fabsf(pixels[i] - ref);
+ if (err > max_err) { max_err = err; worst = i; }
+ }
+
+ if (max_err > tol) {
+ int px = worst / 4, ch = worst % 4;
+ fprintf(stderr, " ✗ random_weights: max_err=%.5f > %.5f at px=%d ch=%d"
+ " gpu=%.5f ref=%.5f\n",
+ max_err, tol, px, ch,
+ pixels[worst],
+ fp16_bits_to_f32(kCnnV3ExpectedOutputU16[worst]));
+ return 0;
+ }
+ fprintf(stdout, " ✓ random_weights: max_err=%.2e OK\n", max_err);
+ return 1;
+}
+
+// ---------------------------------------------------------------------------
+// Main
+// ---------------------------------------------------------------------------
+
+int main() {
+ int pass = 0, total = 0;
+
+ ++total; pass += test_zero_weights();
+ ++total; pass += test_random_weights();
+
+ fprintf(stdout, "\nCNN v3 parity: %d/%d passed\n", pass, total);
+ return (pass == total) ? 0 : 1;
+}