diff options
Diffstat (limited to 'src/tests/gpu/test_cnn_v3_parity.cc')
| -rw-r--r-- | src/tests/gpu/test_cnn_v3_parity.cc | 194 |
1 files changed, 105 insertions, 89 deletions
diff --git a/src/tests/gpu/test_cnn_v3_parity.cc b/src/tests/gpu/test_cnn_v3_parity.cc index 4fada5d..9663b09 100644 --- a/src/tests/gpu/test_cnn_v3_parity.cc +++ b/src/tests/gpu/test_cnn_v3_parity.cc @@ -4,10 +4,10 @@ // 2. Random-weight test: output must match Python-generated test vectors // (within 1/255 per pixel) +#include "../../cnn_v3/test_vectors.h" #include "../common/webgpu_test_fixture.h" #include "cnn_v3/src/cnn_v3_effect.h" #include "gpu/sequence.h" -#include "../../cnn_v3/test_vectors.h" #include <cassert> #include <cmath> @@ -20,37 +20,46 @@ static float fp16_bits_to_f32(uint16_t h) { uint32_t sign = (h & 0x8000u) << 16; - uint32_t exp = (h & 0x7C00u) >> 10; + uint32_t exp = (h & 0x7C00u) >> 10; uint32_t mant = (h & 0x03FFu); if (exp == 0 && mant == 0) { - float r; uint32_t b = sign; __builtin_memcpy(&r, &b, 4); return r; + float r; + uint32_t b = sign; + __builtin_memcpy(&r, &b, 4); + return r; } if (exp == 31) { uint32_t b = sign | 0x7F800000u | (mant << 13); - float r; __builtin_memcpy(&r, &b, 4); return r; + float r; + __builtin_memcpy(&r, &b, 4); + return r; } uint32_t b = sign | ((exp + 112) << 23) | (mant << 13); - float r; __builtin_memcpy(&r, &b, 4); return r; + float r; + __builtin_memcpy(&r, &b, 4); + return r; } // --------------------------------------------------------------------------- // Raw RGBA16Float readback → flat array of f32 (one per channel per pixel) // --------------------------------------------------------------------------- -struct MapState { bool done = false; WGPUMapAsyncStatus status; }; +struct MapState { + bool done = false; + WGPUMapAsyncStatus status; +}; static std::vector<float> readback_rgba16float(WGPUDevice device, - WGPUQueue queue, - WGPUTexture tex, - int W, int H) { - const uint32_t bytes_per_px = 8; // 4 × f16 - const uint32_t unaligned_bpr = (uint32_t)(W * bytes_per_px); - const uint32_t aligned_bpr = ((unaligned_bpr + 255u) / 256u) * 256u; - const size_t buf_size = aligned_bpr * (size_t)H; + WGPUQueue queue, WGPUTexture tex, + int W, int H) { + const uint32_t bytes_per_px = 8; // 4 × f16 + const uint32_t unaligned_bpr = (uint32_t)(W * bytes_per_px); + const uint32_t aligned_bpr = ((unaligned_bpr + 255u) / 256u) * 256u; + const size_t buf_size = aligned_bpr * (size_t)H; WGPUBufferDescriptor bd = {}; bd.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead; - bd.size = buf_size; + bd.size = buf_size; WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &bd); WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(device, nullptr); @@ -58,9 +67,9 @@ static std::vector<float> readback_rgba16float(WGPUDevice device, src.texture = tex; WGPUTexelCopyBufferInfo dst = {}; dst.buffer = staging; - dst.layout.bytesPerRow = aligned_bpr; + dst.layout.bytesPerRow = aligned_bpr; dst.layout.rowsPerImage = (uint32_t)H; - WGPUExtent3D extent = { (uint32_t)W, (uint32_t)H, 1 }; + WGPUExtent3D extent = {(uint32_t)W, (uint32_t)H, 1}; wgpuCommandEncoderCopyTextureToBuffer(enc, &src, &dst, &extent); WGPUCommandBuffer cmds = wgpuCommandEncoderFinish(enc, nullptr); wgpuQueueSubmit(queue, 1, &cmds); @@ -73,7 +82,8 @@ static std::vector<float> readback_rgba16float(WGPUDevice device, mi.mode = WGPUCallbackMode_AllowProcessEvents; mi.callback = [](WGPUMapAsyncStatus s, WGPUStringView, void* u, void*) { auto* st = (MapState*)u; - st->status = s; st->done = true; + st->status = s; + st->done = true; }; mi.userdata1 = &ms; wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buf_size, mi); @@ -82,16 +92,15 @@ static std::vector<float> readback_rgba16float(WGPUDevice device, std::vector<float> result(W * H * 4, 0.0f); if (ms.done && ms.status == WGPUMapAsyncStatus_Success) { - const uint8_t* mapped = (const uint8_t*)wgpuBufferGetConstMappedRange( - staging, 0, buf_size); + const uint8_t* mapped = + (const uint8_t*)wgpuBufferGetConstMappedRange(staging, 0, buf_size); if (mapped) { for (int y = 0; y < H; ++y) { const uint16_t* row = (const uint16_t*)(mapped + (size_t)y * aligned_bpr); for (int x = 0; x < W; ++x) { for (int c = 0; c < 4; ++c) { - result[(y * W + x) * 4 + c] = - fp16_bits_to_f32(row[x * 4 + c]); + result[(y * W + x) * 4 + c] = fp16_bits_to_f32(row[x * 4 + c]); } } } @@ -107,17 +116,17 @@ static std::vector<float> readback_rgba16float(WGPUDevice device, // --------------------------------------------------------------------------- static std::vector<float> readback_rgba32uint_8ch(WGPUDevice device, - WGPUQueue queue, - WGPUTexture tex, - int W, int H) { - const uint32_t bytes_per_px = 16; // 4 × u32 - const uint32_t unaligned_bpr = (uint32_t)(W * bytes_per_px); - const uint32_t aligned_bpr = ((unaligned_bpr + 255u) / 256u) * 256u; - const size_t buf_size = aligned_bpr * (size_t)H; + WGPUQueue queue, + WGPUTexture tex, int W, + int H) { + const uint32_t bytes_per_px = 16; // 4 × u32 + const uint32_t unaligned_bpr = (uint32_t)(W * bytes_per_px); + const uint32_t aligned_bpr = ((unaligned_bpr + 255u) / 256u) * 256u; + const size_t buf_size = aligned_bpr * (size_t)H; WGPUBufferDescriptor bd = {}; bd.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead; - bd.size = buf_size; + bd.size = buf_size; WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &bd); WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(device, nullptr); @@ -125,9 +134,9 @@ static std::vector<float> readback_rgba32uint_8ch(WGPUDevice device, src.texture = tex; WGPUTexelCopyBufferInfo dst = {}; dst.buffer = staging; - dst.layout.bytesPerRow = aligned_bpr; + dst.layout.bytesPerRow = aligned_bpr; dst.layout.rowsPerImage = (uint32_t)H; - WGPUExtent3D extent = { (uint32_t)W, (uint32_t)H, 1 }; + WGPUExtent3D extent = {(uint32_t)W, (uint32_t)H, 1}; wgpuCommandEncoderCopyTextureToBuffer(enc, &src, &dst, &extent); WGPUCommandBuffer cmds = wgpuCommandEncoderFinish(enc, nullptr); wgpuQueueSubmit(queue, 1, &cmds); @@ -140,7 +149,8 @@ static std::vector<float> readback_rgba32uint_8ch(WGPUDevice device, mi.mode = WGPUCallbackMode_AllowProcessEvents; mi.callback = [](WGPUMapAsyncStatus s, WGPUStringView, void* u, void*) { auto* st = (MapState*)u; - st->status = s; st->done = true; + st->status = s; + st->done = true; }; mi.userdata1 = &ms; wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buf_size, mi); @@ -149,8 +159,8 @@ static std::vector<float> readback_rgba32uint_8ch(WGPUDevice device, std::vector<float> result(W * H * 8, 0.0f); if (ms.done && ms.status == WGPUMapAsyncStatus_Success) { - const uint8_t* mapped = (const uint8_t*)wgpuBufferGetConstMappedRange( - staging, 0, buf_size); + const uint8_t* mapped = + (const uint8_t*)wgpuBufferGetConstMappedRange(staging, 0, buf_size); if (mapped) { for (int y = 0; y < H; ++y) { const uint32_t* row = @@ -178,31 +188,31 @@ static std::vector<float> readback_rgba32uint_8ch(WGPUDevice device, static WGPUTexture make_feat_tex(WGPUDevice dev, int W, int H) { WGPUTextureDescriptor d = {}; - d.format = WGPUTextureFormat_RGBA32Uint; - d.usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst; - d.dimension = WGPUTextureDimension_2D; - d.size = { (uint32_t)W, (uint32_t)H, 1 }; + d.format = WGPUTextureFormat_RGBA32Uint; + d.usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst; + d.dimension = WGPUTextureDimension_2D; + d.size = {(uint32_t)W, (uint32_t)H, 1}; d.mipLevelCount = 1; - d.sampleCount = 1; + d.sampleCount = 1; return wgpuDeviceCreateTexture(dev, &d); } static WGPUTexture make_output_tex(WGPUDevice dev, int W, int H) { WGPUTextureDescriptor d = {}; - d.format = WGPUTextureFormat_RGBA16Float; - d.usage = WGPUTextureUsage_StorageBinding | WGPUTextureUsage_CopySrc; - d.dimension = WGPUTextureDimension_2D; - d.size = { (uint32_t)W, (uint32_t)H, 1 }; + d.format = WGPUTextureFormat_RGBA16Float; + d.usage = WGPUTextureUsage_StorageBinding | WGPUTextureUsage_CopySrc; + d.dimension = WGPUTextureDimension_2D; + d.size = {(uint32_t)W, (uint32_t)H, 1}; d.mipLevelCount = 1; - d.sampleCount = 1; + d.sampleCount = 1; return wgpuDeviceCreateTexture(dev, &d); } static WGPUTextureView make_view(WGPUTexture tex, WGPUTextureFormat fmt) { WGPUTextureViewDescriptor d = {}; - d.format = fmt; - d.dimension = WGPUTextureViewDimension_2D; - d.mipLevelCount = 1; + d.format = fmt; + d.dimension = WGPUTextureViewDimension_2D; + d.mipLevelCount = 1; d.arrayLayerCount = 1; return wgpuTextureCreateView(tex, &d); } @@ -211,38 +221,36 @@ static WGPUTextureView make_view(WGPUTexture tex, WGPUTextureFormat fmt) { // Run one CNN v3 forward pass and return output pixels // --------------------------------------------------------------------------- -static std::vector<float> run_cnn_v3(WebGPUTestFixture& fixture, - int W, int H, - const uint32_t* feat0_u32, // W*H*4 - const uint32_t* feat1_u32, // W*H*4 - const uint32_t* weights_u32, // (TOTAL_F16+1)/2 - uint32_t weights_bytes, - std::vector<float>* enc0_out = nullptr, - std::vector<float>* dec1_out = nullptr) { +static std::vector<float> +run_cnn_v3(WebGPUTestFixture& fixture, int W, int H, + const uint32_t* feat0_u32, // W*H*4 + const uint32_t* feat1_u32, // W*H*4 + const uint32_t* weights_u32, // (TOTAL_F16+1)/2 + uint32_t weights_bytes, std::vector<float>* enc0_out = nullptr, + std::vector<float>* dec1_out = nullptr) { GpuContext ctx = fixture.ctx(); // Create input textures manually (with CopyDst for upload) WGPUTexture feat0_tex = make_feat_tex(ctx.device, W, H); WGPUTexture feat1_tex = make_feat_tex(ctx.device, W, H); - WGPUTexture out_tex = make_output_tex(ctx.device, W, H); + WGPUTexture out_tex = make_output_tex(ctx.device, W, H); WGPUTextureView feat0_view = make_view(feat0_tex, WGPUTextureFormat_RGBA32Uint); WGPUTextureView feat1_view = make_view(feat1_tex, WGPUTextureFormat_RGBA32Uint); - WGPUTextureView out_view = - make_view(out_tex, WGPUTextureFormat_RGBA16Float); + WGPUTextureView out_view = make_view(out_tex, WGPUTextureFormat_RGBA16Float); // Upload feat texture data auto upload_tex = [&](WGPUTexture tex, const uint32_t* data) { WGPUTexelCopyTextureInfo dst_tex = {}; dst_tex.texture = tex; WGPUTexelCopyBufferLayout layout = {}; - layout.bytesPerRow = (uint32_t)(W * 16); // 4 u32 per pixel + layout.bytesPerRow = (uint32_t)(W * 16); // 4 u32 per pixel layout.rowsPerImage = (uint32_t)H; - WGPUExtent3D ext = { (uint32_t)W, (uint32_t)H, 1 }; - wgpuQueueWriteTexture(ctx.queue, &dst_tex, data, - (size_t)(W * H * 16), &layout, &ext); + WGPUExtent3D ext = {(uint32_t)W, (uint32_t)H, 1}; + wgpuQueueWriteTexture(ctx.queue, &dst_tex, data, (size_t)(W * H * 16), + &layout, &ext); }; upload_tex(feat0_tex, feat0_u32); upload_tex(feat1_tex, feat1_u32); @@ -267,10 +275,9 @@ static std::vector<float> run_cnn_v3(WebGPUTestFixture& fixture, } // Run 5 compute passes - WGPUCommandEncoder enc = - wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); + WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(ctx.device, nullptr); UniformsSequenceParams params = {}; - params.resolution = { (float)W, (float)H }; + params.resolution = {(float)W, (float)H}; params.aspect_ratio = 1.0f; effect.render(enc, params, registry); @@ -292,7 +299,8 @@ static std::vector<float> run_cnn_v3(WebGPUTestFixture& fixture, if (dec1_out) { // dec1 is rgba32uint, 8ch (pack2x16float), half-res WGPUTexture dec1_tex = registry.get_texture("cnn3_out_dec1"); - *dec1_out = readback_rgba32uint_8ch(ctx.device, ctx.queue, dec1_tex, W / 2, H / 2); + *dec1_out = + readback_rgba32uint_8ch(ctx.device, ctx.queue, dec1_tex, W / 2, H / 2); } // Cleanup @@ -326,9 +334,8 @@ static int test_zero_weights() { std::vector<uint32_t> feat0(W * H * 4, 0u); std::vector<uint32_t> feat1(W * H * 4, 0u); - auto pixels = run_cnn_v3(fixture, W, H, - feat0.data(), feat1.data(), - nullptr, 0); // null = zero weights (default) + auto pixels = run_cnn_v3(fixture, W, H, feat0.data(), feat1.data(), nullptr, + 0); // null = zero weights (default) // Expected: sigmoid(0) = 0.5 exactly const float expected = 0.5f; @@ -360,14 +367,12 @@ static int test_random_weights() { InitShaderComposer(); const int W = kCnnV3TestW, H = kCnnV3TestH; - const uint32_t weights_bytes = - (uint32_t)sizeof(kCnnV3TestWeightsU32); + const uint32_t weights_bytes = (uint32_t)sizeof(kCnnV3TestWeightsU32); std::vector<float> enc0_pixels, dec1_pixels; - auto pixels = run_cnn_v3(fixture, W, H, - kCnnV3TestFeat0U32, kCnnV3TestFeat1U32, - kCnnV3TestWeightsU32, weights_bytes, - &enc0_pixels, &dec1_pixels); + auto pixels = run_cnn_v3(fixture, W, H, kCnnV3TestFeat0U32, + kCnnV3TestFeat1U32, kCnnV3TestWeightsU32, + weights_bytes, &enc0_pixels, &dec1_pixels); // Check enc0 layer first (8ch, rgba32uint) const float tol = 1.0f / 255.0f; @@ -376,15 +381,18 @@ static int test_random_weights() { for (int i = 0; i < W * H * 8; ++i) { float ref = fp16_bits_to_f32(kCnnV3ExpectedEnc0U16[i]); float err = fabsf(enc0_pixels[i] - ref); - if (err > enc0_max_err) { enc0_max_err = err; enc0_worst = i; } + if (err > enc0_max_err) { + enc0_max_err = err; + enc0_worst = i; + } } bool enc0_ok = (enc0_max_err <= tol); if (!enc0_ok) { int px = enc0_worst / 8, ch = enc0_worst % 8; - fprintf(stderr, " ✗ enc0 mismatch: max_err=%.5f > %.5f at px=%d ch=%d" + fprintf(stderr, + " ✗ enc0 mismatch: max_err=%.5f > %.5f at px=%d ch=%d" " gpu=%.5f ref=%.5f\n", - enc0_max_err, tol, px, ch, - enc0_pixels[enc0_worst], + enc0_max_err, tol, px, ch, enc0_pixels[enc0_worst], fp16_bits_to_f32(kCnnV3ExpectedEnc0U16[enc0_worst])); } else { fprintf(stdout, " ✓ enc0: max_err=%.2e OK\n", enc0_max_err); @@ -397,15 +405,18 @@ static int test_random_weights() { for (int i = 0; i < dec1_n; ++i) { float ref = fp16_bits_to_f32(kCnnV3ExpectedDec1U16[i]); float err = fabsf(dec1_pixels[i] - ref); - if (err > dec1_max_err) { dec1_max_err = err; dec1_worst = i; } + if (err > dec1_max_err) { + dec1_max_err = err; + dec1_worst = i; + } } bool dec1_ok = (dec1_max_err <= tol); if (!dec1_ok) { int px = dec1_worst / 8, ch = dec1_worst % 8; - fprintf(stderr, " ✗ dec1 mismatch: max_err=%.5f > %.5f at px=%d ch=%d" + fprintf(stderr, + " ✗ dec1 mismatch: max_err=%.5f > %.5f at px=%d ch=%d" " gpu=%.5f ref=%.5f\n", - dec1_max_err, tol, px, ch, - dec1_pixels[dec1_worst], + dec1_max_err, tol, px, ch, dec1_pixels[dec1_worst], fp16_bits_to_f32(kCnnV3ExpectedDec1U16[dec1_worst])); } else { fprintf(stdout, " ✓ dec1: max_err=%.2e OK\n", dec1_max_err); @@ -418,16 +429,19 @@ static int test_random_weights() { for (int i = 0; i < n; ++i) { float ref = fp16_bits_to_f32(kCnnV3ExpectedOutputU16[i]); float err = fabsf(pixels[i] - ref); - if (err > max_err) { max_err = err; worst = i; } + if (err > max_err) { + max_err = err; + worst = i; + } } bool out_ok = (max_err <= tol); if (!out_ok) { int px = worst / 4, ch = worst % 4; - fprintf(stderr, " ✗ random_weights: max_err=%.5f > %.5f at px=%d ch=%d" + fprintf(stderr, + " ✗ random_weights: max_err=%.5f > %.5f at px=%d ch=%d" " gpu=%.5f ref=%.5f\n", - max_err, tol, px, ch, - pixels[worst], + max_err, tol, px, ch, pixels[worst], fp16_bits_to_f32(kCnnV3ExpectedOutputU16[worst])); } else { fprintf(stdout, " ✓ random_weights: max_err=%.2e OK\n", max_err); @@ -442,8 +456,10 @@ static int test_random_weights() { int main() { int pass = 0, total = 0; - ++total; pass += test_zero_weights(); - ++total; pass += test_random_weights(); + ++total; + pass += test_zero_weights(); + ++total; + pass += test_random_weights(); fprintf(stdout, "\nCNN v3 parity: %d/%d passed\n", pass, total); return (pass == total) ? 0 : 1; |
