summary | refs | log | tree | commit | diff
path: root/tools/cnn_test.cc
diff options
context:
space:
mode:
Diffstat (limited to 'tools/cnn_test.cc')
-rw-r--r-- tools/cnn_test.cc 958
1 files changed, 957 insertions, 1 deletions
diff --git a/tools/cnn_test.cc b/tools/cnn_test.cc
index c2983a9..c504c3d 100644
--- a/tools/cnn_test.cc
+++ b/tools/cnn_test.cc
@@ -28,6 +28,7 @@
#include <cstdlib>
#include <cstring>
#include <vector>
+#include <cmath>
// Helper to get asset string or empty string
static const char* SafeGetAsset(AssetId id) {
@@ -44,6 +45,9 @@ struct Args {
const char* save_intermediates = nullptr;
int num_layers = 3; // Default to 3 layers
bool debug_hex = false; // Print first 8 pixels as hex
+ int cnn_version = 1; // 1=CNNEffect, 2=CNNv2Effect
+ const char* weights_path = nullptr; // Optional .bin weights file
+ bool cnn_version_explicit = false; // Track if --cnn-version was explicitly set
};
// Parse command-line arguments
@@ -83,6 +87,15 @@ static bool parse_args(int argc, char** argv, Args* args) {
}
} else if (strcmp(argv[i], "--debug-hex") == 0) {
args->debug_hex = true;
+ } else if (strcmp(argv[i], "--cnn-version") == 0 && i + 1 < argc) {
+ args->cnn_version = atoi(argv[++i]);
+ args->cnn_version_explicit = true;
+ if (args->cnn_version < 1 || args->cnn_version > 2) {
+ fprintf(stderr, "Error: cnn-version must be 1 or 2\n");
+ return false;
+ }
+ } else if (strcmp(argv[i], "--weights") == 0 && i + 1 < argc) {
+ args->weights_path = argv[++i];
} else if (strcmp(argv[i], "--help") == 0) {
return false;
} else {
@@ -91,6 +104,21 @@ static bool parse_args(int argc, char** argv, Args* args) {
}
}
+ // Force CNN v2 when --weights is specified
+ if (args->weights_path) {
+ if (args->cnn_version_explicit && args->cnn_version != 2) {
+ fprintf(stderr, "WARNING: --cnn-version %d ignored (--weights forces CNN v2)\n",
+ args->cnn_version);
+ }
+ args->cnn_version = 2;
+
+ // Warn if --layers was specified (binary file config takes precedence)
+ if (args->num_layers != 3) { // 3 is the default
+ fprintf(stderr, "WARNING: --layers %d ignored (--weights loads layer config from .bin)\n",
+ args->num_layers);
+ }
+ }
+
return true;
}
@@ -100,9 +128,11 @@ static void print_usage(const char* prog) {
fprintf(stderr, "\nOPTIONS:\n");
fprintf(stderr, " --blend F Final blend amount (0.0-1.0, default: 1.0)\n");
fprintf(stderr, " --format ppm|png Output format (default: png)\n");
- fprintf(stderr, " --layers N Number of CNN layers (1-10, default: 3)\n");
+ fprintf(stderr, " --layers N Number of CNN layers (1-10, default: 3, ignored with --weights)\n");
fprintf(stderr, " --save-intermediates DIR Save intermediate layers to directory\n");
fprintf(stderr, " --debug-hex Print first 8 pixels as hex (debug)\n");
+ fprintf(stderr, " --cnn-version N CNN version: 1 (default) or 2 (ignored with --weights)\n");
+ fprintf(stderr, " --weights PATH Load weights from .bin (forces CNN v2, overrides layer config)\n");
fprintf(stderr, " --help Show this help\n");
}
@@ -160,6 +190,66 @@ static WGPUTexture load_texture(WGPUDevice device, WGPUQueue queue,
return texture;
}
+// Build an R32Float "depth" texture from the alpha channel of an image file.
+//
+// The image at `path` is decoded with stb_image, forced to 4 channels. When
+// the file itself has 4 channels, each alpha byte is mapped from [0, 255] to
+// [0.0, 1.0]; otherwise every texel is set to 1.0.
+//
+// Params:
+//   device, queue  - WebGPU device/queue used to create and fill the texture.
+//   path           - image file to decode.
+//   width, height  - required dimensions; the decoded image must match exactly.
+// Returns the new texture (the caller releases it), or nullptr after printing
+// an error to stderr when decoding fails, the size differs, or texture
+// creation fails.
+static WGPUTexture load_depth_from_alpha(WGPUDevice device, WGPUQueue queue,
+                                         const char* path, int width,
+                                         int height) {
+  // Zero-initialised so nothing indeterminate is ever read if decoding fails.
+  int w = 0, h = 0, channels = 0;
+  uint8_t* data = stbi_load(path, &w, &h, &channels, 4);
+  if (!data) {
+    fprintf(stderr, "Error: failed to load depth from '%s'\n", path);
+    return nullptr;
+  }
+  if (w != width || h != height) {
+    // Decoding worked; report the actual problem rather than a load failure.
+    fprintf(stderr, "Error: depth source '%s' is %dx%d, expected %dx%d\n",
+            path, w, h, width, height);
+    stbi_image_free(data);
+    return nullptr;
+  }
+
+  // width/height equal the decoded (positive) size here, so the product is
+  // computed in size_t to avoid int overflow on large images.
+  const size_t pixel_count =
+      static_cast<size_t>(width) * static_cast<size_t>(height);
+
+  // `channels` is the channel count of the file, not of the forced 4-channel
+  // buffer, so it tells us whether the alpha bytes are real.
+  const bool has_alpha = (channels == 4);
+  std::vector<float> depth_data(pixel_count);
+  for (size_t i = 0; i < pixel_count; ++i) {
+    // Alpha is byte 3 of each RGBA texel; without alpha default to 1.0.
+    depth_data[i] = has_alpha ? (data[i * 4 + 3] / 255.0f) : 1.0f;
+  }
+  stbi_image_free(data);
+
+  // Create R32Float depth texture
+  const WGPUTextureDescriptor depth_desc = {
+      .usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst,
+      .dimension = WGPUTextureDimension_2D,
+      .size = {static_cast<uint32_t>(width), static_cast<uint32_t>(height), 1},
+      .format = WGPUTextureFormat_R32Float,
+      .mipLevelCount = 1,
+      .sampleCount = 1,
+  };
+  WGPUTexture depth_texture = wgpuDeviceCreateTexture(device, &depth_desc);
+  if (!depth_texture) {
+    fprintf(stderr, "Error: failed to create depth texture\n");
+    return nullptr;
+  }
+
+  // Upload the tightly packed float rows into mip 0.
+  const WGPUTexelCopyTextureInfo dst = {
+      .texture = depth_texture,
+      .mipLevel = 0
+  };
+  const WGPUTexelCopyBufferLayout layout = {
+      .bytesPerRow = static_cast<uint32_t>(width * sizeof(float)),
+      .rowsPerImage = static_cast<uint32_t>(height)
+  };
+  const WGPUExtent3D size = {
+      static_cast<uint32_t>(width),
+      static_cast<uint32_t>(height),
+      1
+  };
+  wgpuQueueWriteTexture(queue, &dst, depth_data.data(),
+                        depth_data.size() * sizeof(float), &layout, &size);
+
+  printf("Loaded depth from alpha: %dx%d (%s alpha)\n", width, height,
+         has_alpha ? "has" : "no");
+
+  return depth_texture;
+}
+
// Create CNN render pipeline (5 bindings)
// Takes both intermediate format (RGBA16Float) and final format (BGRA8Unorm)
static WGPURenderPipeline create_cnn_pipeline(WGPUDevice device,
@@ -236,6 +326,57 @@ static bool save_png(const char* path, const std::vector<uint8_t>& pixels,
return true;
}
+// Stack the saved per-layer PNGs into one tall grayscale image.
+//
+// Reads `<dir>/layer_<i>.png` for i in [0, num_layers), each expected to be
+// (width * 4) x height (four channel strips side by side), and writes them
+// top to bottom into `<dir>/layers_composite.png`.
+//
+// Params:
+//   dir            - directory holding the layer PNGs; also the output directory.
+//   width, height  - size of one channel strip (the un-widened image size).
+//   num_layers     - number of layer PNGs to stack.
+// Returns true when the composite was written; false (with a message on
+// stderr) when there is nothing to stack, a layer is missing or has the wrong
+// size, or the PNG cannot be written.
+static bool save_layer_composite(const char* dir, int width, int height, int num_layers) {
+  if (num_layers <= 0 || width <= 0 || height <= 0) {
+    fprintf(stderr, "Warning: nothing to composite (%d layers, %dx%d)\n",
+            num_layers, width, height);
+    return false;
+  }
+
+  // Each layer PNG is already 4x wide with 4 channels side-by-side
+  const int layer_width = width * 4;
+  const size_t layer_bytes =
+      static_cast<size_t>(layer_width) * static_cast<size_t>(height);
+
+  // Layers share one row stride, so each decoded image can be copied straight
+  // into its vertical slot without an intermediate per-layer buffer.
+  const int composite_height = height * num_layers;
+  std::vector<uint8_t> composite(layer_bytes * static_cast<size_t>(num_layers));
+
+  for (int i = 0; i < num_layers; ++i) {
+    char path[512];
+    snprintf(path, sizeof(path), "%s/layer_%d.png", dir, i);
+
+    // Zero-initialised: stbi_load leaves these untouched on failure, and the
+    // warning below prints them.
+    int w = 0, h = 0, channels = 0;
+    uint8_t* data = stbi_load(path, &w, &h, &channels, 1);  // Load as grayscale
+    if (!data || w != layer_width || h != height) {
+      if (data) stbi_image_free(data);
+      fprintf(stderr, "Warning: failed to load layer %d for composite (expected %dx%d, got %dx%d)\n",
+              i, layer_width, height, w, h);
+      return false;
+    }
+
+    memcpy(composite.data() + layer_bytes * static_cast<size_t>(i), data,
+           layer_bytes);
+    stbi_image_free(data);
+  }
+
+  // Save as grayscale PNG (stacked vertically)
+  char composite_path[512];
+  snprintf(composite_path, sizeof(composite_path), "%s/layers_composite.png", dir);
+  if (!stbi_write_png(composite_path, layer_width, composite_height, 1,
+                      composite.data(), layer_width)) {
+    fprintf(stderr, "Error: failed to write composite PNG\n");
+    return false;
+  }
+
+  // Report the real layer count instead of a hard-coded one.
+  printf("Saved layer composite to '%s' (%dx%d, %d layers stacked vertically)\n",
+         composite_path, layer_width, composite_height, num_layers);
+  return true;
+}
+
// Save PPM output (fallback)
static bool save_ppm(const char* path, const std::vector<uint8_t>& pixels,
int width, int height) {
@@ -257,6 +398,808 @@ static bool save_ppm(const char* path, const std::vector<uint8_t>& pixels,
return true;
}
+// CNN v2 structures (matching CNNv2Effect). CNNv2LayerInfo is one entry of the layer table parsed from the weights blob (5 u32 per layer, in this field order).
+struct CNNv2LayerInfo {
+ uint32_t kernel_size; // convolution kernel edge length (logged as NxN)
+ uint32_t in_channels; // input channel count as stored in the blob
+ uint32_t out_channels; // output channel count as stored in the blob
+ uint32_t weight_offset; // forwarded unchanged to the layer shader params
+ uint32_t weight_count; // number of weights for this layer; only logged in this file
+};
+
+struct CNNv2LayerParams { // per-layer uniform block, written to a Uniform buffer bound at binding 4 of the layer pass
+ uint32_t kernel_size; // copied from CNNv2LayerInfo
+ uint32_t in_channels; // copied from CNNv2LayerInfo
+ uint32_t out_channels; // copied from CNNv2LayerInfo
+ uint32_t weight_offset; // copied from CNNv2LayerInfo
+ uint32_t is_output_layer; // 1 for the last layer in the table, else 0
+ float blend_amount; // taken from the --blend argument (args.blend)
+ uint32_t is_layer_0; // 1 for the first layer, else 0
+};
+
+struct CNNv2StaticFeatureParams { // uniform block for the static-features pass (binding 5)
+ uint32_t mip_level; // read from header word 4 of version-2 weight blobs, otherwise 0
+ uint32_t padding[3]; // always written as zero; brings the struct to 16 bytes
+};
+
+// Read back an RGBA32Uint texture holding packed f16 values and convert it to
+// tightly packed 8-bit BGRA.
+//
+// Each texel is 4 x u32 (16 bytes). Only the first four f16 values are
+// decoded: the low/high halves of word 0 are R/G and of word 1 are B/A. Each
+// value is clamped to [0, 1] (NaN becomes 0), scaled to 0..255 with rounding,
+// and stored in B, G, R, A byte order.
+//
+// Params:
+//   device, queue  - WebGPU device/queue used for the copy and the mapping.
+//   texture        - source texture; mip level 0 is copied.
+//   width, height  - size of the region to read, in texels.
+// Returns width * height * 4 bytes, or an empty vector (with a message on
+// stderr) when the size is invalid or the staging buffer cannot be created or
+// mapped.
+static std::vector<uint8_t> readback_rgba32uint_to_bgra8(
+    WGPUDevice device, WGPUQueue queue, WGPUTexture texture,
+    int width, int height) {
+  if (width <= 0 || height <= 0) {
+    fprintf(stderr, "Error: invalid readback size %dx%d\n", width, height);
+    return std::vector<uint8_t>();
+  }
+
+  // Rows in the staging buffer are padded up to a multiple of 256 bytes.
+  const uint32_t bytes_per_row = static_cast<uint32_t>(width) * 16;  // 4 x u32 per pixel
+  const uint32_t padded_bytes_per_row = (bytes_per_row + 255u) & ~255u;
+  // Widened before multiplying so large images cannot wrap in 32 bits.
+  const size_t buffer_size =
+      static_cast<size_t>(padded_bytes_per_row) * static_cast<size_t>(height);
+
+  WGPUBufferDescriptor buffer_desc = {};
+  buffer_desc.size = buffer_size;
+  buffer_desc.usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead;
+  buffer_desc.mappedAtCreation = false;
+
+  WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &buffer_desc);
+  if (!staging) {
+    fprintf(stderr, "Error: failed to create staging buffer\n");
+    return std::vector<uint8_t>();
+  }
+
+  // Copy texture to buffer
+  WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(device, nullptr);
+
+  WGPUTexelCopyTextureInfo src = {};
+  src.texture = texture;
+  src.mipLevel = 0;
+
+  WGPUTexelCopyBufferInfo dst = {};
+  dst.buffer = staging;
+  dst.layout.bytesPerRow = padded_bytes_per_row;
+  dst.layout.rowsPerImage = static_cast<uint32_t>(height);
+
+  WGPUExtent3D copy_size = {
+      static_cast<uint32_t>(width),
+      static_cast<uint32_t>(height),
+      1};
+
+  wgpuCommandEncoderCopyTextureToBuffer(encoder, &src, &dst, &copy_size);
+
+  WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr);
+  wgpuQueueSubmit(queue, 1, &commands);
+  wgpuCommandBufferRelease(commands);
+  wgpuCommandEncoderRelease(encoder);
+
+  // Wait for copy to complete
+  wgpuDevicePoll(device, true, nullptr);
+
+  // Map the buffer. The callback records completion and success separately so
+  // a failed map is reported immediately instead of being polled 100 times
+  // and then called a timeout.
+  struct MapState {
+    bool done = false;
+    bool ok = false;
+  };
+  MapState map_state;
+
+  auto map_cb = [](WGPUMapAsyncStatus status, WGPUStringView message,
+                   void* userdata1, void* userdata2) {
+    (void)message;
+    (void)userdata2;
+    MapState* state = static_cast<MapState*>(userdata1);
+    state->ok = (status == WGPUMapAsyncStatus_Success);
+    state->done = true;
+  };
+
+  WGPUBufferMapCallbackInfo map_info = {};
+  map_info.mode = WGPUCallbackMode_AllowProcessEvents;
+  map_info.callback = map_cb;
+  map_info.userdata1 = &map_state;
+
+  wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, buffer_size, map_info);
+
+  // Wait for mapping to complete
+  for (int i = 0; i < 100 && !map_state.done; ++i) {
+    wgpuDevicePoll(device, true, nullptr);
+  }
+
+  if (!map_state.done || !map_state.ok) {
+    fprintf(stderr, "Error: Buffer mapping %s\n",
+            map_state.done ? "failed" : "timed out");
+    wgpuBufferRelease(staging);
+    return std::vector<uint8_t>();
+  }
+
+  const uint8_t* mapped = static_cast<const uint8_t*>(
+      wgpuBufferGetConstMappedRange(staging, 0, buffer_size));
+  if (!mapped) {
+    fprintf(stderr, "Error: failed to access mapped staging buffer\n");
+    wgpuBufferUnmap(staging);
+    wgpuBufferRelease(staging);
+    return std::vector<uint8_t>();
+  }
+
+  // Decode one IEEE half-precision value. ldexpf scales by an exact power of
+  // two, so no general pow() is needed.
+  const auto f16_to_f32 = [](uint16_t h) -> float {
+    const bool negative = ((h >> 15) & 1) != 0;
+    const int exp = (h >> 10) & 0x1F;
+    const int frac = h & 0x3FF;
+
+    float val;
+    if (exp == 0) {
+      // Zero and subnormals: frac * 2^-24.
+      val = ldexpf(static_cast<float>(frac), -24);
+    } else if (exp == 31) {
+      val = frac ? NAN : INFINITY;
+    } else {
+      val = ldexpf(1.0f + frac / 1024.0f, exp - 15);
+    }
+    return negative ? -val : val;
+  };
+
+  // Clamp to [0,1] and convert to u8. Written as !(v > 0) so NaN maps to 0
+  // instead of reaching the float-to-integer cast, which would be undefined.
+  const auto clamp_u8 = [](float v) -> uint8_t {
+    if (!(v > 0.0f)) return 0;
+    if (v >= 1.0f) return 255;
+    return static_cast<uint8_t>(v * 255.0f + 0.5f);
+  };
+
+  std::vector<uint8_t> result(static_cast<size_t>(width) *
+                              static_cast<size_t>(height) * 4);
+
+  // Unpack f16 to u8 (BGRA)
+  for (int y = 0; y < height; ++y) {
+    const uint32_t* row = reinterpret_cast<const uint32_t*>(
+        mapped + static_cast<size_t>(y) * padded_bytes_per_row);
+    uint8_t* out =
+        result.data() + static_cast<size_t>(y) * static_cast<size_t>(width) * 4;
+    for (int x = 0; x < width; ++x) {
+      // Words 0 and 1 of the texel hold the four f16 values we need; words 2
+      // and 3 (the remaining four f16 values) are ignored.
+      const uint32_t word0 = row[x * 4 + 0];
+      const uint32_t word1 = row[x * 4 + 1];
+
+      const float r = f16_to_f32(static_cast<uint16_t>(word0 & 0xFFFF));
+      const float g = f16_to_f32(static_cast<uint16_t>((word0 >> 16) & 0xFFFF));
+      const float b = f16_to_f32(static_cast<uint16_t>(word1 & 0xFFFF));
+      const float a = f16_to_f32(static_cast<uint16_t>((word1 >> 16) & 0xFFFF));
+
+      out[x * 4 + 0] = clamp_u8(b);
+      out[x * 4 + 1] = clamp_u8(g);
+      out[x * 4 + 2] = clamp_u8(r);
+      out[x * 4 + 3] = clamp_u8(a);
+    }
+  }
+
+  wgpuBufferUnmap(staging);
+  wgpuBufferRelease(staging);
+
+  return result;
+}
+
+// Read back an RGBA32Uint texture and lay its first four channels out as a
+// single-channel image four times as wide: strip 0 holds R, strip 1 holds G,
+// strip 2 holds B and strip 3 holds A, each `width` pixels across.
+// Returns an empty vector when the underlying BGRA readback fails.
+static std::vector<uint8_t> readback_rgba32uint_to_composite(
+    WGPUDevice device, WGPUQueue queue, WGPUTexture texture,
+    int width, int height) {
+  const std::vector<uint8_t> texels =
+      readback_rgba32uint_to_bgra8(device, queue, texture, width, height);
+  if (texels.empty()) {
+    return {};
+  }
+
+  const int strip_stride = width * 4;
+  std::vector<uint8_t> strips(strip_stride * height);
+
+  // The readback is B,G,R,A per texel; strips are ordered R,G,B,A, so strip k
+  // takes byte kStripSource[k] of each texel. Values are copied unchanged.
+  constexpr int kStripSource[4] = {2, 1, 0, 3};
+
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      const int texel_base = (row * width + col) * 4;
+      for (int strip = 0; strip < 4; ++strip) {
+        strips[row * strip_stride + (strip * width + col)] =
+            texels[texel_base + kStripSource[strip]];
+      }
+    }
+  }
+
+  return strips;
+}
+
+// Run the CNN v2 compute pipeline on `input_texture` and save the result to
+// args.output_path.
+//
+// Steps:
+//   1. Load the weights blob from args.weights_path, or from the embedded
+//      asset when no path is given.
+//   2. Parse the blob: u32 magic 'CNN2', version, num_layers, total_weights,
+//      then (for any version other than 1) a fifth header word that is used
+//      as mip_level when version == 2; then 5 u32 per layer; then the weights.
+//   3. Upload the weights to a storage buffer and build the static-features
+//      and layer compute pipelines.
+//   4. Dispatch the static-features pass, then one pass per layer, ping-ponging
+//      between two RGBA32Uint textures.
+//   5. Read back the last layer's output and save it as PNG or PPM.
+//
+// When args.save_intermediates is set, work is submitted after every pass so
+// the static features and each layer output can be written as PNGs, followed
+// by a vertical composite of all layers.
+//
+// Params:
+//   device, queue  - WebGPU device/queue used for all GPU work.
+//   instance       - unused here; kept for the caller's signature.
+//   input_texture  - source image, viewed as BGRA8Unorm; not released here.
+//   width, height  - image size in pixels.
+//   args           - parsed command-line options.
+// Returns true when the output file was written; false (with a message on
+// stderr) on any failure.
+static bool process_cnn_v2(WGPUDevice device, WGPUQueue queue,
+                           WGPUInstance instance, WGPUTexture input_texture,
+                           int width, int height, const Args& args) {
+  (void)instance;
+  printf("Using CNN v2 (storage buffer architecture)\n");
+
+  // Load weights (from file or asset system)
+  size_t weights_size = 0;
+  const uint8_t* weights_data = nullptr;
+  std::vector<uint8_t> file_weights;  // For file-based loading
+
+  if (args.weights_path) {
+    // Load from file
+    printf("Loading weights from '%s'...\n", args.weights_path);
+    FILE* f = fopen(args.weights_path, "rb");
+    if (!f) {
+      fprintf(stderr, "Error: failed to open weights file '%s'\n", args.weights_path);
+      return false;
+    }
+
+    // ftell returns -1 on failure; converting that to size_t would request a
+    // gigantic allocation, so every positioning call is checked.
+    long file_len = -1;
+    if (fseek(f, 0, SEEK_END) == 0) {
+      file_len = ftell(f);
+    }
+    if (file_len < 0 || fseek(f, 0, SEEK_SET) != 0) {
+      fprintf(stderr, "Error: failed to determine size of weights file '%s'\n",
+              args.weights_path);
+      fclose(f);
+      return false;
+    }
+    weights_size = static_cast<size_t>(file_len);
+
+    file_weights.resize(weights_size);
+    size_t read = fread(file_weights.data(), 1, weights_size, f);
+    fclose(f);
+
+    if (read != weights_size) {
+      fprintf(stderr, "Error: failed to read weights file\n");
+      return false;
+    }
+
+    weights_data = file_weights.data();
+  } else {
+    // Load from asset system
+    weights_data = (const uint8_t*)GetAsset(AssetId::ASSET_WEIGHTS_CNN_V2, &weights_size);
+  }
+
+  // The four words common to every header version must be present.
+  if (!weights_data || weights_size < 4 * sizeof(uint32_t)) {
+    fprintf(stderr, "Error: CNN v2 weights not available\n");
+    return false;
+  }
+
+  // Parse header
+  const uint32_t* header = (const uint32_t*)weights_data;
+  const uint32_t magic = header[0];
+  const uint32_t version = header[1];
+  const uint32_t num_layers = header[2];
+  const uint32_t total_weights = header[3];
+
+  if (magic != 0x324e4e43) {  // 'CNN2'
+    fprintf(stderr, "Error: Invalid CNN v2 weights magic\n");
+    return false;
+  }
+
+  // Version 1 headers are 4 words; every other version carries a fifth word.
+  // Check that the header, the whole layer table and at least one byte of
+  // weight data fit in the blob before reading any of them.
+  const uint32_t header_u32_count = (version == 1) ? 4 : 5;
+  const size_t header_size = header_u32_count * sizeof(uint32_t);
+  const uint64_t layer_info_size =
+      static_cast<uint64_t>(num_layers) * 5 * sizeof(uint32_t);
+  if (num_layers == 0 || weights_size < header_size ||
+      layer_info_size >= weights_size - header_size) {
+    fprintf(stderr,
+            "Error: CNN v2 weights are truncated or contain no layers/weight data\n");
+    return false;
+  }
+
+  uint32_t mip_level = 0;
+  if (version == 2) {
+    mip_level = header[4];
+  }
+
+  printf("Loaded CNN v2 weights: %u layers, %u weights, version %u\n",
+         num_layers, total_weights, version);
+
+  // Parse layer info
+  const uint32_t* layer_data = header + header_u32_count;
+  std::vector<CNNv2LayerInfo> layer_info;
+  layer_info.reserve(num_layers);
+
+  for (uint32_t i = 0; i < num_layers; ++i) {
+    CNNv2LayerInfo info;
+    info.kernel_size = layer_data[i * 5 + 0];
+    info.in_channels = layer_data[i * 5 + 1];
+    info.out_channels = layer_data[i * 5 + 2];
+    info.weight_offset = layer_data[i * 5 + 3];
+    info.weight_count = layer_data[i * 5 + 4];
+    layer_info.push_back(info);
+
+    printf("  Layer %u: %ux%u conv, %u→%u channels, %u weights\n", i,
+           info.kernel_size, info.kernel_size, info.in_channels,
+           info.out_channels, info.weight_count);
+  }
+
+  // Create weights storage buffer (skip header + layer info, upload only
+  // weights). The offset uses the same header size the layer table was read
+  // with, so version 1 blobs are not skewed by one word.
+  const size_t weights_offset = header_size + static_cast<size_t>(layer_info_size);
+  const size_t weights_only_size = weights_size - weights_offset;
+
+  WGPUBufferDescriptor weights_buffer_desc = {};
+  weights_buffer_desc.size = weights_only_size;
+  weights_buffer_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst;
+  weights_buffer_desc.mappedAtCreation = false;
+
+  WGPUBuffer weights_buffer =
+      wgpuDeviceCreateBuffer(device, &weights_buffer_desc);
+  wgpuQueueWriteBuffer(queue, weights_buffer, 0, weights_data + weights_offset, weights_only_size);
+
+  // Create input view
+  const WGPUTextureViewDescriptor view_desc = {
+      .format = WGPUTextureFormat_BGRA8Unorm,
+      .dimension = WGPUTextureViewDimension_2D,
+      .baseMipLevel = 0,
+      .mipLevelCount = 1,
+      .baseArrayLayer = 0,
+      .arrayLayerCount = 1,
+  };
+  WGPUTextureView input_view = wgpuTextureCreateView(input_texture, &view_desc);
+
+  // Create static features texture (RGBA32Uint)
+  const WGPUTextureDescriptor static_desc = {
+      .usage = WGPUTextureUsage_StorageBinding | WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopySrc,
+      .dimension = WGPUTextureDimension_2D,
+      .size = {static_cast<uint32_t>(width), static_cast<uint32_t>(height), 1},
+      .format = WGPUTextureFormat_RGBA32Uint,
+      .mipLevelCount = 1,
+      .sampleCount = 1,
+  };
+  WGPUTexture static_features_tex =
+      wgpuDeviceCreateTexture(device, &static_desc);
+  WGPUTextureView static_features_view =
+      wgpuTextureCreateView(static_features_tex, nullptr);
+
+  // Load depth from input alpha channel (or 1.0 if no alpha)
+  WGPUTexture depth_texture =
+      load_depth_from_alpha(device, queue, args.input_path, width, height);
+  if (!depth_texture) {
+    wgpuTextureViewRelease(static_features_view);
+    wgpuTextureRelease(static_features_tex);
+    wgpuBufferRelease(weights_buffer);
+    wgpuTextureViewRelease(input_view);
+    return false;
+  }
+  WGPUTextureView depth_view = wgpuTextureCreateView(depth_texture, nullptr);
+
+  // Create layer textures (ping-pong)
+  WGPUTexture layer_textures[2] = {
+      wgpuDeviceCreateTexture(device, &static_desc),
+      wgpuDeviceCreateTexture(device, &static_desc),
+  };
+  WGPUTextureView layer_views[2] = {
+      wgpuTextureCreateView(layer_textures[0], nullptr),
+      wgpuTextureCreateView(layer_textures[1], nullptr),
+  };
+
+  // Load shaders
+  const char* static_shader =
+      SafeGetAsset(AssetId::ASSET_SHADER_CNN_V2_STATIC);
+  const char* layer_shader =
+      SafeGetAsset(AssetId::ASSET_SHADER_CNN_V2_COMPUTE);
+
+  if (!static_shader[0] || !layer_shader[0]) {
+    fprintf(stderr, "Error: CNN v2 shaders not available\n");
+    wgpuTextureViewRelease(static_features_view);
+    wgpuTextureRelease(static_features_tex);
+    wgpuTextureViewRelease(depth_view);
+    wgpuTextureRelease(depth_texture);
+    wgpuTextureViewRelease(layer_views[0]);
+    wgpuTextureViewRelease(layer_views[1]);
+    wgpuTextureRelease(layer_textures[0]);
+    wgpuTextureRelease(layer_textures[1]);
+    wgpuBufferRelease(weights_buffer);
+    wgpuTextureViewRelease(input_view);
+    return false;
+  }
+
+  // Create static feature params buffer
+  WGPUBufferDescriptor static_params_desc = {};
+  static_params_desc.size = sizeof(CNNv2StaticFeatureParams);
+  static_params_desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst;
+  static_params_desc.mappedAtCreation = false;
+
+  WGPUBuffer static_params_buffer =
+      wgpuDeviceCreateBuffer(device, &static_params_desc);
+
+  CNNv2StaticFeatureParams static_params;
+  static_params.mip_level = mip_level;
+  static_params.padding[0] = 0;
+  static_params.padding[1] = 0;
+  static_params.padding[2] = 0;
+  wgpuQueueWriteBuffer(queue, static_params_buffer, 0, &static_params,
+                       sizeof(static_params));
+
+  // Create static features compute pipeline
+  WGPUShaderSourceWGSL static_wgsl = {};
+  static_wgsl.chain.sType = WGPUSType_ShaderSourceWGSL;
+  static_wgsl.code = str_view(static_shader);
+
+  WGPUShaderModuleDescriptor static_module_desc = {};
+  static_module_desc.nextInChain = &static_wgsl.chain;
+
+  WGPUShaderModule static_module =
+      wgpuDeviceCreateShaderModule(device, &static_module_desc);
+
+  // Bind group layout: 0=input, 1=input_mip1, 2=input_mip2, 3=depth, 4=output,
+  // 5=params
+  WGPUBindGroupLayoutEntry static_bgl_entries[6] = {};
+  static_bgl_entries[0].binding = 0;
+  static_bgl_entries[0].visibility = WGPUShaderStage_Compute;
+  static_bgl_entries[0].texture.sampleType = WGPUTextureSampleType_Float;
+  static_bgl_entries[0].texture.viewDimension = WGPUTextureViewDimension_2D;
+
+  static_bgl_entries[1].binding = 1;
+  static_bgl_entries[1].visibility = WGPUShaderStage_Compute;
+  static_bgl_entries[1].texture.sampleType = WGPUTextureSampleType_Float;
+  static_bgl_entries[1].texture.viewDimension = WGPUTextureViewDimension_2D;
+
+  static_bgl_entries[2].binding = 2;
+  static_bgl_entries[2].visibility = WGPUShaderStage_Compute;
+  static_bgl_entries[2].texture.sampleType = WGPUTextureSampleType_Float;
+  static_bgl_entries[2].texture.viewDimension = WGPUTextureViewDimension_2D;
+
+  static_bgl_entries[3].binding = 3;
+  static_bgl_entries[3].visibility = WGPUShaderStage_Compute;
+  static_bgl_entries[3].texture.sampleType = WGPUTextureSampleType_UnfilterableFloat;
+  static_bgl_entries[3].texture.viewDimension = WGPUTextureViewDimension_2D;
+
+  static_bgl_entries[4].binding = 4;
+  static_bgl_entries[4].visibility = WGPUShaderStage_Compute;
+  static_bgl_entries[4].storageTexture.access =
+      WGPUStorageTextureAccess_WriteOnly;
+  static_bgl_entries[4].storageTexture.format = WGPUTextureFormat_RGBA32Uint;
+  static_bgl_entries[4].storageTexture.viewDimension =
+      WGPUTextureViewDimension_2D;
+
+  static_bgl_entries[5].binding = 5;
+  static_bgl_entries[5].visibility = WGPUShaderStage_Compute;
+  static_bgl_entries[5].buffer.type = WGPUBufferBindingType_Uniform;
+  static_bgl_entries[5].buffer.minBindingSize =
+      sizeof(CNNv2StaticFeatureParams);
+
+  WGPUBindGroupLayoutDescriptor static_bgl_desc = {};
+  static_bgl_desc.entryCount = 6;
+  static_bgl_desc.entries = static_bgl_entries;
+
+  WGPUBindGroupLayout static_bgl =
+      wgpuDeviceCreateBindGroupLayout(device, &static_bgl_desc);
+
+  WGPUPipelineLayoutDescriptor static_pl_desc = {};
+  static_pl_desc.bindGroupLayoutCount = 1;
+  static_pl_desc.bindGroupLayouts = &static_bgl;
+
+  WGPUPipelineLayout static_pl =
+      wgpuDeviceCreatePipelineLayout(device, &static_pl_desc);
+
+  WGPUComputePipelineDescriptor static_pipeline_desc = {};
+  static_pipeline_desc.compute.module = static_module;
+  static_pipeline_desc.compute.entryPoint = str_view("main");
+  static_pipeline_desc.layout = static_pl;
+
+  WGPUComputePipeline static_pipeline =
+      wgpuDeviceCreateComputePipeline(device, &static_pipeline_desc);
+
+  wgpuShaderModuleRelease(static_module);
+  wgpuPipelineLayoutRelease(static_pl);
+
+  // Create static bind group (use input as all mips for simplicity)
+  WGPUBindGroupEntry static_bg_entries[6] = {};
+  static_bg_entries[0].binding = 0;
+  static_bg_entries[0].textureView = input_view;
+  static_bg_entries[1].binding = 1;
+  static_bg_entries[1].textureView = input_view;
+  static_bg_entries[2].binding = 2;
+  static_bg_entries[2].textureView = input_view;
+  static_bg_entries[3].binding = 3;
+  static_bg_entries[3].textureView = depth_view;  // Depth from alpha channel (matches training)
+  static_bg_entries[4].binding = 4;
+  static_bg_entries[4].textureView = static_features_view;
+  static_bg_entries[5].binding = 5;
+  static_bg_entries[5].buffer = static_params_buffer;
+  static_bg_entries[5].size = sizeof(CNNv2StaticFeatureParams);
+
+  WGPUBindGroupDescriptor static_bg_desc = {};
+  static_bg_desc.layout = static_bgl;
+  static_bg_desc.entryCount = 6;
+  static_bg_desc.entries = static_bg_entries;
+
+  WGPUBindGroup static_bg = wgpuDeviceCreateBindGroup(device, &static_bg_desc);
+
+  wgpuBindGroupLayoutRelease(static_bgl);
+
+  // Create layer compute pipeline
+  WGPUShaderSourceWGSL layer_wgsl = {};
+  layer_wgsl.chain.sType = WGPUSType_ShaderSourceWGSL;
+  layer_wgsl.code = str_view(layer_shader);
+
+  WGPUShaderModuleDescriptor layer_module_desc = {};
+  layer_module_desc.nextInChain = &layer_wgsl.chain;
+
+  WGPUShaderModule layer_module =
+      wgpuDeviceCreateShaderModule(device, &layer_module_desc);
+
+  // Layer bind group layout:
+  // 0=static_features, 1=layer_input, 2=output, 3=weights, 4=params,
+  // 5=original
+  WGPUBindGroupLayoutEntry layer_bgl_entries[6] = {};
+  layer_bgl_entries[0].binding = 0;
+  layer_bgl_entries[0].visibility = WGPUShaderStage_Compute;
+  layer_bgl_entries[0].texture.sampleType = WGPUTextureSampleType_Uint;
+  layer_bgl_entries[0].texture.viewDimension = WGPUTextureViewDimension_2D;
+
+  layer_bgl_entries[1].binding = 1;
+  layer_bgl_entries[1].visibility = WGPUShaderStage_Compute;
+  layer_bgl_entries[1].texture.sampleType = WGPUTextureSampleType_Uint;
+  layer_bgl_entries[1].texture.viewDimension = WGPUTextureViewDimension_2D;
+
+  layer_bgl_entries[2].binding = 2;
+  layer_bgl_entries[2].visibility = WGPUShaderStage_Compute;
+  layer_bgl_entries[2].storageTexture.access =
+      WGPUStorageTextureAccess_WriteOnly;
+  layer_bgl_entries[2].storageTexture.format = WGPUTextureFormat_RGBA32Uint;
+  layer_bgl_entries[2].storageTexture.viewDimension =
+      WGPUTextureViewDimension_2D;
+
+  layer_bgl_entries[3].binding = 3;
+  layer_bgl_entries[3].visibility = WGPUShaderStage_Compute;
+  layer_bgl_entries[3].buffer.type = WGPUBufferBindingType_ReadOnlyStorage;
+
+  layer_bgl_entries[4].binding = 4;
+  layer_bgl_entries[4].visibility = WGPUShaderStage_Compute;
+  layer_bgl_entries[4].buffer.type = WGPUBufferBindingType_Uniform;
+  layer_bgl_entries[4].buffer.minBindingSize = sizeof(CNNv2LayerParams);
+
+  layer_bgl_entries[5].binding = 5;
+  layer_bgl_entries[5].visibility = WGPUShaderStage_Compute;
+  layer_bgl_entries[5].texture.sampleType = WGPUTextureSampleType_Float;
+  layer_bgl_entries[5].texture.viewDimension = WGPUTextureViewDimension_2D;
+
+  WGPUBindGroupLayoutDescriptor layer_bgl_desc = {};
+  layer_bgl_desc.entryCount = 6;
+  layer_bgl_desc.entries = layer_bgl_entries;
+
+  WGPUBindGroupLayout layer_bgl =
+      wgpuDeviceCreateBindGroupLayout(device, &layer_bgl_desc);
+
+  WGPUPipelineLayoutDescriptor layer_pl_desc = {};
+  layer_pl_desc.bindGroupLayoutCount = 1;
+  layer_pl_desc.bindGroupLayouts = &layer_bgl;
+
+  WGPUPipelineLayout layer_pl =
+      wgpuDeviceCreatePipelineLayout(device, &layer_pl_desc);
+
+  WGPUComputePipelineDescriptor layer_pipeline_desc = {};
+  layer_pipeline_desc.compute.module = layer_module;
+  layer_pipeline_desc.compute.entryPoint = str_view("main");
+  layer_pipeline_desc.layout = layer_pl;
+
+  WGPUComputePipeline layer_pipeline =
+      wgpuDeviceCreateComputePipeline(device, &layer_pipeline_desc);
+
+  wgpuShaderModuleRelease(layer_module);
+  wgpuPipelineLayoutRelease(layer_pl);
+
+  // Create layer params buffers
+  std::vector<WGPUBuffer> layer_params_buffers;
+  layer_params_buffers.reserve(layer_info.size());
+  for (size_t i = 0; i < layer_info.size(); ++i) {
+    WGPUBufferDescriptor params_desc = {};
+    params_desc.size = sizeof(CNNv2LayerParams);
+    params_desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst;
+    params_desc.mappedAtCreation = false;
+
+    WGPUBuffer buf = wgpuDeviceCreateBuffer(device, &params_desc);
+    layer_params_buffers.push_back(buf);
+  }
+
+  // Single release routine shared by the failure and success exits below, so
+  // both paths free the same set of objects (including the depth texture and
+  // its view).
+  const auto release_resources = [&]() {
+    for (WGPUBuffer buf : layer_params_buffers) wgpuBufferRelease(buf);
+    wgpuComputePipelineRelease(layer_pipeline);
+    wgpuBindGroupLayoutRelease(layer_bgl);
+    wgpuBindGroupRelease(static_bg);
+    wgpuComputePipelineRelease(static_pipeline);
+    wgpuBufferRelease(static_params_buffer);
+    wgpuTextureViewRelease(static_features_view);
+    wgpuTextureRelease(static_features_tex);
+    wgpuTextureViewRelease(depth_view);
+    wgpuTextureRelease(depth_texture);
+    wgpuTextureViewRelease(layer_views[0]);
+    wgpuTextureViewRelease(layer_views[1]);
+    wgpuTextureRelease(layer_textures[0]);
+    wgpuTextureRelease(layer_textures[1]);
+    wgpuBufferRelease(weights_buffer);
+    wgpuTextureViewRelease(input_view);
+  };
+
+  // Execute compute passes
+  WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(device, nullptr);
+
+  // Pass 1: Static features
+  printf("Computing static features...\n");
+  WGPUComputePassEncoder static_pass =
+      wgpuCommandEncoderBeginComputePass(encoder, nullptr);
+  wgpuComputePassEncoderSetPipeline(static_pass, static_pipeline);
+  wgpuComputePassEncoderSetBindGroup(static_pass, 0, static_bg, 0, nullptr);
+
+  // One workgroup per 8x8 tile, rounded up to cover partial tiles.
+  uint32_t workgroups_x = (width + 7) / 8;
+  uint32_t workgroups_y = (height + 7) / 8;
+  wgpuComputePassEncoderDispatchWorkgroups(static_pass, workgroups_x,
+                                           workgroups_y, 1);
+
+  wgpuComputePassEncoderEnd(static_pass);
+  wgpuComputePassEncoderRelease(static_pass);
+
+  // Save static features if requested
+  if (args.save_intermediates) {
+    // Submit and wait for static features to complete
+    WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, nullptr);
+    wgpuQueueSubmit(queue, 1, &cmd);
+    wgpuCommandBufferRelease(cmd);
+    wgpuDevicePoll(device, true, nullptr);
+
+    // The finished encoder is released before a new one replaces it;
+    // otherwise its handle would be lost.
+    wgpuCommandEncoderRelease(encoder);
+    encoder = wgpuDeviceCreateCommandEncoder(device, nullptr);
+
+    char layer_path[512];
+    snprintf(layer_path, sizeof(layer_path), "%s/static_features.png",
+             args.save_intermediates);
+    printf("Saving static features to '%s'...\n", layer_path);
+
+    // Static features hold 8 f16 values packed in 4 x u32. For now only the
+    // first 4 channels are shown (like layers); one readback is enough since
+    // the composite helper performs the BGRA conversion itself.
+    // TODO: Show all 8 channels in 8x wide composite
+    std::vector<uint8_t> composite = readback_rgba32uint_to_composite(
+        device, queue, static_features_tex, width, height);
+    if (!composite.empty()) {
+      int composite_width = width * 4;
+      if (!stbi_write_png(layer_path, composite_width, height, 1,
+                          composite.data(), composite_width)) {
+        fprintf(stderr, "Error: failed to write static features PNG\n");
+      }
+    }
+  }
+
+  // Pass 2-N: CNN layers
+  for (size_t i = 0; i < layer_info.size(); ++i) {
+    const CNNv2LayerInfo& info = layer_info[i];
+
+    printf("Processing layer %zu/%zu (%ux%u, %u→%u channels)...\n", i + 1,
+           layer_info.size(), info.kernel_size, info.kernel_size,
+           info.in_channels, info.out_channels);
+
+    // Update layer params
+    CNNv2LayerParams params;
+    params.kernel_size = info.kernel_size;
+    params.in_channels = info.in_channels;
+    params.out_channels = info.out_channels;
+    params.weight_offset = info.weight_offset;
+    params.is_output_layer = (i == layer_info.size() - 1) ? 1 : 0;
+    params.blend_amount = args.blend;
+    params.is_layer_0 = (i == 0) ? 1 : 0;
+
+    wgpuQueueWriteBuffer(queue, layer_params_buffers[i], 0, &params,
+                         sizeof(params));
+
+    // Create bind group for this layer. Layer i reads layer_views[i % 2]
+    // (the static features for layer 0) and writes layer_views[(i + 1) % 2].
+    WGPUBindGroupEntry layer_bg_entries[6] = {};
+    layer_bg_entries[0].binding = 0;
+    layer_bg_entries[0].textureView = static_features_view;
+
+    layer_bg_entries[1].binding = 1;
+    layer_bg_entries[1].textureView =
+        (i == 0) ? static_features_view : layer_views[i % 2];
+
+    layer_bg_entries[2].binding = 2;
+    layer_bg_entries[2].textureView = layer_views[(i + 1) % 2];
+
+    layer_bg_entries[3].binding = 3;
+    layer_bg_entries[3].buffer = weights_buffer;
+    layer_bg_entries[3].size = weights_only_size;
+
+    layer_bg_entries[4].binding = 4;
+    layer_bg_entries[4].buffer = layer_params_buffers[i];
+    layer_bg_entries[4].size = sizeof(CNNv2LayerParams);
+
+    layer_bg_entries[5].binding = 5;
+    layer_bg_entries[5].textureView = input_view;
+
+    WGPUBindGroupDescriptor layer_bg_desc = {};
+    layer_bg_desc.layout = layer_bgl;
+    layer_bg_desc.entryCount = 6;
+    layer_bg_desc.entries = layer_bg_entries;
+
+    WGPUBindGroup layer_bg =
+        wgpuDeviceCreateBindGroup(device, &layer_bg_desc);
+
+    WGPUComputePassEncoder layer_pass =
+        wgpuCommandEncoderBeginComputePass(encoder, nullptr);
+    wgpuComputePassEncoderSetPipeline(layer_pass, layer_pipeline);
+    wgpuComputePassEncoderSetBindGroup(layer_pass, 0, layer_bg, 0, nullptr);
+
+    wgpuComputePassEncoderDispatchWorkgroups(layer_pass, workgroups_x,
+                                             workgroups_y, 1);
+
+    wgpuComputePassEncoderEnd(layer_pass);
+    wgpuComputePassEncoderRelease(layer_pass);
+    wgpuBindGroupRelease(layer_bg);
+
+    // Save intermediate layer if requested
+    if (args.save_intermediates) {
+      // Submit and wait for layer to complete
+      WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, nullptr);
+      wgpuQueueSubmit(queue, 1, &cmd);
+      wgpuCommandBufferRelease(cmd);
+      wgpuDevicePoll(device, true, nullptr);
+
+      // Release the finished encoder, then create a new one for next layer.
+      wgpuCommandEncoderRelease(encoder);
+      encoder = wgpuDeviceCreateCommandEncoder(device, nullptr);
+
+      char layer_path[512];
+      snprintf(layer_path, sizeof(layer_path), "%s/layer_%zu.png",
+               args.save_intermediates, i);
+      printf("Saving intermediate layer %zu to '%s'...\n", i, layer_path);
+
+      // Read back RGBA32Uint and create 4-channel grayscale composite
+      WGPUTexture output_tex = layer_textures[(i + 1) % 2];
+      std::vector<uint8_t> composite = readback_rgba32uint_to_composite(
+          device, queue, output_tex, width, height);
+
+      if (!composite.empty()) {
+        int composite_width = width * 4;
+        if (!stbi_write_png(layer_path, composite_width, height, 1,
+                            composite.data(), composite_width)) {
+          fprintf(stderr, "Error: failed to write layer PNG\n");
+        }
+      }
+    }
+  }
+
+  WGPUCommandBuffer commands = wgpuCommandEncoderFinish(encoder, nullptr);
+  wgpuQueueSubmit(queue, 1, &commands);
+  wgpuCommandBufferRelease(commands);
+  wgpuCommandEncoderRelease(encoder);
+
+  wgpuDevicePoll(device, true, nullptr);
+
+  // Create layer composite if intermediates were saved
+  if (args.save_intermediates) {
+    save_layer_composite(args.save_intermediates, width, height,
+                         static_cast<int>(layer_info.size()));
+  }
+
+  // Readback final result. Layer i writes texture (i + 1) % 2, so the last
+  // layer (index size - 1) wrote texture size % 2.
+  printf("Reading pixels from GPU...\n");
+  const size_t final_layer_idx = layer_info.size() % 2;
+  std::vector<uint8_t> pixels = readback_rgba32uint_to_bgra8(
+      device, queue, layer_textures[final_layer_idx], width, height);
+
+  if (pixels.empty()) {
+    fprintf(stderr, "Error: GPU readback failed\n");
+    release_resources();
+    return false;
+  }
+
+  // Debug hex dump
+  if (args.debug_hex) {
+    printf("First 8 pixels (BGRA hex):\n");
+    for (int i = 0; i < 8 && i < width * height; ++i) {
+      const uint8_t b = pixels[i * 4 + 0];
+      const uint8_t g = pixels[i * 4 + 1];
+      const uint8_t r = pixels[i * 4 + 2];
+      const uint8_t a = pixels[i * 4 + 3];
+      printf("  [%d] 0x%02X%02X%02X%02X (RGBA)\n", i, r, g, b, a);
+    }
+  }
+
+  // Save output
+  bool success;
+  if (args.output_png) {
+    printf("Saving PNG to '%s'...\n", args.output_path);
+    success = save_png(args.output_path, pixels, width, height);
+  } else {
+    printf("Saving PPM to '%s'...\n", args.output_path);
+    success = save_ppm(args.output_path, pixels, width, height);
+  }
+
+  if (success) {
+    printf("Done! Output saved to '%s'\n", args.output_path);
+  }
+
+  // Cleanup
+  release_resources();
+
+  return success;
+}
+
int main(int argc, char** argv) {
// Parse arguments
Args args;
@@ -292,6 +1235,19 @@ int main(int argc, char** argv) {
printf("Loaded %dx%d image from '%s'\n", width, height, args.input_path);
+ // Branch based on CNN version
+ if (args.cnn_version == 2) {
+ bool success = process_cnn_v2(device, queue, instance, input_texture,
+ width, height, args);
+ wgpuTextureRelease(input_texture);
+ SamplerCache::Get().clear();
+ fixture.shutdown();
+ return success ? 0 : 1;
+ }
+
+ // CNN v1 processing below
+ printf("Using CNN v1 (render pipeline architecture)\n");
+
// Create input texture view
const WGPUTextureViewDescriptor view_desc = {
.format = WGPUTextureFormat_BGRA8Unorm,