diff options
Diffstat (limited to 'src/gpu/effects/cnn_v2_effect.cc')
| -rw-r--r-- | src/gpu/effects/cnn_v2_effect.cc | 519 |
1 files changed, 519 insertions, 0 deletions
diff --git a/src/gpu/effects/cnn_v2_effect.cc b/src/gpu/effects/cnn_v2_effect.cc new file mode 100644 index 0000000..9cb6d57 --- /dev/null +++ b/src/gpu/effects/cnn_v2_effect.cc @@ -0,0 +1,519 @@ +// CNN v2 Effect Implementation + +#include "gpu/effects/cnn_v2_effect.h" + +#if defined(USE_TEST_ASSETS) +#include "test_assets.h" +#else +#include "generated/assets.h" +#endif + +#include "util/asset_manager.h" +#include "util/fatal_error.h" +#include <cstring> + +CNNv2Effect::CNNv2Effect(const GpuContext& ctx) + : PostProcessEffect(ctx), + static_pipeline_(nullptr), + static_bind_group_(nullptr), + static_features_tex_(nullptr), + static_features_view_(nullptr), + layer_pipeline_(nullptr), + weights_buffer_(nullptr), + layer_params_buffer_(nullptr), + input_mip_tex_(nullptr), + current_input_view_(nullptr), + initialized_(false) { + std::memset(input_mip_view_, 0, sizeof(input_mip_view_)); +} + +CNNv2Effect::~CNNv2Effect() { + cleanup(); +} + +void CNNv2Effect::init(MainSequence* demo) { + (void)demo; + if (initialized_) return; + + load_weights(); + create_textures(); + create_pipelines(); + + initialized_ = true; +} + +void CNNv2Effect::resize(int width, int height) { + PostProcessEffect::resize(width, height); + cleanup(); + create_textures(); + create_pipelines(); +} + +void CNNv2Effect::load_weights() { + // Load binary weights asset + size_t weights_size = 0; + const uint8_t* weights_data = (const uint8_t*)GetAsset(AssetId::ASSET_WEIGHTS_CNN_V2, &weights_size); + + if (!weights_data || weights_size < 16) { + // Weights not available - effect will skip + return; + } + + // Parse header (16 bytes) + const uint32_t* header = (const uint32_t*)weights_data; + uint32_t magic = header[0]; + uint32_t version = header[1]; + uint32_t num_layers = header[2]; + uint32_t total_weights = header[3]; + + FATAL_CHECK(magic != 0x324e4e43, "Invalid CNN v2 weights magic\n"); // 'CNN2' + FATAL_CHECK(version != 1, "Unsupported CNN v2 weights version\n"); + + // Parse layer info (20 bytes per layer) + const uint32_t* layer_data = header + 4; + for (uint32_t i = 0; i < num_layers; ++i) { + LayerInfo info; + info.kernel_size = layer_data[i * 5 + 0]; + info.in_channels = layer_data[i * 5 + 1]; + info.out_channels = layer_data[i * 5 + 2]; + info.weight_offset = layer_data[i * 5 + 3]; + info.weight_count = layer_data[i * 5 + 4]; + layer_info_.push_back(info); + } + + // Create GPU storage buffer for weights + // Buffer contains: header + layer info + packed f16 weights (as u32) + WGPUBufferDescriptor buffer_desc = {}; + buffer_desc.size = weights_size; + buffer_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst; + buffer_desc.mappedAtCreation = false; + + weights_buffer_ = wgpuDeviceCreateBuffer(ctx_.device, &buffer_desc); + + // Upload weights data + wgpuQueueWriteBuffer(ctx_.queue, weights_buffer_, 0, weights_data, weights_size); + + // Create uniform buffer for layer params + WGPUBufferDescriptor params_desc = {}; + params_desc.size = sizeof(LayerParams); + params_desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst; + params_desc.mappedAtCreation = false; + + layer_params_buffer_ = wgpuDeviceCreateBuffer(ctx_.device, ¶ms_desc); +} + +void CNNv2Effect::create_textures() { + const WGPUExtent3D size = { + static_cast<uint32_t>(width_), + static_cast<uint32_t>(height_), + 1 + }; + + // Static features texture (8×f16 packed as 4×u32) + WGPUTextureDescriptor static_desc = {}; + static_desc.usage = WGPUTextureUsage_StorageBinding | WGPUTextureUsage_TextureBinding; + static_desc.dimension = WGPUTextureDimension_2D; + static_desc.size = size; + static_desc.format = WGPUTextureFormat_RGBA32Uint; + static_desc.mipLevelCount = 1; + static_desc.sampleCount = 1; + static_features_tex_ = wgpuDeviceCreateTexture(ctx_.device, &static_desc); + + WGPUTextureViewDescriptor view_desc = {}; + view_desc.format = WGPUTextureFormat_RGBA32Uint; + view_desc.dimension = WGPUTextureViewDimension_2D; + view_desc.baseMipLevel = 0; + view_desc.mipLevelCount = 1; + view_desc.baseArrayLayer = 0; + view_desc.arrayLayerCount = 1; + static_features_view_ = wgpuTextureCreateView(static_features_tex_, &view_desc); + + // Input texture with mips (for multi-scale features) + WGPUTextureDescriptor input_mip_desc = {}; + input_mip_desc.usage = WGPUTextureUsage_TextureBinding | WGPUTextureUsage_CopyDst; + input_mip_desc.dimension = WGPUTextureDimension_2D; + input_mip_desc.size = size; + input_mip_desc.format = WGPUTextureFormat_RGBA8Unorm; + input_mip_desc.mipLevelCount = 3; // Levels 0, 1, 2 + input_mip_desc.sampleCount = 1; + input_mip_tex_ = wgpuDeviceCreateTexture(ctx_.device, &input_mip_desc); + + for (int i = 0; i < 3; ++i) { + WGPUTextureViewDescriptor mip_view_desc = {}; + mip_view_desc.format = WGPUTextureFormat_RGBA8Unorm; + mip_view_desc.dimension = WGPUTextureViewDimension_2D; + mip_view_desc.baseMipLevel = i; + mip_view_desc.mipLevelCount = 1; + mip_view_desc.baseArrayLayer = 0; + mip_view_desc.arrayLayerCount = 1; + input_mip_view_[i] = wgpuTextureCreateView(input_mip_tex_, &mip_view_desc); + } + + // Create 2 layer textures (ping-pong buffers for intermediate results) + // Each stores 8×f16 channels packed as 4×u32 + for (int i = 0; i < 2; ++i) { + WGPUTextureDescriptor layer_desc = {}; + layer_desc.usage = WGPUTextureUsage_StorageBinding | WGPUTextureUsage_TextureBinding; + layer_desc.dimension = WGPUTextureDimension_2D; + layer_desc.size = size; + layer_desc.format = WGPUTextureFormat_RGBA32Uint; + layer_desc.mipLevelCount = 1; + layer_desc.sampleCount = 1; + + WGPUTexture tex = wgpuDeviceCreateTexture(ctx_.device, &layer_desc); + layer_textures_.push_back(tex); + + WGPUTextureViewDescriptor view_desc = {}; + view_desc.format = WGPUTextureFormat_RGBA32Uint; + view_desc.dimension = WGPUTextureViewDimension_2D; + view_desc.baseMipLevel = 0; + view_desc.mipLevelCount = 1; + view_desc.baseArrayLayer = 0; + view_desc.arrayLayerCount = 1; + + WGPUTextureView view = wgpuTextureCreateView(tex, &view_desc); + layer_views_.push_back(view); + } +} + +void CNNv2Effect::create_pipelines() { + // Static features compute pipeline + size_t shader_size = 0; + const char* static_code = (const char*)GetAsset(AssetId::ASSET_SHADER_CNN_V2_STATIC, &shader_size); + + if (!static_code || shader_size == 0) { + // Shader not available (e.g., in test mode) - skip pipeline creation + return; + } + + WGPUShaderSourceWGSL wgsl_src = {}; + wgsl_src.chain.sType = WGPUSType_ShaderSourceWGSL; + wgsl_src.code = str_view(static_code); + + WGPUShaderModuleDescriptor shader_desc = {}; + shader_desc.nextInChain = &wgsl_src.chain; + + WGPUShaderModule static_module = wgpuDeviceCreateShaderModule(ctx_.device, &shader_desc); + if (!static_module) { + return; + } + + WGPUComputePipelineDescriptor pipeline_desc = {}; + pipeline_desc.compute.module = static_module; + pipeline_desc.compute.entryPoint = str_view("main"); + + static_pipeline_ = wgpuDeviceCreateComputePipeline(ctx_.device, &pipeline_desc); + wgpuShaderModuleRelease(static_module); + + // Create bind group layout for static features compute + // Bindings: 0=input_tex, 1=input_mip1, 2=input_mip2, 3=depth_tex, 4=output + WGPUBindGroupLayoutEntry bgl_entries[5] = {}; + + // Binding 0: Input texture (mip 0) + bgl_entries[0].binding = 0; + bgl_entries[0].visibility = WGPUShaderStage_Compute; + bgl_entries[0].texture.sampleType = WGPUTextureSampleType_Float; + bgl_entries[0].texture.viewDimension = WGPUTextureViewDimension_2D; + + // Binding 1: Input texture (mip 1) + bgl_entries[1].binding = 1; + bgl_entries[1].visibility = WGPUShaderStage_Compute; + bgl_entries[1].texture.sampleType = WGPUTextureSampleType_Float; + bgl_entries[1].texture.viewDimension = WGPUTextureViewDimension_2D; + + // Binding 2: Input texture (mip 2) + bgl_entries[2].binding = 2; + bgl_entries[2].visibility = WGPUShaderStage_Compute; + bgl_entries[2].texture.sampleType = WGPUTextureSampleType_Float; + bgl_entries[2].texture.viewDimension = WGPUTextureViewDimension_2D; + + // Binding 3: Depth texture + bgl_entries[3].binding = 3; + bgl_entries[3].visibility = WGPUShaderStage_Compute; + bgl_entries[3].texture.sampleType = WGPUTextureSampleType_Float; + bgl_entries[3].texture.viewDimension = WGPUTextureViewDimension_2D; + + // Binding 4: Output (static features) + bgl_entries[4].binding = 4; + bgl_entries[4].visibility = WGPUShaderStage_Compute; + bgl_entries[4].storageTexture.access = WGPUStorageTextureAccess_WriteOnly; + bgl_entries[4].storageTexture.format = WGPUTextureFormat_RGBA32Uint; + bgl_entries[4].storageTexture.viewDimension = WGPUTextureViewDimension_2D; + + WGPUBindGroupLayoutDescriptor bgl_desc = {}; + bgl_desc.entryCount = 5; + bgl_desc.entries = bgl_entries; + + WGPUBindGroupLayout static_bgl = wgpuDeviceCreateBindGroupLayout(ctx_.device, &bgl_desc); + + // Update pipeline layout + WGPUPipelineLayoutDescriptor pl_desc = {}; + pl_desc.bindGroupLayoutCount = 1; + pl_desc.bindGroupLayouts = &static_bgl; + WGPUPipelineLayout pipeline_layout = wgpuDeviceCreatePipelineLayout(ctx_.device, &pl_desc); + + // Recreate pipeline with proper layout + WGPUComputePipelineDescriptor pipeline_desc2 = {}; + pipeline_desc2.compute.module = wgpuDeviceCreateShaderModule(ctx_.device, &shader_desc); + pipeline_desc2.compute.entryPoint = str_view("main"); + pipeline_desc2.layout = pipeline_layout; + + if (static_pipeline_) wgpuComputePipelineRelease(static_pipeline_); + static_pipeline_ = wgpuDeviceCreateComputePipeline(ctx_.device, &pipeline_desc2); + + wgpuShaderModuleRelease(pipeline_desc2.compute.module); + wgpuPipelineLayoutRelease(pipeline_layout); + wgpuBindGroupLayoutRelease(static_bgl); + + // CNN layer compute pipeline (storage buffer version) + if (layer_info_.empty()) return; // No weights loaded + + size_t layer_shader_size = 0; + const char* layer_code = (const char*)GetAsset(AssetId::ASSET_SHADER_CNN_V2_COMPUTE, &layer_shader_size); + + if (!layer_code || layer_shader_size == 0) return; + + WGPUShaderSourceWGSL layer_wgsl = {}; + layer_wgsl.chain.sType = WGPUSType_ShaderSourceWGSL; + layer_wgsl.code = str_view(layer_code); + + WGPUShaderModuleDescriptor layer_shader_desc = {}; + layer_shader_desc.nextInChain = &layer_wgsl.chain; + + WGPUShaderModule layer_module = wgpuDeviceCreateShaderModule(ctx_.device, &layer_shader_desc); + if (!layer_module) return; + + // Create bind group layout for layer compute + // 0=static_features, 1=layer_input, 2=output, 3=weights, 4=params + WGPUBindGroupLayoutEntry layer_bgl_entries[5] = {}; + + // Binding 0: Static features (texture) + layer_bgl_entries[0].binding = 0; + layer_bgl_entries[0].visibility = WGPUShaderStage_Compute; + layer_bgl_entries[0].texture.sampleType = WGPUTextureSampleType_Uint; + layer_bgl_entries[0].texture.viewDimension = WGPUTextureViewDimension_2D; + + // Binding 1: Layer input (texture) + layer_bgl_entries[1].binding = 1; + layer_bgl_entries[1].visibility = WGPUShaderStage_Compute; + layer_bgl_entries[1].texture.sampleType = WGPUTextureSampleType_Uint; + layer_bgl_entries[1].texture.viewDimension = WGPUTextureViewDimension_2D; + + // Binding 2: Output (storage texture) + layer_bgl_entries[2].binding = 2; + layer_bgl_entries[2].visibility = WGPUShaderStage_Compute; + layer_bgl_entries[2].storageTexture.access = WGPUStorageTextureAccess_WriteOnly; + layer_bgl_entries[2].storageTexture.format = WGPUTextureFormat_RGBA32Uint; + layer_bgl_entries[2].storageTexture.viewDimension = WGPUTextureViewDimension_2D; + + // Binding 3: Weights (storage buffer) + layer_bgl_entries[3].binding = 3; + layer_bgl_entries[3].visibility = WGPUShaderStage_Compute; + layer_bgl_entries[3].buffer.type = WGPUBufferBindingType_ReadOnlyStorage; + + // Binding 4: Layer params (uniform buffer) + layer_bgl_entries[4].binding = 4; + layer_bgl_entries[4].visibility = WGPUShaderStage_Compute; + layer_bgl_entries[4].buffer.type = WGPUBufferBindingType_Uniform; + layer_bgl_entries[4].buffer.minBindingSize = sizeof(LayerParams); + + WGPUBindGroupLayoutDescriptor layer_bgl_desc = {}; + layer_bgl_desc.entryCount = 5; + layer_bgl_desc.entries = layer_bgl_entries; + + WGPUBindGroupLayout layer_bgl = wgpuDeviceCreateBindGroupLayout(ctx_.device, &layer_bgl_desc); + + WGPUPipelineLayoutDescriptor layer_pl_desc = {}; + layer_pl_desc.bindGroupLayoutCount = 1; + layer_pl_desc.bindGroupLayouts = &layer_bgl; + + WGPUPipelineLayout layer_pipeline_layout = wgpuDeviceCreatePipelineLayout(ctx_.device, &layer_pl_desc); + + WGPUComputePipelineDescriptor layer_pipeline_desc = {}; + layer_pipeline_desc.compute.module = layer_module; + layer_pipeline_desc.compute.entryPoint = str_view("main"); + layer_pipeline_desc.layout = layer_pipeline_layout; + + layer_pipeline_ = wgpuDeviceCreateComputePipeline(ctx_.device, &layer_pipeline_desc); + + wgpuShaderModuleRelease(layer_module); + wgpuPipelineLayoutRelease(layer_pipeline_layout); + wgpuBindGroupLayoutRelease(layer_bgl); +} + +void CNNv2Effect::update_bind_group(WGPUTextureView input_view) { + if (!static_pipeline_) return; + + // Cache input view + current_input_view_ = input_view; + + // Release old bind group + if (static_bind_group_) { + wgpuBindGroupRelease(static_bind_group_); + static_bind_group_ = nullptr; + } + + // Create bind group for static features compute + WGPUBindGroupEntry bg_entries[5] = {}; + + // Binding 0: Input (mip 0) + bg_entries[0].binding = 0; + bg_entries[0].textureView = input_view; + + // Binding 1: Input (mip 1) + bg_entries[1].binding = 1; + bg_entries[1].textureView = input_mip_view_[0]; // Use mip 0 for now + + // Binding 2: Input (mip 2) + bg_entries[2].binding = 2; + bg_entries[2].textureView = input_mip_view_[0]; // Use mip 0 for now + + // Binding 3: Depth (use input for now, no depth available) + bg_entries[3].binding = 3; + bg_entries[3].textureView = input_view; + + // Binding 4: Output (static features) + bg_entries[4].binding = 4; + bg_entries[4].textureView = static_features_view_; + + WGPUBindGroupDescriptor bg_desc = {}; + bg_desc.layout = wgpuComputePipelineGetBindGroupLayout(static_pipeline_, 0); + bg_desc.entryCount = 5; + bg_desc.entries = bg_entries; + + static_bind_group_ = wgpuDeviceCreateBindGroup(ctx_.device, &bg_desc); + + wgpuBindGroupLayoutRelease(bg_desc.layout); + + // Create layer bind groups + if (!layer_pipeline_ || layer_info_.empty()) return; + + // Release old layer bind groups + for (auto bg : layer_bind_groups_) { + wgpuBindGroupRelease(bg); + } + layer_bind_groups_.clear(); + + // Get bind group layout from layer pipeline + WGPUBindGroupLayout layer_bgl = wgpuComputePipelineGetBindGroupLayout(layer_pipeline_, 0); + + // Create bind group for each layer + for (size_t i = 0; i < layer_info_.size(); ++i) { + WGPUBindGroupEntry layer_entries[5] = {}; + + // Binding 0: Static features (constant) + layer_entries[0].binding = 0; + layer_entries[0].textureView = static_features_view_; + + // Binding 1: Layer input (ping-pong: use previous layer's output) + // First layer uses static features as input, others use ping-pong buffers + layer_entries[1].binding = 1; + layer_entries[1].textureView = (i == 0) ? static_features_view_ : layer_views_[i % 2]; + + // Binding 2: Output texture (ping-pong) + layer_entries[2].binding = 2; + layer_entries[2].textureView = layer_views_[(i + 1) % 2]; + + // Binding 3: Weights buffer (constant) + layer_entries[3].binding = 3; + layer_entries[3].buffer = weights_buffer_; + layer_entries[3].size = wgpuBufferGetSize(weights_buffer_); + + // Binding 4: Layer params (will be updated per dispatch) + layer_entries[4].binding = 4; + layer_entries[4].buffer = layer_params_buffer_; + layer_entries[4].size = sizeof(LayerParams); + + WGPUBindGroupDescriptor layer_bg_desc = {}; + layer_bg_desc.layout = layer_bgl; + layer_bg_desc.entryCount = 5; + layer_bg_desc.entries = layer_entries; + + WGPUBindGroup layer_bg = wgpuDeviceCreateBindGroup(ctx_.device, &layer_bg_desc); + layer_bind_groups_.push_back(layer_bg); + } + + wgpuBindGroupLayoutRelease(layer_bgl); +} + +void CNNv2Effect::compute(WGPUCommandEncoder encoder, + const CommonPostProcessUniforms& uniforms) { + (void)uniforms; + if (!initialized_ || !static_pipeline_ || !static_bind_group_) return; + + // Pass 1: Compute static features + WGPUComputePassEncoder pass = wgpuCommandEncoderBeginComputePass(encoder, nullptr); + + wgpuComputePassEncoderSetPipeline(pass, static_pipeline_); + wgpuComputePassEncoderSetBindGroup(pass, 0, static_bind_group_, 0, nullptr); + + // Dispatch workgroups (8×8 threads per group) + uint32_t workgroups_x = (width_ + 7) / 8; + uint32_t workgroups_y = (height_ + 7) / 8; + wgpuComputePassEncoderDispatchWorkgroups(pass, workgroups_x, workgroups_y, 1); + + wgpuComputePassEncoderEnd(pass); + wgpuComputePassEncoderRelease(pass); + + // Execute CNN layer passes + if (!layer_pipeline_ || layer_bind_groups_.empty()) return; + + for (size_t i = 0; i < layer_info_.size(); ++i) { + const LayerInfo& info = layer_info_[i]; + + // Update layer params uniform buffer + LayerParams params; + params.kernel_size = info.kernel_size; + params.in_channels = info.in_channels; + params.out_channels = info.out_channels; + params.weight_offset = info.weight_offset; + params.is_output_layer = (i == layer_info_.size() - 1) ? 1 : 0; + + wgpuQueueWriteBuffer(ctx_.queue, layer_params_buffer_, 0, ¶ms, sizeof(params)); + + // Execute layer compute pass + WGPUComputePassEncoder layer_pass = wgpuCommandEncoderBeginComputePass(encoder, nullptr); + + wgpuComputePassEncoderSetPipeline(layer_pass, layer_pipeline_); + wgpuComputePassEncoderSetBindGroup(layer_pass, 0, layer_bind_groups_[i], 0, nullptr); + + wgpuComputePassEncoderDispatchWorkgroups(layer_pass, workgroups_x, workgroups_y, 1); + + wgpuComputePassEncoderEnd(layer_pass); + wgpuComputePassEncoderRelease(layer_pass); + } +} + +void CNNv2Effect::render(WGPURenderPassEncoder pass, + const CommonPostProcessUniforms& uniforms) { + (void)pass; + (void)uniforms; + // Compute-only effect, rendering is done by default composite pass +} + +void CNNv2Effect::cleanup() { + if (static_features_view_) wgpuTextureViewRelease(static_features_view_); + if (static_features_tex_) wgpuTextureRelease(static_features_tex_); + if (static_bind_group_) wgpuBindGroupRelease(static_bind_group_); + if (static_pipeline_) wgpuComputePipelineRelease(static_pipeline_); + + if (layer_pipeline_) wgpuComputePipelineRelease(layer_pipeline_); + if (weights_buffer_) wgpuBufferRelease(weights_buffer_); + if (layer_params_buffer_) wgpuBufferRelease(layer_params_buffer_); + + for (int i = 0; i < 3; ++i) { + if (input_mip_view_[i]) wgpuTextureViewRelease(input_mip_view_[i]); + } + if (input_mip_tex_) wgpuTextureRelease(input_mip_tex_); + + for (auto view : layer_views_) wgpuTextureViewRelease(view); + for (auto tex : layer_textures_) wgpuTextureRelease(tex); + for (auto bg : layer_bind_groups_) wgpuBindGroupRelease(bg); + + layer_views_.clear(); + layer_textures_.clear(); + layer_bind_groups_.clear(); + layer_info_.clear(); + + initialized_ = false; +} |
