1 files changed, 497 insertions, 0 deletions
diff --git a/cnn_v2/src/cnn_v2_effect.cc b/cnn_v2/src/cnn_v2_effect.cc
new file mode 100644
index 0000000..60538d4
--- /dev/null
+++ b/cnn_v2/src/cnn_v2_effect.cc
@@ -0,0 +1,497 @@
+// CNN v2 Effect Implementation
+
+#include "cnn_v2_effect.h"
+
+#if defined(USE_TEST_ASSETS)
+#include "test_assets.h"
+#else
+#include "generated/assets.h"
+#endif
+
+#include "gpu/bind_group_builder.h"
+#include "gpu/gpu.h"
+#include "util/asset_manager.h"
+#include "util/fatal_error.h"
+#include <cstring>
+
+// Default construction: every GPU handle starts out null, blend_amount_ is
+// 1.0, mip_level_ is 0 and the effect is flagged as not initialized. No GPU
+// objects are created here.
+CNNv2Effect::CNNv2Effect(const GpuContext& ctx)
+    : PostProcessEffect(ctx), static_pipeline_(nullptr),
+      static_bind_group_(nullptr), static_params_buffer_(nullptr),
+      static_features_tex_(nullptr), static_features_view_(nullptr),
+      linear_sampler_(nullptr), layer_pipeline_(nullptr),
+      weights_buffer_(nullptr), input_mip_tex_(nullptr),
+      current_input_view_(nullptr), blend_amount_(1.0f), mip_level_(0),
+      initialized_(false) {
+  // Null out every per-mip view slot.
+  for (auto& mip_view : input_mip_view_) {
+    mip_view = nullptr;
+  }
+}
+
+// Parameterized construction: identical to the default constructor except
+// that blend_amount_ is taken from params.blend_amount. No GPU objects are
+// created here.
+CNNv2Effect::CNNv2Effect(const GpuContext& ctx, const CNNv2EffectParams& params)
+    : PostProcessEffect(ctx), static_pipeline_(nullptr),
+      static_bind_group_(nullptr), static_params_buffer_(nullptr),
+      static_features_tex_(nullptr), static_features_view_(nullptr),
+      linear_sampler_(nullptr), layer_pipeline_(nullptr),
+      weights_buffer_(nullptr), input_mip_tex_(nullptr),
+      current_input_view_(nullptr), blend_amount_(params.blend_amount),
+      mip_level_(0), initialized_(false) {
+  // Null out every per-mip view slot.
+  for (auto& mip_view : input_mip_view_) {
+    mip_view = nullptr;
+  }
+}
+
+// Releases every GPU object this effect still holds by delegating to
+// cleanup().
+CNNv2Effect::~CNNv2Effect() {
+  this->cleanup();
+}
+
+// One-time setup: loads the weights asset, then builds textures and
+// pipelines, and finally marks the effect as initialized. Calls made while
+// initialized_ is already set do nothing. The MainSequence argument is unused.
+void CNNv2Effect::init(MainSequence* demo) {
+  (void)demo;
+
+  if (!initialized_) {
+    load_weights();
+    create_textures();
+    create_pipelines();
+    initialized_ = true;
+  }
+}
+
+// Rebuilds all size-dependent GPU resources for the new output size.
+//
+// cleanup() drops everything, including the weights buffer, the per-layer
+// uniform buffers and layer_info_, and clears initialized_. The weights are
+// therefore reloaded and the flag restored afterwards; otherwise compute()
+// would stay a no-op after the first resize.
+//
+// If the effect has not been initialized yet, only the base-class size is
+// updated: init() builds the resources later, and creating them here as well
+// would leave init() overwriting (and leaking) the first set.
+//
+// Bind groups are released by cleanup() and are not recreated here; they come
+// back on the next update_bind_group() call.
+void CNNv2Effect::resize(int width, int height) {
+  PostProcessEffect::resize(width, height);
+  if (!initialized_)
+    return;
+
+  cleanup();
+  load_weights();
+  create_textures();
+  create_pipelines();
+  initialized_ = true;
+}
+
+// Loads the CNN v2 weights asset: parses the header and per-layer table into
+// layer_info_, uploads the weight payload to weights_buffer_, and creates one
+// uniform buffer per layer in layer_params_buffers_.
+//
+// Asset layout as parsed here (every field is a u32):
+//   header : magic, version, num_layers, total_weights [, mip_level (v2)]
+//   layers : num_layers x {kernel_size, in_channels, out_channels,
+//                          weight_offset, weight_count}
+//   payload: everything after the layer table
+//
+// If the asset is missing or shorter than 20 bytes the function returns
+// without touching any state, which leaves layer_info_ empty.
+void CNNv2Effect::load_weights() {
+  // Load binary weights asset
+  size_t weights_size = 0;
+  const uint8_t* weights_data =
+      (const uint8_t*)GetAsset(AssetId::ASSET_WEIGHTS_CNN_V2, &weights_size);
+
+  if (!weights_data || weights_size < 20) {
+    // Weights not available - effect will skip
+    return;
+  }
+
+  // Parse header (header[3], total_weights, is not needed here).
+  // NOTE(review): the u32 reads assume the asset data is 4-byte aligned -
+  // confirm against GetAsset().
+  const uint32_t* header = (const uint32_t*)weights_data;
+  const uint32_t magic = header[0];
+  const uint32_t version = header[1];
+  const uint32_t num_layers = header[2];
+
+  // NOTE(review): FATAL_CHECK is used here with the *failure* condition as
+  // its first argument - confirm against util/fatal_error.h.
+  FATAL_CHECK(magic != 0x324e4e43, "Invalid CNN v2 weights magic\n"); // 'CNN2'
+
+  // Support both version 1 (16-byte header) and version 2 (20-byte header with
+  // mip_level)
+  // TODO: Version 3 should include feature descriptor for arbitrary
+  // layout/ordering
+  if (version == 1) {
+    mip_level_ = 0; // Default for v1
+  } else if (version == 2) {
+    mip_level_ = header[4];
+  } else {
+    FATAL_ERROR("Unsupported CNN v2 weights version: %u\n", version);
+  }
+
+  // Sizes of the header and the layer table. The header size depends on the
+  // version: v1 = 4 u32 (16 bytes), v2 = 5 u32 (20 bytes). The same value
+  // drives both the layer-table pointer and the payload offset, so v1 files
+  // no longer lose the first 4 bytes of their weights.
+  const uint32_t header_u32_count = (version == 1) ? 4 : 5;
+  const uint32_t layer_u32_count = 5; // 5 u32 (20 bytes) per layer
+  const size_t header_size = header_u32_count * sizeof(uint32_t);
+  // 64-bit math so a corrupt num_layers cannot wrap the product.
+  const uint64_t layer_info_size =
+      (uint64_t)num_layers * layer_u32_count * sizeof(uint32_t);
+
+  // Reject a layer table that does not fit in the asset before reading it.
+  // weights_size >= 20 >= header_size, so the subtraction cannot underflow.
+  const bool truncated =
+      layer_info_size > (uint64_t)(weights_size - header_size);
+  FATAL_CHECK(truncated, "CNN v2 weights truncated: bad layer table\n");
+  if (truncated) {
+    // Only reachable if fatal checks do not abort; skip like a missing asset.
+    return;
+  }
+
+  // Parse layer info (20 bytes per layer)
+  const uint32_t* layer_data = header + header_u32_count;
+  for (uint32_t i = 0; i < num_layers; ++i) {
+    const uint32_t* entry = layer_data + i * layer_u32_count;
+    LayerInfo info;
+    info.kernel_size = entry[0];
+    info.in_channels = entry[1];
+    info.out_channels = entry[2];
+    info.weight_offset = entry[3];
+    info.weight_count = entry[4];
+    layer_info_.push_back(info);
+  }
+
+  // Create GPU storage buffer for weights (skip header + layer info, upload
+  // only weights)
+  const size_t weights_offset = header_size + (size_t)layer_info_size;
+  const size_t weights_only_size = weights_size - weights_offset;
+
+  WGPUBufferDescriptor buffer_desc = {};
+  buffer_desc.size = weights_only_size;
+  buffer_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst;
+  buffer_desc.mappedAtCreation = false;
+
+  weights_buffer_ = wgpuDeviceCreateBuffer(ctx_.device, &buffer_desc);
+
+  // Upload only weights (skip header + layer info)
+  wgpuQueueWriteBuffer(ctx_.queue, weights_buffer_, 0,
+                       weights_data + weights_offset, weights_only_size);
+
+  // Create uniform buffers for layer params (one per layer)
+  for (uint32_t i = 0; i < num_layers; ++i) {
+    WGPUBufferDescriptor params_desc = {};
+    params_desc.size = sizeof(LayerParams);
+    params_desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst;
+    params_desc.mappedAtCreation = false;
+
+    WGPUBuffer buf = wgpuDeviceCreateBuffer(ctx_.device, &params_desc);
+    layer_params_buffers_.push_back(buf);
+  }
+}
+
+// Creates every texture/buffer whose size depends on width_ x height_:
+//   - static_features_tex_/view_: RGBA32Uint storage texture,
+//   - input_mip_tex_ plus one view per mip level in input_mip_view_,
+//   - two RGBA32Uint ping-pong textures appended to layer_textures_/views_,
+//   - static_params_buffer_: uniform buffer sized for StaticFeatureParams.
+// Existing handles are overwritten, not released; callers run cleanup()
+// first when rebuilding.
+void CNNv2Effect::create_textures() {
+  // Number of mip levels of the input texture (one view per level).
+  constexpr int kInputMipLevels = 3;
+
+  // Static features texture (8×f16 packed as 4×u32)
+  TextureWithView static_tex = gpu_create_storage_texture_2d(
+      ctx_.device, width_, height_, WGPUTextureFormat_RGBA32Uint);
+  static_features_tex_ = static_tex.texture;
+  static_features_view_ = static_tex.view;
+
+  // Input texture with mips (for multi-scale features)
+  TextureWithView input_mip = gpu_create_texture_2d(
+      ctx_.device, width_, height_, WGPUTextureFormat_RGBA8Unorm,
+      (WGPUTextureUsage)(WGPUTextureUsage_TextureBinding |
+                         WGPUTextureUsage_CopyDst),
+      kInputMipLevels);
+  input_mip_tex_ = input_mip.texture;
+
+  // Only the per-mip views below are kept. The view returned together with
+  // the texture is never stored, so release it here instead of leaking it on
+  // every (re)creation.
+  if (input_mip.view) {
+    wgpuTextureViewRelease(input_mip.view);
+    input_mip.view = nullptr;
+  }
+
+  for (int i = 0; i < kInputMipLevels; ++i) {
+    input_mip_view_[i] =
+        gpu_create_mip_view(input_mip_tex_, WGPUTextureFormat_RGBA8Unorm, i);
+  }
+
+  // Create 2 layer textures (ping-pong buffers for intermediate results)
+  // Each stores 8×f16 channels packed as 4×u32
+  for (int i = 0; i < 2; ++i) {
+    TextureWithView layer = gpu_create_storage_texture_2d(
+        ctx_.device, width_, height_, WGPUTextureFormat_RGBA32Uint);
+    layer_textures_.push_back(layer.texture);
+    layer_views_.push_back(layer.view);
+  }
+
+  // Create uniform buffer for static feature params
+  WGPUBufferDescriptor params_desc = {};
+  params_desc.size = sizeof(StaticFeatureParams);
+  params_desc.usage = WGPUBufferUsage_Uniform | WGPUBufferUsage_CopyDst;
+  params_desc.mappedAtCreation = false;
+  static_params_buffer_ = wgpuDeviceCreateBuffer(ctx_.device, &params_desc);
+}
+
+// Builds the sampler and the two compute pipelines used by this effect.
+//
+// Order of work:
+//   1. linear_sampler_ is always created.
+//   2. static_pipeline_ is built from ASSET_SHADER_CNN_V2_STATIC; when that
+//      asset is missing the function returns without building any pipeline.
+//   3. layer_pipeline_ is built from ASSET_SHADER_CNN_V2_COMPUTE, but only if
+//      layer_info_ is non-empty, the asset exists and its shader module could
+//      be created.
+// Temporary layouts and shader modules are released before returning.
+void CNNv2Effect::create_pipelines() {
+  // --- Sampler: linear filtering, clamp-to-edge on all axes ---------------
+  WGPUSamplerDescriptor smp_desc = {};
+  smp_desc.addressModeU = WGPUAddressMode_ClampToEdge;
+  smp_desc.addressModeV = WGPUAddressMode_ClampToEdge;
+  smp_desc.addressModeW = WGPUAddressMode_ClampToEdge;
+  smp_desc.magFilter = WGPUFilterMode_Linear;
+  smp_desc.minFilter = WGPUFilterMode_Linear;
+  smp_desc.mipmapFilter = WGPUMipmapFilterMode_Linear;
+  smp_desc.lodMinClamp = 0.0f;
+  smp_desc.lodMaxClamp = 32.0f;
+  smp_desc.maxAnisotropy = 1;
+  linear_sampler_ = wgpuDeviceCreateSampler(ctx_.device, &smp_desc);
+
+  // --- Static-features pipeline -------------------------------------------
+  size_t static_src_size = 0;
+  const char* static_src = (const char*)GetAsset(
+      AssetId::ASSET_SHADER_CNN_V2_STATIC, &static_src_size);
+  if (static_src == nullptr || static_src_size == 0) {
+    // Shader asset absent (e.g. test builds): nothing more to build.
+    return;
+  }
+
+  WGPUShaderSourceWGSL static_wgsl = {};
+  static_wgsl.chain.sType = WGPUSType_ShaderSourceWGSL;
+  static_wgsl.code = str_view(static_src);
+
+  WGPUShaderModuleDescriptor static_module_desc = {};
+  static_module_desc.nextInChain = &static_wgsl.chain;
+
+  // Bindings: 0=input_tex, 1=input_mip1, 2=input_mip2, 3=depth_tex, 4=output,
+  // 5=params, 6=linear_sampler
+  WGPUBindGroupLayout static_group_layout =
+      BindGroupLayoutBuilder()
+          .texture(0, WGPUShaderStage_Compute)
+          .texture(1, WGPUShaderStage_Compute)
+          .texture(2, WGPUShaderStage_Compute)
+          .texture(3, WGPUShaderStage_Compute)
+          .storage_texture(4, WGPUShaderStage_Compute,
+                           WGPUTextureFormat_RGBA32Uint)
+          .uniform(5, WGPUShaderStage_Compute, sizeof(StaticFeatureParams))
+          .sampler(6, WGPUShaderStage_Compute)
+          .build(ctx_.device);
+
+  WGPUPipelineLayoutDescriptor static_pl_desc = {};
+  static_pl_desc.bindGroupLayoutCount = 1;
+  static_pl_desc.bindGroupLayouts = &static_group_layout;
+  WGPUPipelineLayout static_pl =
+      wgpuDeviceCreatePipelineLayout(ctx_.device, &static_pl_desc);
+
+  WGPUShaderModule static_module =
+      wgpuDeviceCreateShaderModule(ctx_.device, &static_module_desc);
+
+  WGPUComputePipelineDescriptor static_pipe_desc = {};
+  static_pipe_desc.compute.module = static_module;
+  static_pipe_desc.compute.entryPoint = str_view("main");
+  static_pipe_desc.layout = static_pl;
+
+  // Drop a previously built pipeline before replacing it.
+  if (static_pipeline_) {
+    wgpuComputePipelineRelease(static_pipeline_);
+  }
+  static_pipeline_ =
+      wgpuDeviceCreateComputePipeline(ctx_.device, &static_pipe_desc);
+
+  wgpuShaderModuleRelease(static_module);
+  wgpuPipelineLayoutRelease(static_pl);
+  wgpuBindGroupLayoutRelease(static_group_layout);
+
+  // --- CNN layer pipeline (weights come from a storage buffer) ------------
+  if (layer_info_.empty()) {
+    return; // No weights loaded
+  }
+
+  size_t layer_src_size = 0;
+  const char* layer_src = (const char*)GetAsset(
+      AssetId::ASSET_SHADER_CNN_V2_COMPUTE, &layer_src_size);
+  if (layer_src == nullptr || layer_src_size == 0) {
+    return;
+  }
+
+  WGPUShaderSourceWGSL layer_wgsl_src = {};
+  layer_wgsl_src.chain.sType = WGPUSType_ShaderSourceWGSL;
+  layer_wgsl_src.code = str_view(layer_src);
+
+  WGPUShaderModuleDescriptor layer_module_desc = {};
+  layer_module_desc.nextInChain = &layer_wgsl_src.chain;
+
+  WGPUShaderModule layer_shader =
+      wgpuDeviceCreateShaderModule(ctx_.device, &layer_module_desc);
+  if (layer_shader == nullptr) {
+    return;
+  }
+
+  // 0=static_features, 1=layer_input, 2=output, 3=weights, 4=params,
+  // 5=original_input
+  WGPUBindGroupLayout layer_group_layout =
+      BindGroupLayoutBuilder()
+          .uint_texture(0, WGPUShaderStage_Compute)
+          .uint_texture(1, WGPUShaderStage_Compute)
+          .storage_texture(2, WGPUShaderStage_Compute,
+                           WGPUTextureFormat_RGBA32Uint)
+          .storage(3, WGPUShaderStage_Compute)
+          .uniform(4, WGPUShaderStage_Compute, sizeof(LayerParams))
+          .texture(5, WGPUShaderStage_Compute)
+          .build(ctx_.device);
+
+  WGPUPipelineLayoutDescriptor layer_layout_desc = {};
+  layer_layout_desc.bindGroupLayoutCount = 1;
+  layer_layout_desc.bindGroupLayouts = &layer_group_layout;
+  WGPUPipelineLayout layer_pl =
+      wgpuDeviceCreatePipelineLayout(ctx_.device, &layer_layout_desc);
+
+  WGPUComputePipelineDescriptor layer_pipe_desc = {};
+  layer_pipe_desc.compute.module = layer_shader;
+  layer_pipe_desc.compute.entryPoint = str_view("main");
+  layer_pipe_desc.layout = layer_pl;
+  layer_pipeline_ =
+      wgpuDeviceCreateComputePipeline(ctx_.device, &layer_pipe_desc);
+
+  wgpuShaderModuleRelease(layer_shader);
+  wgpuPipelineLayoutRelease(layer_pl);
+  wgpuBindGroupLayoutRelease(layer_group_layout);
+}
+
+// Rebuilds the bind groups against a new input texture view.
+//
+// Does nothing when the static pipeline is missing. Otherwise the input view
+// is cached in current_input_view_, static_bind_group_ is recreated, and, if
+// a layer pipeline and layer info exist, one bind group per layer is rebuilt
+// into layer_bind_groups_.
+void CNNv2Effect::update_bind_group(WGPUTextureView input_view) {
+  if (static_pipeline_ == nullptr) {
+    return;
+  }
+
+  current_input_view_ = input_view;
+
+  // Drop the previous static bind group, if any.
+  if (static_bind_group_ != nullptr) {
+    wgpuBindGroupRelease(static_bind_group_);
+    static_bind_group_ = nullptr;
+  }
+
+  // Static-features bind group, filled in by hand (it includes a storage
+  // texture binding).
+  // NOTE(review): bindings 1 and 2 receive mip views [0] and [1] while the
+  // layout comment in create_pipelines() names them input_mip1/input_mip2 -
+  // confirm the intended mip indices against the shader.
+  WGPUTextureView second_mip = input_mip_view_[0];
+  if (input_mip_view_[1]) {
+    second_mip = input_mip_view_[1];
+  }
+
+  WGPUBindGroupEntry entries[7] = {};
+  entries[0].binding = 0;
+  entries[0].textureView = input_view;
+  entries[1].binding = 1;
+  entries[1].textureView = input_mip_view_[0];
+  entries[2].binding = 2;
+  entries[2].textureView = second_mip;
+  entries[3].binding = 3;
+  entries[3].textureView = input_view; // slot 3 gets the input view as well
+  entries[4].binding = 4;
+  entries[4].textureView = static_features_view_;
+  entries[5].binding = 5;
+  entries[5].buffer = static_params_buffer_;
+  entries[5].size = sizeof(StaticFeatureParams);
+  entries[6].binding = 6;
+  entries[6].sampler = linear_sampler_;
+
+  WGPUBindGroupLayout static_layout =
+      wgpuComputePipelineGetBindGroupLayout(static_pipeline_, 0);
+  WGPUBindGroupDescriptor static_desc = {};
+  static_desc.layout = static_layout;
+  static_desc.entryCount = 7;
+  static_desc.entries = entries;
+  static_bind_group_ = wgpuDeviceCreateBindGroup(ctx_.device, &static_desc);
+  wgpuBindGroupLayoutRelease(static_layout);
+
+  // Per-layer bind groups need both the layer pipeline and parsed layers.
+  if (layer_pipeline_ == nullptr || layer_info_.empty()) {
+    return;
+  }
+
+  // Drop the previous per-layer bind groups.
+  for (WGPUBindGroup stale : layer_bind_groups_) {
+    wgpuBindGroupRelease(stale);
+  }
+  layer_bind_groups_.clear();
+
+  WGPUBindGroupLayout per_layer_layout =
+      wgpuComputePipelineGetBindGroupLayout(layer_pipeline_, 0);
+
+  const size_t layer_count = layer_info_.size();
+  for (size_t layer = 0; layer < layer_count; ++layer) {
+    // Layer 0 reads the static features; later layers read the ping-pong
+    // texture selected by (layer % 2) and write the other one.
+    WGPUTextureView source_view = static_features_view_;
+    if (layer != 0) {
+      source_view = layer_views_[layer % 2];
+    }
+    WGPUTextureView target_view = layer_views_[(layer + 1) % 2];
+
+    WGPUBindGroup group =
+        BindGroupBuilder()
+            .texture(0, static_features_view_)
+            .texture(1, source_view)
+            .texture(2, target_view)
+            .buffer(3, weights_buffer_, wgpuBufferGetSize(weights_buffer_))
+            .buffer(4, layer_params_buffers_[layer], sizeof(LayerParams))
+            .texture(5, input_view)
+            .build(ctx_.device, per_layer_layout);
+
+    layer_bind_groups_.push_back(group);
+  }
+
+  wgpuBindGroupLayoutRelease(per_layer_layout);
+}
+
+// Records the compute work for one frame into `encoder`.
+//
+// Pass 1 runs the static-features pipeline over the whole frame. Then, if a
+// layer pipeline and layer bind groups exist, one compute pass is recorded
+// per entry of layer_info_, each preceded by an upload of that layer's
+// LayerParams. Returns without recording anything when the effect is not
+// initialized or the static pipeline / bind group is missing.
+void CNNv2Effect::compute(WGPUCommandEncoder encoder,
+                          const CommonPostProcessUniforms& uniforms) {
+  const bool ready = initialized_ && static_pipeline_ && static_bind_group_;
+  if (!ready) {
+    return;
+  }
+
+  // Blend factor, optionally scaled by the beat phase.
+  const float effective_blend =
+      beat_modulated_ ? blend_amount_ * uniforms.beat_phase * beat_scale_
+                      : blend_amount_;
+
+  // Upload the static-feature parameters.
+  StaticFeatureParams feature_params;
+  feature_params.mip_level = mip_level_;
+  feature_params.padding[0] = 0;
+  feature_params.padding[1] = 0;
+  feature_params.padding[2] = 0;
+  wgpuQueueWriteBuffer(ctx_.queue, static_params_buffer_, 0, &feature_params,
+                       sizeof(feature_params));
+
+  // One workgroup covers an 8x8 tile; round the grid up.
+  const uint32_t groups_x = (width_ + 7) / 8;
+  const uint32_t groups_y = (height_ + 7) / 8;
+
+  // Pass 1: static features.
+  WGPUComputePassEncoder static_pass =
+      wgpuCommandEncoderBeginComputePass(encoder, nullptr);
+  wgpuComputePassEncoderSetPipeline(static_pass, static_pipeline_);
+  wgpuComputePassEncoderSetBindGroup(static_pass, 0, static_bind_group_, 0,
+                                     nullptr);
+  wgpuComputePassEncoderDispatchWorkgroups(static_pass, groups_x, groups_y, 1);
+  wgpuComputePassEncoderEnd(static_pass);
+  wgpuComputePassEncoderRelease(static_pass);
+
+  // CNN layer passes are optional.
+  if (layer_pipeline_ == nullptr || layer_bind_groups_.empty()) {
+    return;
+  }
+
+  const size_t layer_count = layer_info_.size();
+  for (size_t layer = 0; layer < layer_count; ++layer) {
+    const LayerInfo& desc = layer_info_[layer];
+    const bool is_first = (layer == 0);
+    const bool is_last = (layer == layer_count - 1);
+
+    // Each layer owns its uniform buffer, so the uploads do not clash.
+    LayerParams layer_params;
+    layer_params.kernel_size = desc.kernel_size;
+    layer_params.in_channels = desc.in_channels;
+    layer_params.out_channels = desc.out_channels;
+    layer_params.weight_offset = desc.weight_offset;
+    layer_params.is_output_layer = is_last ? 1 : 0;
+    layer_params.blend_amount = effective_blend;
+    layer_params.is_layer_0 = is_first ? 1 : 0;
+    wgpuQueueWriteBuffer(ctx_.queue, layer_params_buffers_[layer], 0,
+                         &layer_params, sizeof(layer_params));
+
+    WGPUComputePassEncoder cnn_pass =
+        wgpuCommandEncoderBeginComputePass(encoder, nullptr);
+    wgpuComputePassEncoderSetPipeline(cnn_pass, layer_pipeline_);
+    wgpuComputePassEncoderSetBindGroup(cnn_pass, 0, layer_bind_groups_[layer],
+                                       0, nullptr);
+    wgpuComputePassEncoderDispatchWorkgroups(cnn_pass, groups_x, groups_y, 1);
+    wgpuComputePassEncoderEnd(cnn_pass);
+    wgpuComputePassEncoderRelease(cnn_pass);
+  }
+}
+
+// Intentionally empty: this effect records no draw calls of its own. Both
+// arguments are ignored.
+void CNNv2Effect::render(WGPURenderPassEncoder pass,
+                         const CommonPostProcessUniforms& uniforms) {
+  // Compute-only effect, rendering is done by default composite pass
+  static_cast<void>(pass);
+  static_cast<void>(uniforms);
+}
+
+// Releases every GPU object owned by this effect and returns it to the
+// "not initialized" state.
+//
+// Each handle is reset to nullptr right after it is released. cleanup() runs
+// from both resize() and the destructor, and create_pipelines() and
+// update_bind_group() release whatever non-null handle they find, so a stale
+// pointer left behind here would be released a second time. With the resets,
+// calling cleanup() repeatedly is safe.
+//
+// current_input_view_ is left untouched: this file only caches it and never
+// releases it.
+void CNNv2Effect::cleanup() {
+  if (static_features_view_) {
+    wgpuTextureViewRelease(static_features_view_);
+    static_features_view_ = nullptr;
+  }
+  if (static_features_tex_) {
+    wgpuTextureRelease(static_features_tex_);
+    static_features_tex_ = nullptr;
+  }
+  if (static_bind_group_) {
+    wgpuBindGroupRelease(static_bind_group_);
+    static_bind_group_ = nullptr;
+  }
+  if (static_params_buffer_) {
+    wgpuBufferRelease(static_params_buffer_);
+    static_params_buffer_ = nullptr;
+  }
+  if (static_pipeline_) {
+    wgpuComputePipelineRelease(static_pipeline_);
+    static_pipeline_ = nullptr;
+  }
+  if (linear_sampler_) {
+    wgpuSamplerRelease(linear_sampler_);
+    linear_sampler_ = nullptr;
+  }
+
+  if (layer_pipeline_) {
+    wgpuComputePipelineRelease(layer_pipeline_);
+    layer_pipeline_ = nullptr;
+  }
+  if (weights_buffer_) {
+    wgpuBufferRelease(weights_buffer_);
+    weights_buffer_ = nullptr;
+  }
+  for (auto buf : layer_params_buffers_)
+    wgpuBufferRelease(buf);
+  layer_params_buffers_.clear();
+
+  for (int i = 0; i < 3; ++i) {
+    if (input_mip_view_[i]) {
+      wgpuTextureViewRelease(input_mip_view_[i]);
+      input_mip_view_[i] = nullptr;
+    }
+  }
+  if (input_mip_tex_) {
+    wgpuTextureRelease(input_mip_tex_);
+    input_mip_tex_ = nullptr;
+  }
+
+  for (auto view : layer_views_)
+    wgpuTextureViewRelease(view);
+  for (auto tex : layer_textures_)
+    wgpuTextureRelease(tex);
+  for (auto bg : layer_bind_groups_)
+    wgpuBindGroupRelease(bg);
+
+  layer_views_.clear();
+  layer_textures_.clear();
+  layer_bind_groups_.clear();
+  layer_info_.clear();
+
+  initialized_ = false;
+}