// CNN v3 Effect — U-Net + FiLM inference pass
// Runs 5 compute passes (enc0→enc1→bottleneck→dec1→dec0) on G-buffer feature
// textures produced by GBufferEffect.
//
// Architecture: enc_channels=[8,16]
//   enc0:       Conv(20→8,  3×3) + FiLM8  + ReLU       H×W      rgba32uint
//   enc1:       Conv(8→16,  3×3) + FiLM16 + ReLU       H/2×W/2  2× rgba32uint
//   bottleneck: Conv(16→16, 3×3, dil=2)   + ReLU       H/4×W/4  2× rgba32uint
//   dec1:       Conv(32→8,  3×3) + FiLM8  + ReLU       H/2×W/2  rgba32uint
//   dec0:       Conv(16→4,  3×3) + FiLM4  + ReLU + sig H×W      rgba16float
//
// Inputs:  feat_tex0, feat_tex1   (rgba32uint, 20-channel G-buffer)
// Output:  output_tex             (rgba16float, 4-channel RGBA)

#pragma once

#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

#include "gpu/effect.h"
#include "gpu/sequence.h"
#include "gpu/uniform_helper.h"
#include "gpu/wgpu_resource.h"

// ---------------------------------------------------------------------------
// Per-pass params uniform layouts (mirror WGSL Params structs exactly)
// ---------------------------------------------------------------------------

// Params uniform for the 8-channel FiLM passes (enc0, dec1). Gamma and beta
// are each carried as a lo/hi pair of vec4f.
//
// The C++ struct must match the WGSL Params struct byte-for-byte:
//   offset  0: weight_offset (u32)
//   offset  4-15: implicit pad, vec3u aligned to 16
//   offset 16: _pad (vec3u, 12 bytes)
//   offset 28-31: implicit pad
//   offset 32: gamma_lo (vec4f)
//   offset 48: gamma_hi (vec4f)
//   offset 64: beta_lo  (vec4f)
//   offset 80: beta_hi  (vec4f)
//   total: 96 bytes
//
// On the C++ side the whole 4-31 range (implicit padding plus the WGSL _pad
// member) is represented by a single uint32_t[7] filler.
struct CnnV3Params8ch {
  uint32_t weight_offset;  // offset 0
  uint32_t _pad[7];        // offsets 4-31
  float gamma_lo[4];       // offset 32
  float gamma_hi[4];       // offset 48
  float beta_lo[4];        // offset 64
  float beta_hi[4];        // offset 80
};
static_assert(sizeof(CnnV3Params8ch) == 96, "CnnV3Params8ch must be 96 bytes");
// A size check alone cannot detect reordered fields, so pin each offset too.
static_assert(offsetof(CnnV3Params8ch, weight_offset) == 0,
              "CnnV3Params8ch::weight_offset must be at offset 0");
static_assert(offsetof(CnnV3Params8ch, gamma_lo) == 32,
              "CnnV3Params8ch::gamma_lo must be at offset 32");
static_assert(offsetof(CnnV3Params8ch, gamma_hi) == 48,
              "CnnV3Params8ch::gamma_hi must be at offset 48");
static_assert(offsetof(CnnV3Params8ch, beta_lo) == 64,
              "CnnV3Params8ch::beta_lo must be at offset 64");
static_assert(offsetof(CnnV3Params8ch, beta_hi) == 80,
              "CnnV3Params8ch::beta_hi must be at offset 80");

// Params uniform for the 16-channel FiLM pass (enc1). The WGSL side declares
// four vec4f groups for gamma and four for beta; here each set is flattened
// into one contiguous float[16], which occupies the same 64 bytes.
//
// WGSL layout the struct must match:
//   offset  0: weight_offset (u32)
//   offset 16: _pad (vec3u)
//   offset 32: gamma_0..3 (4x vec4f = 64 bytes)
//   offset 96: beta_0..3  (4x vec4f = 64 bytes)
//   total: 160 bytes
struct CnnV3Params16ch {
  uint32_t weight_offset;  // offset 0
  uint32_t _pad[7];        // offsets 4-31 (implicit padding + WGSL _pad)
  float gamma[16];         // offsets 32-95
  float beta[16];          // offsets 96-159
};
static_assert(sizeof(CnnV3Params16ch) == 160,
              "CnnV3Params16ch must be 160 bytes");
// Swapping gamma/beta would keep the size at 160, so pin the offsets as well.
static_assert(offsetof(CnnV3Params16ch, weight_offset) == 0,
              "CnnV3Params16ch::weight_offset must be at offset 0");
static_assert(offsetof(CnnV3Params16ch, gamma) == 32,
              "CnnV3Params16ch::gamma must be at offset 32");
static_assert(offsetof(CnnV3Params16ch, beta) == 96,
              "CnnV3Params16ch::beta must be at offset 96");

// Params uniform for the 4-channel FiLM pass (dec0): one vec4f of gamma and
// one vec4f of beta.
//
// WGSL layout the struct must match:
//   offset  0: weight_offset (u32)
//   offset 16: _pad (vec3u)
//   offset 32: gamma (vec4f)
//   offset 48: beta  (vec4f)
//   total: 64 bytes
struct CnnV3Params4ch {
  uint32_t weight_offset;  // offset 0
  uint32_t _pad[7];        // offsets 4-31 (implicit padding + WGSL _pad)
  float gamma[4];          // offset 32
  float beta[4];           // offset 48
};
static_assert(sizeof(CnnV3Params4ch) == 64, "CnnV3Params4ch must be 64 bytes");
// Swapping gamma/beta would keep the size at 64, so pin the offsets as well.
static_assert(offsetof(CnnV3Params4ch, weight_offset) == 0,
              "CnnV3Params4ch::weight_offset must be at offset 0");
static_assert(offsetof(CnnV3Params4ch, gamma) == 32,
              "CnnV3Params4ch::gamma must be at offset 32");
static_assert(offsetof(CnnV3Params4ch, beta) == 48,
              "CnnV3Params4ch::beta must be at offset 48");

// Params uniform for the bottleneck pass. This pass has no FiLM, so the
// struct is just weight_offset followed by three pad words (16 bytes total).
struct CnnV3ParamsBn {
  uint32_t weight_offset;  // offset 0
  uint32_t _pad[3];        // offsets 4-15
};
static_assert(sizeof(CnnV3ParamsBn) == 16, "CnnV3ParamsBn must be 16 bytes");
// Offset checks, matching the FiLM params structs.
static_assert(offsetof(CnnV3ParamsBn, weight_offset) == 0,
              "CnnV3ParamsBn::weight_offset must be at offset 0");
static_assert(offsetof(CnnV3ParamsBn, _pad) == 4,
              "CnnV3ParamsBn::_pad must be at offset 4");

// ---------------------------------------------------------------------------
// FiLM conditioning inputs (CPU-side, uploaded via set_film_params each frame)
// ---------------------------------------------------------------------------
struct CNNv3FiLMParams {
  // Position within the current beat, 0-1.
  float beat_phase{0.0f};

  // beat_time / 8.0: normalized position in an 8-beat cycle.
  float beat_norm{0.0f};

  // Peak audio level, 0-1.
  float audio_intensity{0.0f};

  // Two user-defined style parameters.
  float style_p0{0.0f};
  float style_p1{0.0f};
};

// Effect that owns the GPU state for the CNN v3 inference passes: five compute
// pipelines (enc0, enc1, bottleneck, dec1, dec0), one bind group and one
// params uniform buffer per pass, and a single weights buffer shared by all
// of them.
// NOTE(review): the pass ordering, dispatch sizes and node formats are decided
// in the implementation file, which is not visible from this header.
class CNNv3Effect : public Effect {
 public:
  // ctx, inputs, outputs, start_time and end_time are presumably forwarded to
  // the Effect base class — confirm against the constructor definition.
  CNNv3Effect(const GpuContext& ctx, const std::vector<std::string>& inputs,
              const std::vector<std::string>& outputs, float start_time,
              float end_time);

  // Effect override. Presumably registers the intermediate nodes named below
  // with the registry — confirm in the implementation.
  void declare_nodes(NodeRegistry& registry) override;

  // Effect override; records this effect's work into `encoder`.
  void render(WGPUCommandEncoder encoder, const UniformsSequenceParams& params,
              NodeRegistry& nodes) override;

  // Update FiLM conditioning; call before render() each frame.
  void set_film_params(const CNNv3FiLMParams& fp);

  // Upload packed-f16 weights (kWeightsBufBytes bytes; two f16 values packed
  // per u32).
  // NOTE(review): kWeightsBufBytes is not declared in this header, and how
  // size_bytes is validated against it is not visible here — confirm in the
  // implementation.
  void upload_weights(WGPUQueue queue, const void* data, uint32_t size_bytes);

 private:
  // Intermediate node names (prefixed from output[0]).
  // The _lo/_hi pairs correspond to the passes whose output is listed as
  // "2x rgba32uint" in the architecture table at the top of this file.
  std::string node_enc0_;
  std::string node_enc1_lo_;
  std::string node_enc1_hi_;
  std::string node_bn_lo_;
  std::string node_bn_hi_;
  std::string node_dec1_;

  // 5 compute pipelines, one per pass
  ComputePipeline enc0_pipeline_;
  ComputePipeline enc1_pipeline_;
  ComputePipeline bn_pipeline_;
  ComputePipeline dec1_pipeline_;
  ComputePipeline dec0_pipeline_;

  // 5 bind groups (rebuilt each render since node views may change)
  BindGroup enc0_bg_;
  BindGroup enc1_bg_;
  BindGroup bn_bg_;
  BindGroup dec1_bg_;
  BindGroup dec0_bg_;

  // Params uniform buffers (one per pass); each is typed with the layout
  // struct that matches that pass's FiLM channel count.
  UniformBuffer<CnnV3Params8ch>  enc0_params_buf_;
  UniformBuffer<CnnV3Params16ch> enc1_params_buf_;
  UniformBuffer<CnnV3ParamsBn>   bn_params_buf_;
  UniformBuffer<CnnV3Params8ch>  dec1_params_buf_;
  UniformBuffer<CnnV3Params4ch>  dec0_params_buf_;

  // Shared packed-f16 weights (storage buffer, read-only in all shaders)
  GpuBuffer weights_buf_;

  // Per-pass params shadow (updated by set_film_params, uploaded in render).
  // Value-initialized, so every field starts at zero.
  CnnV3Params8ch  enc0_params_{};
  CnnV3Params16ch enc1_params_{};
  CnnV3ParamsBn   bn_params_{};
  CnnV3Params8ch  dec1_params_{};
  CnnV3Params4ch  dec0_params_{};

  // Internal helpers. Judging by the names and the bind-group note above,
  // create_pipelines() builds the five pipelines and update_bind_groups()
  // rebuilds the bind groups from the current node views — confirm in the
  // implementation.
  void create_pipelines();
  void update_bind_groups(NodeRegistry& nodes);
};