summary | refs | log | tree | commit | diff
path: root/cnn_v3/src
diff options
context:
space:
mode:
author: skal <pascal.massimino@gmail.com> 2026-03-27 07:59:00 +0100
committer: skal <pascal.massimino@gmail.com> 2026-03-27 07:59:00 +0100
commit: fb13e67acbc7d7dd2974a456fcb134966c47cee0 (patch)
tree: 8dd1c6df371b0ee046792680a14c8bcb3c36510b /cnn_v3/src
parent: 8c5e41724fdfc3be24e95f48ae4b2be616404074 (diff)
fix(cnn_v3): remove dec0 ReLU, load FiLM MLP at runtime
Two bugs blocking training convergence:

1. dec0 ReLU before sigmoid constrained output to [0.5, 1.0] — the network could never produce dark pixels. Removed F.relu in train_cnn_v3.py and max(0, …) in cnn_v3_dec0.wgsl. Test vectors regenerated.

2. set_film_params() used hardcoded heuristics instead of the trained MLP. Added CNNv3FilmMlp struct + load_film_mlp() to cnn_v3_effect.h/.cc. The MLP is auto-loaded from ASSET_WEIGHTS_CNN_V3_FILM_MLP at construction; Linear(5→16)→ReLU→Linear(16→72) runs CPU-side each frame.

36/36 tests pass. Parity max_err=4.88e-4 unchanged.

handoff(Gemini): retrain from scratch — needs ≥50 samples (currently 11). See cnn_v3/docs/HOWTO.md §2-3.
Diffstat (limited to 'cnn_v3/src')
-rw-r--r-- cnn_v3/src/cnn_v3_effect.cc 76
-rw-r--r-- cnn_v3/src/cnn_v3_effect.h 22
2 files changed, 81 insertions, 17 deletions
diff --git a/cnn_v3/src/cnn_v3_effect.cc b/cnn_v3/src/cnn_v3_effect.cc
index dc26751..e576ceb 100644
--- a/cnn_v3/src/cnn_v3_effect.cc
+++ b/cnn_v3/src/cnn_v3_effect.cc
@@ -187,6 +187,13 @@ CNNv3Effect::CNNv3Effect(const GpuContext& ctx,
if (weights_data && weights_size == kWeightsBufBytes) {
upload_weights(ctx_.queue, weights_data, (uint32_t)weights_size);
}
+
+ size_t mlp_size = 0;
+ const void* mlp_data =
+ GetAsset(AssetId::ASSET_WEIGHTS_CNN_V3_FILM_MLP, &mlp_size);
+ if (mlp_data) {
+ load_film_mlp(mlp_data, (uint32_t)mlp_size);
+ }
}
// ---------------------------------------------------------------------------
@@ -219,28 +226,67 @@ void CNNv3Effect::upload_weights(WGPUQueue queue, const void* data,
wgpuQueueWriteBuffer(queue, weights_buf_.buffer, 0, data, size_bytes);
}
+void CNNv3Effect::load_film_mlp(const void* data, uint32_t size_bytes) {  // Copies a raw FiLM-MLP weight blob into mlp_ and marks it loaded.
+ if (size_bytes != sizeof(CNNv3FilmMlp)) return;  // wrong-sized blob: silently ignored, mlp_ and mlp_loaded_ keep their previous values
+ memcpy(&mlp_, data, sizeof(CNNv3FilmMlp));  // raw byte copy; NOTE(review): data is not null-checked here — the constructor call site checks it, other callers must too
+ mlp_loaded_ = true;  // set_film_params() tests this flag before evaluating the MLP
+}
+
void CNNv3Effect::set_film_params(const CNNv3FiLMParams& fp) {  // Evaluates the loaded FiLM MLP on fp and copies its 72 outputs into the enc0/enc1/dec1/dec0 FiLM params.
- const float a = fp.audio_intensity;
- const float b = fp.beat_phase;
+ if (!mlp_loaded_) {
+ // No MLP loaded: return without writing any FiLM params. NOTE(review): the intended identity FiLM (γ=1, β=0) only holds if the *_params_ members already contain it — confirm their initial values.
+ return;
+ }
+
+ // cond[5] = {beat_phase, beat_norm, audio_intensity, style_p0, style_p1}
+ const float cond[5] = {fp.beat_phase, fp.beat_norm, fp.audio_intensity,
+ fp.style_p0, fp.style_p1};
+ // Layer 0: Linear(5→16) + ReLU. l0_w is read as 16 rows of 5 weights (row j feeds hidden unit j).
+ float h[16];
+ for (int j = 0; j < 16; ++j) {
+ float s = mlp_.l0_b[j];  // start the accumulator from the bias
+ for (int i = 0; i < 5; ++i) s += mlp_.l0_w[j * 5 + i] * cond[i];
+ h[j] = s > 0.f ? s : 0.f;  // ReLU
+ }
+
+ // Layer 1: Linear(16→72), no activation. l1_w is read as 72 rows of 16 weights.
+ // Output split: g_enc0(8)|b_enc0(8)|g_enc1(16)|b_enc1(16)|g_dec1(8)|b_dec1(8)|g_dec0(4)|b_dec0(4)
+ float film[72];
+ for (int j = 0; j < 72; ++j) {
+ float s = mlp_.l1_b[j];  // start the accumulator from the bias
+ for (int i = 0; i < 16; ++i) s += mlp_.l1_w[j * 16 + i] * h[i];
+ film[j] = s;
+ }
+
+ const float* p = film;  // read cursor; advanced past each slice after it is copied, in the split order above
for (int i = 0; i < 4; ++i) {
- enc0_params_.gamma_lo[i] = 1.0f + a * 0.5f;
- enc0_params_.gamma_hi[i] = 1.0f + a * 0.5f;
- enc0_params_.beta_lo[i] = b * 0.1f;
- enc0_params_.beta_hi[i] = b * 0.1f;
+ enc0_params_.gamma_lo[i] = p[i];  // elements 0..3 of the 8-wide g_enc0 slice
+ enc0_params_.gamma_hi[i] = p[i + 4];  // elements 4..7 of the same slice
}
- for (int i = 0; i < 16; ++i) {
- enc1_params_.gamma[i] = 1.0f + a * 0.3f;
- enc1_params_.beta[i] = fp.beat_norm * 0.1f;
+ p += 8;
+ for (int i = 0; i < 4; ++i) {
+ enc0_params_.beta_lo[i] = p[i];  // elements 0..3 of the 8-wide b_enc0 slice
+ enc0_params_.beta_hi[i] = p[i + 4];  // elements 4..7 of the same slice
+ }
+ p += 8;
+ for (int i = 0; i < 16; ++i) enc1_params_.gamma[i] = p[i];  // g_enc1 slice (16 values)
+ p += 16;
+ for (int i = 0; i < 16; ++i) enc1_params_.beta[i] = p[i];  // b_enc1 slice (16 values)
+ p += 16;
+ for (int i = 0; i < 4; ++i) {
+ dec1_params_.gamma_lo[i] = p[i];  // elements 0..3 of the 8-wide g_dec1 slice
+ dec1_params_.gamma_hi[i] = p[i + 4];  // elements 4..7 of the same slice
}
+ p += 8;
for (int i = 0; i < 4; ++i) {
- dec1_params_.gamma_lo[i] = 1.0f + fp.style_p0 * 0.5f;
- dec1_params_.gamma_hi[i] = 1.0f + fp.style_p0 * 0.5f;
- dec1_params_.beta_lo[i] = fp.style_p1 * 0.1f;
- dec1_params_.beta_hi[i] = fp.style_p1 * 0.1f;
- dec0_params_.gamma[i] = 1.0f + fp.style_p0 * 0.5f;
- dec0_params_.beta[i] = fp.style_p1 * 0.1f;
+ dec1_params_.beta_lo[i] = p[i];  // elements 0..3 of the 8-wide b_dec1 slice
+ dec1_params_.beta_hi[i] = p[i + 4];  // elements 4..7 of the same slice
}
+ p += 8;
+ for (int i = 0; i < 4; ++i) dec0_params_.gamma[i] = p[i];  // g_dec0 slice (4 values)
+ p += 4;
+ for (int i = 0; i < 4; ++i) dec0_params_.beta[i] = p[i];  // b_dec0 slice (final 4 of the 72 outputs)
}
// ---------------------------------------------------------------------------
diff --git a/cnn_v3/src/cnn_v3_effect.h b/cnn_v3/src/cnn_v3_effect.h
index 070f988..589680c 100644
--- a/cnn_v3/src/cnn_v3_effect.h
+++ b/cnn_v3/src/cnn_v3_effect.h
@@ -7,7 +7,7 @@
// enc1: Conv(8→16, 3×3) + FiLM16 + ReLU H/2×W/2 2× rgba32uint
// bottleneck: Conv(16→16, 3×3, dil=2) + ReLU H/4×W/4 2× rgba32uint
// dec1: Conv(32→8, 3×3) + FiLM8 + ReLU H/2×W/2 rgba32uint
-// dec0: Conv(16→4, 3×3) + FiLM4 + ReLU + sig H×W rgba16float
+// dec0: Conv(16→4, 3×3) + FiLM4 + sig H×W rgba16float
//
// Inputs: feat_tex0, feat_tex1 (rgba32uint, 20-channel G-buffer)
// Output: output_tex (rgba16float, 4-channel RGBA)
@@ -97,6 +97,17 @@ struct CNNv3FiLMParams {
float style_p1 = 0.0f; // user-defined style param
};
+// FiLM MLP weights: Linear(5→16)→ReLU→Linear(16→72).
+// Loaded from cnn_v3_film_mlp.bin (1320 f32 = 5280 bytes).
+// Layout: l0_w(80) | l0_b(16) | l1_w(1152) | l1_b(72), all row-major f32.
+struct CNNv3FilmMlp {  // Plain float arrays in file order, so a weight blob of exactly sizeof(CNNv3FilmMlp) bytes can be copied in directly.
+ float l0_w[16 * 5]; // (16, 5) row-major: weight for hidden unit j, input i is l0_w[j * 5 + i]
+ float l0_b[16];  // layer-0 bias, one per hidden unit
+ float l1_w[72 * 16]; // (72, 16) row-major: weight for output j, hidden unit i is l1_w[j * 16 + i]
+ float l1_b[72];  // layer-1 bias, one per FiLM output
+};
+static_assert(sizeof(CNNv3FilmMlp) == 1320 * 4, "CNNv3FilmMlp size mismatch");  // 80 + 16 + 1152 + 72 = 1320 floats; fails the build if padding or a field change alters the size
+
class CNNv3Effect : public Effect {
public:
CNNv3Effect(const GpuContext& ctx, const std::vector<std::string>& inputs,
@@ -111,9 +122,13 @@ class CNNv3Effect : public Effect {
// Update FiLM conditioning; call before render() each frame.
void set_film_params(const CNNv3FiLMParams& fp);
- // Upload packed-f16 weights (kWeightsBufBytes bytes of u32 pairs).
+ // Upload packed-f16 conv weights (kWeightsBufBytes bytes of u32 pairs).
void upload_weights(WGPUQueue queue, const void* data, uint32_t size_bytes);
+ // Load FiLM MLP weights from cnn_v3_film_mlp.bin (1320 f32 = 5280 bytes).
+ // Must be called before set_film_params() for learned conditioning.
+ void load_film_mlp(const void* data, uint32_t size_bytes);
+
private:
// Intermediate node names (prefixed from output[0])
std::string node_enc0_;
@@ -156,4 +171,7 @@ class CNNv3Effect : public Effect {
void create_pipelines();
void update_bind_groups(NodeRegistry& nodes);
+
+ CNNv3FilmMlp mlp_{};
+ bool mlp_loaded_ = false;
};