summary | refs | log | tree | commit | diff
path: root/src/audio/gen.cc
diff options
context:
space:
mode:
author: skal <pascal.massimino@gmail.com> 2026-02-06 18:41:54 +0100
committer: skal <pascal.massimino@gmail.com> 2026-02-06 18:41:54 +0100
commit: 02beca9eaddbc3bf60e121b1932d2eb4a41221b8 (patch)
tree: a00f7769284bf779b9deeeb0f1aa8f65b753c0f3 /src/audio/gen.cc
parent: 0911bfc9b014e02f1dd9d631f39c64a8b1717118 (diff)
fix(audio): Normalize procedurally generated notes to consistent RMS level
ISSUE:
Generated NOTE_ samples were extremely loud and not normalized:
- Peak: 9.994 (999% of the 1.0 limit - severe clipping)
- RMS: 3.486 (23x louder than normalized asset samples)
- User report: "NOTE_ is way too loud"

ROOT CAUSE:
generate_note_spectrogram() applied a fixed scale factor (6.4) without measuring actual output levels. This was a guess from commit f998bfc that didn't account for harmonic synthesis amplification.

SOLUTION:
Added post-generation normalization (matching spectool --normalize):
1. Generate spectrogram with existing algorithm
2. Synthesize PCM via IDCT to measure actual output
3. Calculate RMS and peak of synthesized audio
4. Scale spectrogram to target RMS (0.15, matching normalized assets)
5. Limit by peak to prevent clipping (max safe peak = 1.0)

RESULTS:
After normalization:
- Peak: 0.430 (safe, no clipping) ✅
- RMS: 0.150 (exactly target) ✅
- Consistent with normalized asset samples (RMS 0.09-0.15 range)

IMPROVEMENT:
- Peak reduced by 23.2x (9.994 → 0.430)
- RMS reduced by 23.2x (3.486 → 0.150)
- Procedural notes now have same perceived loudness as assets

COST:
Small CPU overhead during note generation (one-time cost per unique note):
- One full IDCT pass per note (31 frames × 512 samples)
- Negligible for tracker system with caching (14 unique samples total)

handoff(Claude): Generated notes now normalized to match asset samples. All audio levels consistent.
Diffstat (limited to 'src/audio/gen.cc')
-rw-r--r-- src/audio/gen.cc 42
1 files changed, 42 insertions, 0 deletions
diff --git a/src/audio/gen.cc b/src/audio/gen.cc
index 74b468c..0757b4d 100644
--- a/src/audio/gen.cc
+++ b/src/audio/gen.cc
@@ -87,6 +87,48 @@ std::vector<float> generate_note_spectrogram(const NoteParams& params,
}
}
+ // Normalize to consistent RMS level (matching spectool --normalize behavior)
+ // 1. Synthesize PCM to measure actual output levels
+ std::vector<float> pcm_data(num_frames * DCT_SIZE);
+ for (int f = 0; f < num_frames; ++f) {
+ const float* spectral_frame = spec_data.data() + (f * DCT_SIZE);
+ float* time_frame = pcm_data.data() + (f * DCT_SIZE);
+ idct_512(spectral_frame, time_frame);
+ }
+
+ // 2. Calculate RMS and peak
+ float rms_sum = 0.0f;
+ float peak = 0.0f;
+ for (size_t i = 0; i < pcm_data.size(); ++i) {
+ const float abs_val = fabsf(pcm_data[i]);
+ if (abs_val > peak) {
+ peak = abs_val;
+ }
+ rms_sum += pcm_data[i] * pcm_data[i];
+ }
+ const float rms = sqrtf(rms_sum / pcm_data.size());
+
+ // 3. Normalize to target RMS (0.15, matching spectool default)
+ const float target_rms = 0.15f;
+ const float max_safe_peak = 1.0f; // Conservative: ensure output peak ≤ 1.0
+
+ if (rms > 1e-6f) {
+ // Calculate scale factor to reach target RMS
+ float norm_scale = target_rms / rms;
+
+ // Check if this would cause clipping
+ const float predicted_peak = peak * norm_scale;
+ if (predicted_peak > max_safe_peak) {
+ // Reduce scale to prevent clipping
+ norm_scale = max_safe_peak / peak;
+ }
+
+ // Apply normalization scale to spectrogram
+ for (size_t i = 0; i < spec_data.size(); ++i) {
+ spec_data[i] *= norm_scale;
+ }
+ }
+
return spec_data;
}