From bb8197075161f9c9ded51beab913150b43954e2c Mon Sep 17 00:00:00 2001 From: skal Date: Mon, 2 Mar 2026 09:38:46 +0100 Subject: feat(audio): OLA-IDCT synthesis with Hann window to eliminate clicks Add v2 spectrogram format (SPEC_VERSION_V2_OLA) using overlap-add IDCT with 50% overlap (hop=256, OLA_OVERLAP=256) and Hann windowing. - dct.h: OLA_HOP_SIZE=256, OLA_OVERLAP=256 - synth.h: SPEC_VERSION_V1/V2_OLA constants; version field on Spectrogram - window.h/cc: hann_window_512() alongside existing hamming_window_512() - synth.cc: g_hann[] precomputed at init; OLA path in synth_render when ola_mode=true (IDCT -> Hann -> add overlap tail -> save new tail -> output OLA_HOP_SIZE samples); v1 path unchanged for backward compat - tracker.cc: MP3-to-spectrogram converter (convert_mp3_to_spectrogram) now uses sliding 512-sample Hann window with OLA_HOP_SIZE advance per frame; sets version=SPEC_VERSION_V2_OLA; .spec files propagate header->version; generated notes stay v1 Existing .spec files must be regenerated to benefit from click-free OLA. handoff(Claude): OLA done. .spec files need regen via MP3 tool to activate v2. 
--- src/audio/dct.h | 4 +++- src/audio/synth.cc | 35 ++++++++++++++++++++++++++++++----- src/audio/synth.h | 4 ++++ src/audio/tracker.cc | 32 +++++++++++++++++++++++--------- src/audio/window.cc | 8 ++++++++ src/audio/window.h | 1 + 6 files changed, 69 insertions(+), 15 deletions(-) (limited to 'src/audio') diff --git a/src/audio/dct.h b/src/audio/dct.h index ee3e9b3..ec9f651 100644 --- a/src/audio/dct.h +++ b/src/audio/dct.h @@ -4,7 +4,9 @@ #pragma once -#define DCT_SIZE 512 +#define DCT_SIZE 512 +#define OLA_HOP_SIZE 256 +#define OLA_OVERLAP 256 // Forward declarations void fdct_512(const float* input, float* output); diff --git a/src/audio/synth.cc b/src/audio/synth.cc index 5fadf3c..a723404 100644 --- a/src/audio/synth.cc +++ b/src/audio/synth.cc @@ -27,6 +27,8 @@ struct Voice { int total_spectral_frames; float time_domain_buffer[DCT_SIZE]; + float overlap_buf[OLA_OVERLAP]; // OLA tail from previous frame (v2 only) + bool ola_mode; // True for SPEC_VERSION_V2_OLA int buffer_pos; float fractional_pos; // Fractional sample position for tempo scaling @@ -45,6 +47,7 @@ static Voice g_voices[MAX_VOICES]; static volatile float g_current_output_peak = 0.0f; // Global peak for visualization static float g_tempo_scale = 1.0f; // Playback speed multiplier +static float g_hann[DCT_SIZE]; // Hann window for OLA synthesis (v2) #if !defined(STRIP_ALL) static float g_elapsed_time_sec = 0.0f; // Tracks elapsed time for event hooks @@ -54,6 +57,7 @@ void synth_init() { memset(&g_synth_data, 0, sizeof(g_synth_data)); memset(g_voices, 0, sizeof(g_voices)); g_current_output_peak = 0.0f; + hann_window_512(g_hann); #if !defined(STRIP_ALL) g_elapsed_time_sec = 0.0f; #endif /* !defined(STRIP_ALL) */ @@ -202,7 +206,11 @@ void synth_trigger_voice(int spectrogram_id, float volume, float pan, v.current_spectral_frame = 0; v.total_spectral_frames = g_synth_data.spectrograms[spectrogram_id].num_frames; - v.buffer_pos = DCT_SIZE; // Force IDCT on first render + v.ola_mode = 
(g_synth_data.spectrograms[spectrogram_id].version == + SPEC_VERSION_V2_OLA); + v.buffer_pos = v.ola_mode ? OLA_HOP_SIZE : DCT_SIZE; // Force reload on first render + if (v.ola_mode) + memset(v.overlap_buf, 0, sizeof(v.overlap_buf)); v.fractional_pos = 0.0f; // Initialize fractional position for tempo scaling v.start_sample_offset = @@ -243,7 +251,8 @@ void synth_render(float* output_buffer, int num_frames) { continue; // Don't produce audio until offset elapsed } - if (v.buffer_pos >= DCT_SIZE) { + const int frame_threshold = v.ola_mode ? OLA_HOP_SIZE : DCT_SIZE; + if (v.buffer_pos >= frame_threshold) { if (v.current_spectral_frame >= v.total_spectral_frames) { v.active = false; continue; @@ -256,9 +265,25 @@ void synth_render(float* output_buffer, int num_frames) { const float* spectral_frame = (const float*)v.active_spectral_data + (v.current_spectral_frame * DCT_SIZE); - // IDCT directly - no windowing needed for synthesis - // (Window is only used during analysis, before DCT) - idct_512(spectral_frame, v.time_domain_buffer); + if (v.ola_mode) { + // OLA-IDCT synthesis (v2): Hann window + overlap-add + float tmp[DCT_SIZE]; + idct_512(spectral_frame, tmp); + for (int j = 0; j < DCT_SIZE; ++j) + tmp[j] *= g_hann[j]; + // Add saved overlap from previous frame + for (int j = 0; j < OLA_OVERLAP; ++j) + tmp[j] += v.overlap_buf[j]; + // Save new tail as overlap for next frame + for (int j = 0; j < OLA_OVERLAP; ++j) + v.overlap_buf[j] = tmp[OLA_HOP_SIZE + j]; + // Output buffer holds first OLA_HOP_SIZE samples + for (int j = 0; j < OLA_HOP_SIZE; ++j) + v.time_domain_buffer[j] = tmp[j]; + } else { + // V1: IDCT directly, no windowing + idct_512(spectral_frame, v.time_domain_buffer); + } v.buffer_pos = 0; ++v.current_spectral_frame; diff --git a/src/audio/synth.h b/src/audio/synth.h index 3a42a61..61ecfd0 100644 --- a/src/audio/synth.h +++ b/src/audio/synth.h @@ -21,10 +21,14 @@ #define MAX_SPECTROGRAMS \ 32 // Current track: 14 unique, 32 provides comfortable headroom 
+#define SPEC_VERSION_V1 1 +#define SPEC_VERSION_V2_OLA 2 + struct Spectrogram { const float* spectral_data_a; // Front buffer const float* spectral_data_b; // Back buffer (for double-buffering) int num_frames; + int version; // SPEC_VERSION_V1 or SPEC_VERSION_V2_OLA }; void synth_init(); diff --git a/src/audio/tracker.cc b/src/audio/tracker.cc index 59801a8..333a337 100644 --- a/src/audio/tracker.cc +++ b/src/audio/tracker.cc @@ -56,6 +56,7 @@ static bool is_mp3_asset(const uint8_t* data, size_t size) { #if !defined(STRIP_ALL) // Decode an in-memory MP3 blob to a heap-allocated spectrogram (caller owns). +// Uses OLA analysis: 512-sample Hann window, OLA_HOP_SIZE advance per frame. // Returns nullptr on error. Sets *out_num_frames to frame count. static float* convert_mp3_to_spectrogram(const uint8_t* data, size_t size, int* out_num_frames) { @@ -64,23 +65,33 @@ static float* convert_mp3_to_spectrogram(const uint8_t* data, size_t size, if (!dec) return nullptr; float window[DCT_SIZE]; - hamming_window_512(window); + hann_window_512(window); std::vector spec_data; + float pcm_buf[DCT_SIZE]; float pcm_chunk[DCT_SIZE]; + float dct_chunk[DCT_SIZE]; + + // Sliding-window OLA analysis: advance OLA_HOP_SIZE samples per frame. + // First iteration: pcm_buf is zero-initialized (silence before signal start). + memset(pcm_buf, 0, sizeof(pcm_buf)); for (;;) { - const int decoded = mp3_decode(dec, DCT_SIZE, pcm_chunk); - if (decoded == 0) break; - if (decoded < DCT_SIZE) { - memset(pcm_chunk + decoded, 0, (DCT_SIZE - decoded) * sizeof(float)); - } - for (int i = 0; i < DCT_SIZE; ++i) { - pcm_chunk[i] *= window[i]; + // Slide left by OLA_HOP_SIZE; fill right half with new samples. 
+ memmove(pcm_buf, pcm_buf + OLA_HOP_SIZE, OLA_HOP_SIZE * sizeof(float)); + const int decoded = mp3_decode(dec, OLA_HOP_SIZE, pcm_buf + OLA_HOP_SIZE); + if (decoded < OLA_HOP_SIZE) { + memset(pcm_buf + OLA_HOP_SIZE + decoded, 0, + (OLA_HOP_SIZE - decoded) * sizeof(float)); } - float dct_chunk[DCT_SIZE]; + + // Window + DCT the current 512-sample frame. + for (int i = 0; i < DCT_SIZE; ++i) + pcm_chunk[i] = pcm_buf[i] * window[i]; fdct_512(pcm_chunk, dct_chunk); spec_data.insert(spec_data.end(), dct_chunk, dct_chunk + DCT_SIZE); + + if (decoded == 0) break; } mp3_close(dec); @@ -148,6 +159,7 @@ void tracker_init() { spec.spectral_data_a = spec_data; spec.spectral_data_b = spec_data; spec.num_frames = num_frames; + spec.version = SPEC_VERSION_V2_OLA; g_sample_synth_cache[sid] = synth_register_spectrogram(&spec); g_spec_pool[slot].synth_id = g_sample_synth_cache[sid]; } @@ -166,6 +178,7 @@ void tracker_init() { spec.spectral_data_a = spectral_data; spec.spectral_data_b = spectral_data; spec.num_frames = note_frames; + spec.version = header->version; g_sample_synth_cache[sid] = synth_register_spectrogram(&spec); @@ -195,6 +208,7 @@ void tracker_init() { spec.spectral_data_a = g_spec_pool[slot].data; spec.spectral_data_b = g_spec_pool[slot].data; spec.num_frames = note_frames; + spec.version = SPEC_VERSION_V1; g_sample_synth_cache[sid] = synth_register_spectrogram(&spec); g_spec_pool[slot].synth_id = g_sample_synth_cache[sid]; diff --git a/src/audio/window.cc b/src/audio/window.cc index b68c747..bcdd768 100644 --- a/src/audio/window.cc +++ b/src/audio/window.cc @@ -12,3 +12,11 @@ void hamming_window_512(float* window) { 0.54f - 0.46f * cosf(2.0f * PI * (float)i / (float)(WINDOW_SIZE - 1)); } } + +void hann_window_512(float* window) { + const float PI = 3.14159265358979323846f; + for (int i = 0; i < WINDOW_SIZE; ++i) { + window[i] = + 0.5f - 0.5f * cosf(2.0f * PI * (float)i / (float)(WINDOW_SIZE - 1)); + } +} diff --git a/src/audio/window.h b/src/audio/window.h index 
c3b583a..80253da 100644 --- a/src/audio/window.h +++ b/src/audio/window.h @@ -7,3 +7,4 @@ #define WINDOW_SIZE 512 void hamming_window_512(float* window); +void hann_window_512(float* window); -- cgit v1.2.3