diff options
Diffstat (limited to 'src/audio')
| -rw-r--r-- | src/audio/fdct.cc | 13 | ||||
| -rw-r--r-- | src/audio/fft.cc | 91 | ||||
| -rw-r--r-- | src/audio/gen.cc | 58 | ||||
| -rw-r--r-- | src/audio/idct.cc | 13 | ||||
| -rw-r--r-- | src/audio/synth.cc | 21 |
5 files changed, 109 insertions, 87 deletions
diff --git a/src/audio/fdct.cc b/src/audio/fdct.cc index 78973a3..9eaa2dd 100644 --- a/src/audio/fdct.cc +++ b/src/audio/fdct.cc @@ -3,16 +3,9 @@ // Used for analyzing audio files into spectrograms. #include "dct.h" -#include <math.h> +#include "fft.h" +// Fast O(N log N) implementation using FFT void fdct_512(const float* input, float* output) { - const float PI = 3.14159265358979323846f; - for (int k = 0; k < DCT_SIZE; ++k) { - float sum = 0.0f; - for (int n = 0; n < DCT_SIZE; ++n) { - sum += - input[n] * cosf(PI / (float)DCT_SIZE * ((float)n + 0.5f) * (float)k); - } - output[k] = sum; - } + dct_fft(input, output, DCT_SIZE); } diff --git a/src/audio/fft.cc b/src/audio/fft.cc index 25477b9..3f8e706 100644 --- a/src/audio/fft.cc +++ b/src/audio/fft.cc @@ -97,47 +97,42 @@ void fft_inverse(float* real, float* imag, size_t N) { } } -// DCT-II via FFT using double-and-mirror method -// This is a more robust algorithm that avoids reordering issues -// Reference: Numerical Recipes, Press et al. +// DCT-II via FFT using reordering method +// Reference: Numerical Recipes Chapter 12.3, Press et al. void dct_fft(const float* input, float* output, size_t N) { const float PI = 3.14159265358979323846f; - // Allocate temporary arrays for 2N-point FFT - const size_t M = 2 * N; - float* real = new float[M]; - float* imag = new float[M]; + // Allocate temporary arrays for N-point FFT + float* real = new float[N]; + float* imag = new float[N]; - // Pack input: [x[0], x[1], ..., x[N-1], x[N-1], x[N-2], ..., x[1]] - // This creates even symmetry for real-valued DCT - for (size_t i = 0; i < N; i++) { - real[i] = input[i]; - } - for (size_t i = 0; i < N; i++) { - real[N + i] = input[N - 1 - i]; + // Reorder input: even indices first, then odd indices reversed + // [x[0], x[2], x[4], ...] followed by [x[N-1], x[N-3], x[N-5], ...] + for (size_t i = 0; i < N / 2; i++) { + real[i] = input[2 * i]; // Even indices: 0, 2, 4, ... + real[N - 1 - i] = input[2 * i + 1]; // Odd indices reversed: N-1, N-3, ... } - memset(imag, 0, M * sizeof(float)); + memset(imag, 0, N * sizeof(float)); - // Apply 2N-point FFT - fft_forward(real, imag, M); + // Apply N-point FFT + fft_forward(real, imag, N); - // Extract DCT coefficients + // Extract DCT coefficients with phase correction // DCT[k] = Re{FFT[k] * exp(-j*pi*k/(2*N))} * normalization - // Note: Need to divide by 2 because we doubled the signal length for (size_t k = 0; k < N; k++) { const float angle = -PI * k / (2.0f * N); const float wr = cosf(angle); const float wi = sinf(angle); - // Complex multiplication: (real + j*imag) * (wr + j*wi) + // Complex multiplication: (real[k] + j*imag[k]) * (wr + j*wi) // Real part: real*wr - imag*wi const float dct_value = real[k] * wr - imag[k] * wi; - // Apply DCT-II normalization (divide by 2 for double-length FFT) + // Apply DCT-II normalization if (k == 0) { - output[k] = dct_value * sqrtf(1.0f / N) / 2.0f; + output[k] = dct_value * sqrtf(1.0f / N); } else { - output[k] = dct_value * sqrtf(2.0f / N) / 2.0f; + output[k] = dct_value * sqrtf(2.0f / N); } } @@ -145,48 +140,46 @@ void dct_fft(const float* input, float* output, size_t N) { delete[] imag; } -// IDCT (Inverse DCT-II) via FFT using double-and-mirror method -// This is the inverse of the DCT-II (used for synthesis) +// IDCT (DCT-III) via FFT - inverse of the DCT-II reordering method +// Reference: Numerical Recipes Chapter 12.3, Press et al. void idct_fft(const float* input, float* output, size_t N) { const float PI = 3.14159265358979323846f; - // Allocate temporary arrays for 2N-point FFT - const size_t M = 2 * N; - float* real = new float[M]; - float* imag = new float[M]; + // Allocate temporary arrays for N-point FFT + float* real = new float[N]; + float* imag = new float[N]; - // Prepare FFT input from DCT coefficients - // IDCT = Re{IFFT[DCT * exp(j*pi*k/(2*N))]} * 2 + // Prepare FFT input with inverse phase correction + // FFT[k] = DCT[k] * exp(+j*pi*k/(2*N)) / normalization + // Note: DCT-III (inverse of DCT-II) requires factor of 2 for AC terms for (size_t k = 0; k < N; k++) { - const float angle = PI * k / (2.0f * N); // Positive for inverse + const float angle = PI * k / (2.0f * N); // Positive angle for inverse const float wr = cosf(angle); const float wi = sinf(angle); - // Apply inverse normalization - float scaled_input; + // Inverse of DCT-II normalization with correct DCT-III scaling + float scaled; if (k == 0) { - scaled_input = input[k] * sqrtf(N) * 2.0f; + scaled = input[k] / sqrtf(1.0f / N); } else { - scaled_input = input[k] * sqrtf(N / 2.0f) * 2.0f; + // DCT-III needs factor of 2 for AC terms + scaled = input[k] / sqrtf(2.0f / N) * 2.0f; } - // Complex multiplication: DCT[k] * exp(j*theta) - real[k] = scaled_input * wr; - imag[k] = scaled_input * wi; - } - - // Fill second half with conjugate symmetry (for real output) - for (size_t k = 1; k < N; k++) { - real[M - k] = real[k]; - imag[M - k] = -imag[k]; + // Complex multiplication: scaled * (wr + j*wi) + real[k] = scaled * wr; + imag[k] = scaled * wi; } // Apply inverse FFT - fft_inverse(real, imag, M); + fft_inverse(real, imag, N); - // Extract first N samples (real part only, imag should be ~0) - for (size_t i = 0; i < N; i++) { - output[i] = real[i]; + // Unpack: reverse the reordering from DCT + // Even output indices come from first half of FFT output + // Odd output indices come from second half (reversed) + for (size_t i = 0; i < N / 2; i++) { + output[2 * i] = real[i]; // Even positions + output[2 * i + 1] = real[N - 1 - i]; // Odd positions (reversed) } delete[] real; diff --git a/src/audio/gen.cc b/src/audio/gen.cc index 148fc68..0757b4d 100644 --- a/src/audio/gen.cc +++ b/src/audio/gen.cc @@ -69,9 +69,63 @@ std::vector<float> generate_note_spectrogram(const NoteParams& params, float dct_chunk[DCT_SIZE]; fdct_512(pcm_chunk, dct_chunk); - // Copy to buffer + // Scale up to compensate for orthonormal normalization + // Old non-orthonormal DCT had no sqrt scaling, so output was ~sqrt(N/2) larger + // Scale factor: sqrt(DCT_SIZE / 2) = sqrt(256) = 16 + // + // HOWEVER: After removing synthesis windowing (commit f998bfc), audio is louder. + // The old synthesis incorrectly applied Hamming window to spectrum (reducing energy by 0.63x). + // New synthesis is correct (no window), but procedural notes with 16x scaling are too loud. + // + // Analysis applies Hamming window (0.63x energy). With 16x scaling: 0.63 × 16 ≈ 10x. + // Divide by 2.5 to match the relative loudness increase: 16 / 2.5 = 6.4 + const float scale_factor = sqrtf(DCT_SIZE / 2.0f) / 2.5f; + + // Copy to buffer with scaling for (int i = 0; i < DCT_SIZE; ++i) { - spec_data[f * DCT_SIZE + i] = dct_chunk[i]; + spec_data[f * DCT_SIZE + i] = dct_chunk[i] * scale_factor; + } + } + + // Normalize to consistent RMS level (matching spectool --normalize behavior) + // 1. Synthesize PCM to measure actual output levels + std::vector<float> pcm_data(num_frames * DCT_SIZE); + for (int f = 0; f < num_frames; ++f) { + const float* spectral_frame = spec_data.data() + (f * DCT_SIZE); + float* time_frame = pcm_data.data() + (f * DCT_SIZE); + idct_512(spectral_frame, time_frame); + } + + // 2. Calculate RMS and peak + float rms_sum = 0.0f; + float peak = 0.0f; + for (size_t i = 0; i < pcm_data.size(); ++i) { + const float abs_val = fabsf(pcm_data[i]); + if (abs_val > peak) { + peak = abs_val; + } + rms_sum += pcm_data[i] * pcm_data[i]; + } + const float rms = sqrtf(rms_sum / pcm_data.size()); + + // 3. Normalize to target RMS (0.15, matching spectool default) + const float target_rms = 0.15f; + const float max_safe_peak = 1.0f; // Conservative: ensure output peak ≤ 1.0 + + if (rms > 1e-6f) { + // Calculate scale factor to reach target RMS + float norm_scale = target_rms / rms; + + // Check if this would cause clipping + const float predicted_peak = peak * norm_scale; + if (predicted_peak > max_safe_peak) { + // Reduce scale to prevent clipping + norm_scale = max_safe_peak / peak; + } + + // Apply normalization scale to spectrogram + for (size_t i = 0; i < spec_data.size(); ++i) { + spec_data[i] *= norm_scale; } } diff --git a/src/audio/idct.cc b/src/audio/idct.cc index f8e8769..4566f99 100644 --- a/src/audio/idct.cc +++ b/src/audio/idct.cc @@ -3,16 +3,9 @@ // Used for real-time synthesis of audio from spectral data. #include "dct.h" -#include <math.h> +#include "fft.h" +// Fast O(N log N) implementation using FFT void idct_512(const float* input, float* output) { - const float PI = 3.14159265358979323846f; - for (int n = 0; n < DCT_SIZE; ++n) { - float sum = input[0] / 2.0f; - for (int k = 1; k < DCT_SIZE; ++k) { - sum += - input[k] * cosf(PI / (float)DCT_SIZE * (float)k * ((float)n + 0.5f)); - } - output[n] = sum * (2.0f / (float)DCT_SIZE); - } + idct_fft(input, output, DCT_SIZE); } diff --git a/src/audio/synth.cc b/src/audio/synth.cc index ada46fd..2072bb4 100644 --- a/src/audio/synth.cc +++ b/src/audio/synth.cc @@ -41,9 +41,8 @@ static struct { static Voice g_voices[MAX_VOICES]; static volatile float g_current_output_peak = - 0.0f; // Global peak for visualization -static float g_hamming_window[WINDOW_SIZE]; // Static window for optimization -static float g_tempo_scale = 1.0f; // Playback speed multiplier + 0.0f; // Global peak for visualization +static float g_tempo_scale = 1.0f; // Playback speed multiplier #if !defined(STRIP_ALL) static float g_elapsed_time_sec = 0.0f; // Tracks elapsed time for event hooks @@ -56,8 +55,6 @@ void synth_init() { #if !defined(STRIP_ALL) g_elapsed_time_sec = 0.0f; #endif /* !defined(STRIP_ALL) */ - // Initialize the Hamming window once - hamming_window_512(g_hamming_window); } void synth_shutdown() { @@ -214,10 +211,6 @@ void synth_trigger_voice(int spectrogram_id, float volume, float pan) { } void synth_render(float* output_buffer, int num_frames) { - // Use the pre-calculated window - // float window[WINDOW_SIZE]; - // hamming_window_512(window); - // Faster decay for more responsive visuals g_current_output_peak *= 0.90f; @@ -243,13 +236,9 @@ void synth_render(float* output_buffer, int num_frames) { const float* spectral_frame = (const float*)v.active_spectral_data + (v.current_spectral_frame * DCT_SIZE); - float windowed_frame[DCT_SIZE]; - for (int j = 0; j < DCT_SIZE; ++j) { - windowed_frame[j] = - spectral_frame[j] * g_hamming_window[j]; // Use static window - } - - idct_512(windowed_frame, v.time_domain_buffer); + // IDCT directly - no windowing needed for synthesis + // (Window is only used during analysis, before DCT) + idct_512(spectral_frame, v.time_domain_buffer); v.buffer_pos = 0; ++v.current_spectral_frame; |
