summaryrefslogtreecommitdiff
path: root/src/audio
diff options
context:
space:
mode:
Diffstat (limited to 'src/audio')
-rw-r--r--src/audio/fdct.cc13
-rw-r--r--src/audio/fft.cc91
-rw-r--r--src/audio/gen.cc58
-rw-r--r--src/audio/idct.cc13
-rw-r--r--src/audio/synth.cc21
5 files changed, 109 insertions, 87 deletions
diff --git a/src/audio/fdct.cc b/src/audio/fdct.cc
index 78973a3..9eaa2dd 100644
--- a/src/audio/fdct.cc
+++ b/src/audio/fdct.cc
@@ -3,16 +3,9 @@
// Used for analyzing audio files into spectrograms.
#include "dct.h"
-#include <math.h>
+#include "fft.h"
+// Fast O(N log N) implementation using FFT
void fdct_512(const float* input, float* output) {
- const float PI = 3.14159265358979323846f;
- for (int k = 0; k < DCT_SIZE; ++k) {
- float sum = 0.0f;
- for (int n = 0; n < DCT_SIZE; ++n) {
- sum +=
- input[n] * cosf(PI / (float)DCT_SIZE * ((float)n + 0.5f) * (float)k);
- }
- output[k] = sum;
- }
+ dct_fft(input, output, DCT_SIZE);
}
diff --git a/src/audio/fft.cc b/src/audio/fft.cc
index 25477b9..3f8e706 100644
--- a/src/audio/fft.cc
+++ b/src/audio/fft.cc
@@ -97,47 +97,42 @@ void fft_inverse(float* real, float* imag, size_t N) {
}
}
-// DCT-II via FFT using double-and-mirror method
-// This is a more robust algorithm that avoids reordering issues
-// Reference: Numerical Recipes, Press et al.
+// DCT-II via FFT using reordering method
+// Reference: Numerical Recipes Chapter 12.3, Press et al.
void dct_fft(const float* input, float* output, size_t N) {
const float PI = 3.14159265358979323846f;
- // Allocate temporary arrays for 2N-point FFT
- const size_t M = 2 * N;
- float* real = new float[M];
- float* imag = new float[M];
+ // Allocate temporary arrays for N-point FFT
+ float* real = new float[N];
+ float* imag = new float[N];
- // Pack input: [x[0], x[1], ..., x[N-1], x[N-1], x[N-2], ..., x[1]]
- // This creates even symmetry for real-valued DCT
- for (size_t i = 0; i < N; i++) {
- real[i] = input[i];
- }
- for (size_t i = 0; i < N; i++) {
- real[N + i] = input[N - 1 - i];
+ // Reorder input: even indices first, then odd indices reversed
+ // [x[0], x[2], x[4], ...] followed by [x[N-1], x[N-3], x[N-5], ...]
+ for (size_t i = 0; i < N / 2; i++) {
+ real[i] = input[2 * i]; // Even indices: 0, 2, 4, ...
+ real[N - 1 - i] = input[2 * i + 1]; // Odd indices reversed: N-1, N-3, ...
}
- memset(imag, 0, M * sizeof(float));
+ memset(imag, 0, N * sizeof(float));
- // Apply 2N-point FFT
- fft_forward(real, imag, M);
+ // Apply N-point FFT
+ fft_forward(real, imag, N);
- // Extract DCT coefficients
+ // Extract DCT coefficients with phase correction
// DCT[k] = Re{FFT[k] * exp(-j*pi*k/(2*N))} * normalization
- // Note: Need to divide by 2 because we doubled the signal length
for (size_t k = 0; k < N; k++) {
const float angle = -PI * k / (2.0f * N);
const float wr = cosf(angle);
const float wi = sinf(angle);
- // Complex multiplication: (real + j*imag) * (wr + j*wi)
+ // Complex multiplication: (real[k] + j*imag[k]) * (wr + j*wi)
// Real part: real*wr - imag*wi
const float dct_value = real[k] * wr - imag[k] * wi;
- // Apply DCT-II normalization (divide by 2 for double-length FFT)
+ // Apply DCT-II normalization
if (k == 0) {
- output[k] = dct_value * sqrtf(1.0f / N) / 2.0f;
+ output[k] = dct_value * sqrtf(1.0f / N);
} else {
- output[k] = dct_value * sqrtf(2.0f / N) / 2.0f;
+ output[k] = dct_value * sqrtf(2.0f / N);
}
}
@@ -145,48 +140,46 @@ void dct_fft(const float* input, float* output, size_t N) {
delete[] imag;
}
-// IDCT (Inverse DCT-II) via FFT using double-and-mirror method
-// This is the inverse of the DCT-II (used for synthesis)
+// IDCT (DCT-III) via FFT - inverse of the DCT-II reordering method
+// Reference: Numerical Recipes Chapter 12.3, Press et al.
void idct_fft(const float* input, float* output, size_t N) {
const float PI = 3.14159265358979323846f;
- // Allocate temporary arrays for 2N-point FFT
- const size_t M = 2 * N;
- float* real = new float[M];
- float* imag = new float[M];
+ // Allocate temporary arrays for N-point FFT
+ float* real = new float[N];
+ float* imag = new float[N];
- // Prepare FFT input from DCT coefficients
- // IDCT = Re{IFFT[DCT * exp(j*pi*k/(2*N))]} * 2
+ // Prepare FFT input with inverse phase correction
+ // FFT[k] = DCT[k] * exp(+j*pi*k/(2*N)) / normalization
+ // Note: DCT-III (inverse of DCT-II) requires factor of 2 for AC terms
for (size_t k = 0; k < N; k++) {
- const float angle = PI * k / (2.0f * N); // Positive for inverse
+ const float angle = PI * k / (2.0f * N); // Positive angle for inverse
const float wr = cosf(angle);
const float wi = sinf(angle);
- // Apply inverse normalization
- float scaled_input;
+ // Inverse of DCT-II normalization with correct DCT-III scaling
+ float scaled;
if (k == 0) {
- scaled_input = input[k] * sqrtf(N) * 2.0f;
+ scaled = input[k] / sqrtf(1.0f / N);
} else {
- scaled_input = input[k] * sqrtf(N / 2.0f) * 2.0f;
+ // DCT-III needs factor of 2 for AC terms
+ scaled = input[k] / sqrtf(2.0f / N) * 2.0f;
}
- // Complex multiplication: DCT[k] * exp(j*theta)
- real[k] = scaled_input * wr;
- imag[k] = scaled_input * wi;
- }
-
- // Fill second half with conjugate symmetry (for real output)
- for (size_t k = 1; k < N; k++) {
- real[M - k] = real[k];
- imag[M - k] = -imag[k];
+ // Complex multiplication: scaled * (wr + j*wi)
+ real[k] = scaled * wr;
+ imag[k] = scaled * wi;
}
// Apply inverse FFT
- fft_inverse(real, imag, M);
+ fft_inverse(real, imag, N);
- // Extract first N samples (real part only, imag should be ~0)
- for (size_t i = 0; i < N; i++) {
- output[i] = real[i];
+ // Unpack: reverse the reordering from DCT
+ // Even output indices come from first half of FFT output
+ // Odd output indices come from second half (reversed)
+ for (size_t i = 0; i < N / 2; i++) {
+ output[2 * i] = real[i]; // Even positions
+ output[2 * i + 1] = real[N - 1 - i]; // Odd positions (reversed)
}
delete[] real;
diff --git a/src/audio/gen.cc b/src/audio/gen.cc
index 148fc68..0757b4d 100644
--- a/src/audio/gen.cc
+++ b/src/audio/gen.cc
@@ -69,9 +69,63 @@ std::vector<float> generate_note_spectrogram(const NoteParams& params,
float dct_chunk[DCT_SIZE];
fdct_512(pcm_chunk, dct_chunk);
- // Copy to buffer
+ // Scale up to compensate for orthonormal normalization
+ // Old non-orthonormal DCT had no sqrt scaling, so output was ~sqrt(N/2) larger
+ // Scale factor: sqrt(DCT_SIZE / 2) = sqrt(256) = 16
+ //
+ // HOWEVER: After removing synthesis windowing (commit f998bfc), audio is louder.
+ // The old synthesis incorrectly applied Hamming window to spectrum (reducing energy by 0.63x).
+ // New synthesis is correct (no window), but procedural notes with 16x scaling are too loud.
+ //
+ // Analysis applies Hamming window (0.63x energy). With 16x scaling: 0.63 × 16 ≈ 10x.
+ // Divide by 2.5 to match the relative loudness increase: 16 / 2.5 = 6.4
+ const float scale_factor = sqrtf(DCT_SIZE / 2.0f) / 2.5f;
+
+ // Copy to buffer with scaling
for (int i = 0; i < DCT_SIZE; ++i) {
- spec_data[f * DCT_SIZE + i] = dct_chunk[i];
+ spec_data[f * DCT_SIZE + i] = dct_chunk[i] * scale_factor;
+ }
+ }
+
+ // Normalize to consistent RMS level (matching spectool --normalize behavior)
+ // 1. Synthesize PCM to measure actual output levels
+ std::vector<float> pcm_data(num_frames * DCT_SIZE);
+ for (int f = 0; f < num_frames; ++f) {
+ const float* spectral_frame = spec_data.data() + (f * DCT_SIZE);
+ float* time_frame = pcm_data.data() + (f * DCT_SIZE);
+ idct_512(spectral_frame, time_frame);
+ }
+
+ // 2. Calculate RMS and peak
+ float rms_sum = 0.0f;
+ float peak = 0.0f;
+ for (size_t i = 0; i < pcm_data.size(); ++i) {
+ const float abs_val = fabsf(pcm_data[i]);
+ if (abs_val > peak) {
+ peak = abs_val;
+ }
+ rms_sum += pcm_data[i] * pcm_data[i];
+ }
+ const float rms = sqrtf(rms_sum / pcm_data.size());
+
+ // 3. Normalize to target RMS (0.15, matching spectool default)
+ const float target_rms = 0.15f;
+ const float max_safe_peak = 1.0f; // Conservative: ensure output peak ≤ 1.0
+
+ if (rms > 1e-6f) {
+ // Calculate scale factor to reach target RMS
+ float norm_scale = target_rms / rms;
+
+ // Check if this would cause clipping
+ const float predicted_peak = peak * norm_scale;
+ if (predicted_peak > max_safe_peak) {
+ // Reduce scale to prevent clipping
+ norm_scale = max_safe_peak / peak;
+ }
+
+ // Apply normalization scale to spectrogram
+ for (size_t i = 0; i < spec_data.size(); ++i) {
+ spec_data[i] *= norm_scale;
}
}
diff --git a/src/audio/idct.cc b/src/audio/idct.cc
index f8e8769..4566f99 100644
--- a/src/audio/idct.cc
+++ b/src/audio/idct.cc
@@ -3,16 +3,9 @@
// Used for real-time synthesis of audio from spectral data.
#include "dct.h"
-#include <math.h>
+#include "fft.h"
+// Fast O(N log N) implementation using FFT
void idct_512(const float* input, float* output) {
- const float PI = 3.14159265358979323846f;
- for (int n = 0; n < DCT_SIZE; ++n) {
- float sum = input[0] / 2.0f;
- for (int k = 1; k < DCT_SIZE; ++k) {
- sum +=
- input[k] * cosf(PI / (float)DCT_SIZE * (float)k * ((float)n + 0.5f));
- }
- output[n] = sum * (2.0f / (float)DCT_SIZE);
- }
+ idct_fft(input, output, DCT_SIZE);
}
diff --git a/src/audio/synth.cc b/src/audio/synth.cc
index ada46fd..2072bb4 100644
--- a/src/audio/synth.cc
+++ b/src/audio/synth.cc
@@ -41,9 +41,8 @@ static struct {
static Voice g_voices[MAX_VOICES];
static volatile float g_current_output_peak =
- 0.0f; // Global peak for visualization
-static float g_hamming_window[WINDOW_SIZE]; // Static window for optimization
-static float g_tempo_scale = 1.0f; // Playback speed multiplier
+ 0.0f; // Global peak for visualization
+static float g_tempo_scale = 1.0f; // Playback speed multiplier
#if !defined(STRIP_ALL)
static float g_elapsed_time_sec = 0.0f; // Tracks elapsed time for event hooks
@@ -56,8 +55,6 @@ void synth_init() {
#if !defined(STRIP_ALL)
g_elapsed_time_sec = 0.0f;
#endif /* !defined(STRIP_ALL) */
- // Initialize the Hamming window once
- hamming_window_512(g_hamming_window);
}
void synth_shutdown() {
@@ -214,10 +211,6 @@ void synth_trigger_voice(int spectrogram_id, float volume, float pan) {
}
void synth_render(float* output_buffer, int num_frames) {
- // Use the pre-calculated window
- // float window[WINDOW_SIZE];
- // hamming_window_512(window);
-
// Faster decay for more responsive visuals
g_current_output_peak *= 0.90f;
@@ -243,13 +236,9 @@ void synth_render(float* output_buffer, int num_frames) {
const float* spectral_frame = (const float*)v.active_spectral_data +
(v.current_spectral_frame * DCT_SIZE);
- float windowed_frame[DCT_SIZE];
- for (int j = 0; j < DCT_SIZE; ++j) {
- windowed_frame[j] =
- spectral_frame[j] * g_hamming_window[j]; // Use static window
- }
-
- idct_512(windowed_frame, v.time_domain_buffer);
+ // IDCT directly - no windowing needed for synthesis
+ // (Window is only used during analysis, before DCT)
+ idct_512(spectral_frame, v.time_domain_buffer);
v.buffer_pos = 0;
++v.current_spectral_frame;