5 files changed, 109 insertions, 87 deletions
diff --git a/src/audio/fdct.cc b/src/audio/fdct.cc
index 78973a3..9eaa2dd 100644
--- a/src/audio/fdct.cc
+++ b/src/audio/fdct.cc
@@ -3,16 +3,9 @@
 // Used for analyzing audio files into spectrograms.
 
 #include "dct.h"
-#include <math.h>
+#include "fft.h"
 
+// Fast O(N log N) implementation using FFT
 void fdct_512(const float* input, float* output) {
-  const float PI = 3.14159265358979323846f;
-  for (int k = 0; k < DCT_SIZE; ++k) {
-    float sum = 0.0f;
-    for (int n = 0; n < DCT_SIZE; ++n) {
-      sum +=
-          input[n] * cosf(PI / (float)DCT_SIZE * ((float)n + 0.5f) * (float)k);
-    }
-    output[k] = sum;
-  }
+  dct_fft(input, output, DCT_SIZE);
 }
diff --git a/src/audio/fft.cc b/src/audio/fft.cc
index 25477b9..3f8e706 100644
--- a/src/audio/fft.cc
+++ b/src/audio/fft.cc
@@ -97,47 +97,42 @@ void fft_inverse(float* real, float* imag, size_t N) {
   }
 }
 
-// DCT-II via FFT using double-and-mirror method
-// This is a more robust algorithm that avoids reordering issues
-// Reference: Numerical Recipes, Press et al.
+// DCT-II via FFT using reordering method
+// Reference: Numerical Recipes Chapter 12.3, Press et al.
 void dct_fft(const float* input, float* output, size_t N) {
   const float PI = 3.14159265358979323846f;
 
-  // Allocate temporary arrays for 2N-point FFT
-  const size_t M = 2 * N;
-  float* real = new float[M];
-  float* imag = new float[M];
+  // Allocate temporary arrays for N-point FFT
+  float* real = new float[N];
+  float* imag = new float[N];
 
-  // Pack input: [x[0], x[1], ..., x[N-1], x[N-1], x[N-2], ..., x[1]]
-  // This creates even symmetry for real-valued DCT
-  for (size_t i = 0; i < N; i++) {
-    real[i] = input[i];
-  }
-  for (size_t i = 0; i < N; i++) {
-    real[N + i] = input[N - 1 - i];
+  // Reorder input: even indices first, then odd indices reversed
+  // [x[0], x[2], x[4], ...] followed by [x[N-1], x[N-3], x[N-5], ...]
+  for (size_t i = 0; i < N / 2; i++) {
+    real[i] = input[2 * i];           // Even indices: 0, 2, 4, ...
+    real[N - 1 - i] = input[2 * i + 1]; // Odd indices reversed: N-1, N-3, ...
   }
-  memset(imag, 0, M * sizeof(float));
+  memset(imag, 0, N * sizeof(float));
 
-  // Apply 2N-point FFT
-  fft_forward(real, imag, M);
+  // Apply N-point FFT
+  fft_forward(real, imag, N);
 
-  // Extract DCT coefficients
+  // Extract DCT coefficients with phase correction
   // DCT[k] = Re{FFT[k] * exp(-j*pi*k/(2*N))} * normalization
-  // Note: Need to divide by 2 because we doubled the signal length
   for (size_t k = 0; k < N; k++) {
     const float angle = -PI * k / (2.0f * N);
     const float wr = cosf(angle);
     const float wi = sinf(angle);
 
-    // Complex multiplication: (real + j*imag) * (wr + j*wi)
+    // Complex multiplication: (real[k] + j*imag[k]) * (wr + j*wi)
     // Real part: real*wr - imag*wi
     const float dct_value = real[k] * wr - imag[k] * wi;
 
-    // Apply DCT-II normalization (divide by 2 for double-length FFT)
+    // Apply DCT-II normalization
     if (k == 0) {
-      output[k] = dct_value * sqrtf(1.0f / N) / 2.0f;
+      output[k] = dct_value * sqrtf(1.0f / N);
     } else {
-      output[k] = dct_value * sqrtf(2.0f / N) / 2.0f;
+      output[k] = dct_value * sqrtf(2.0f / N);
     }
   }
 
@@ -145,48 +140,46 @@ void dct_fft(const float* input, float* output, size_t N) {
   delete[] imag;
 }
 
-// IDCT (Inverse DCT-II) via FFT using double-and-mirror method
-// This is the inverse of the DCT-II (used for synthesis)
+// IDCT (DCT-III) via FFT - inverse of the DCT-II reordering method
+// Reference: Numerical Recipes Chapter 12.3, Press et al.
 void idct_fft(const float* input, float* output, size_t N) {
   const float PI = 3.14159265358979323846f;
 
-  // Allocate temporary arrays for 2N-point FFT
-  const size_t M = 2 * N;
-  float* real = new float[M];
-  float* imag = new float[M];
+  // Allocate temporary arrays for N-point FFT
+  float* real = new float[N];
+  float* imag = new float[N];
 
-  // Prepare FFT input from DCT coefficients
-  // IDCT = Re{IFFT[DCT * exp(j*pi*k/(2*N))]} * 2
+  // Prepare FFT input with inverse phase correction
+  // FFT[k] = DCT[k] * exp(+j*pi*k/(2*N)) / normalization
+  // Note: DCT-III (inverse of DCT-II) requires factor of 2 for AC terms
   for (size_t k = 0; k < N; k++) {
-    const float angle = PI * k / (2.0f * N);  // Positive for inverse
+    const float angle = PI * k / (2.0f * N);  // Positive angle for inverse
     const float wr = cosf(angle);
     const float wi = sinf(angle);
 
-    // Apply inverse normalization
-    float scaled_input;
+    // Inverse of DCT-II normalization with correct DCT-III scaling
+    float scaled;
     if (k == 0) {
-      scaled_input = input[k] * sqrtf(N) * 2.0f;
+      scaled = input[k] / sqrtf(1.0f / N);
     } else {
-      scaled_input = input[k] * sqrtf(N / 2.0f) * 2.0f;
+      // DCT-III needs factor of 2 for AC terms
+      scaled = input[k] / sqrtf(2.0f / N) * 2.0f;
     }
 
-    // Complex multiplication: DCT[k] * exp(j*theta)
-    real[k] = scaled_input * wr;
-    imag[k] = scaled_input * wi;
-  }
-
-  // Fill second half with conjugate symmetry (for real output)
-  for (size_t k = 1; k < N; k++) {
-    real[M - k] = real[k];
-    imag[M - k] = -imag[k];
+    // Complex multiplication: scaled * (wr + j*wi)
+    real[k] = scaled * wr;
+    imag[k] = scaled * wi;
   }
 
   // Apply inverse FFT
-  fft_inverse(real, imag, M);
+  fft_inverse(real, imag, N);
 
-  // Extract first N samples (real part only, imag should be ~0)
-  for (size_t i = 0; i < N; i++) {
-    output[i] = real[i];
+  // Unpack: reverse the reordering from DCT
+  // Even output indices come from first half of FFT output
+  // Odd output indices come from second half (reversed)
+  for (size_t i = 0; i < N / 2; i++) {
+    output[2 * i] = real[i];              // Even positions
+    output[2 * i + 1] = real[N - 1 - i];  // Odd positions (reversed)
   }
 
   delete[] real;
diff --git a/src/audio/gen.cc b/src/audio/gen.cc
index 148fc68..0757b4d 100644
--- a/src/audio/gen.cc
+++ b/src/audio/gen.cc
@@ -69,9 +69,63 @@ std::vector<float> generate_note_spectrogram(const NoteParams& params,
     float dct_chunk[DCT_SIZE];
     fdct_512(pcm_chunk, dct_chunk);
 
-    // Copy to buffer
+    // Scale up to compensate for orthonormal normalization
+    // Old non-orthonormal DCT had no sqrt scaling, so output was ~sqrt(N/2) larger
+    // Scale factor: sqrt(DCT_SIZE / 2) = sqrt(256) = 16
+    //
+    // HOWEVER: After removing synthesis windowing (commit f998bfc), audio is louder.
+    // The old synthesis incorrectly applied Hamming window to spectrum (reducing energy by 0.63x).
+    // New synthesis is correct (no window), but procedural notes with 16x scaling are too loud.
+    //
+    // Analysis applies Hamming window (0.63x energy). With 16x scaling: 0.63 × 16 ≈ 10x.
+    // Divide by 2.5 to match the relative loudness increase: 16 / 2.5 = 6.4
+    const float scale_factor = sqrtf(DCT_SIZE / 2.0f) / 2.5f;
+
+    // Copy to buffer with scaling
     for (int i = 0; i < DCT_SIZE; ++i) {
-      spec_data[f * DCT_SIZE + i] = dct_chunk[i];
+      spec_data[f * DCT_SIZE + i] = dct_chunk[i] * scale_factor;
+    }
+  }
+
+  // Normalize to consistent RMS level (matching spectool --normalize behavior)
+  // 1. Synthesize PCM to measure actual output levels
+  std::vector<float> pcm_data(num_frames * DCT_SIZE);
+  for (int f = 0; f < num_frames; ++f) {
+    const float* spectral_frame = spec_data.data() + (f * DCT_SIZE);
+    float* time_frame = pcm_data.data() + (f * DCT_SIZE);
+    idct_512(spectral_frame, time_frame);
+  }
+
+  // 2. Calculate RMS and peak
+  float rms_sum = 0.0f;
+  float peak = 0.0f;
+  for (size_t i = 0; i < pcm_data.size(); ++i) {
+    const float abs_val = fabsf(pcm_data[i]);
+    if (abs_val > peak) {
+      peak = abs_val;
+    }
+    rms_sum += pcm_data[i] * pcm_data[i];
+  }
+  const float rms = sqrtf(rms_sum / pcm_data.size());
+
+  // 3. Normalize to target RMS (0.15, matching spectool default)
+  const float target_rms = 0.15f;
+  const float max_safe_peak = 1.0f; // Conservative: ensure output peak ≤ 1.0
+
+  if (rms > 1e-6f) {
+    // Calculate scale factor to reach target RMS
+    float norm_scale = target_rms / rms;
+
+    // Check if this would cause clipping
+    const float predicted_peak = peak * norm_scale;
+    if (predicted_peak > max_safe_peak) {
+      // Reduce scale to prevent clipping
+      norm_scale = max_safe_peak / peak;
+    }
+
+    // Apply normalization scale to spectrogram
+    for (size_t i = 0; i < spec_data.size(); ++i) {
+      spec_data[i] *= norm_scale;
     }
   }
 
diff --git a/src/audio/idct.cc b/src/audio/idct.cc
index f8e8769..4566f99 100644
--- a/src/audio/idct.cc
+++ b/src/audio/idct.cc
@@ -3,16 +3,9 @@
 // Used for real-time synthesis of audio from spectral data.
 
 #include "dct.h"
-#include <math.h>
+#include "fft.h"
 
+// Fast O(N log N) implementation using FFT
 void idct_512(const float* input, float* output) {
-  const float PI = 3.14159265358979323846f;
-  for (int n = 0; n < DCT_SIZE; ++n) {
-    float sum = input[0] / 2.0f;
-    for (int k = 1; k < DCT_SIZE; ++k) {
-      sum +=
-          input[k] * cosf(PI / (float)DCT_SIZE * (float)k * ((float)n + 0.5f));
-    }
-    output[n] = sum * (2.0f / (float)DCT_SIZE);
-  }
+  idct_fft(input, output, DCT_SIZE);
 }
diff --git a/src/audio/synth.cc b/src/audio/synth.cc
index ada46fd..2072bb4 100644
--- a/src/audio/synth.cc
+++ b/src/audio/synth.cc
@@ -41,9 +41,8 @@ static struct {
 
 static Voice g_voices[MAX_VOICES];
 static volatile float g_current_output_peak =
-    0.0f;                                   // Global peak for visualization
-static float g_hamming_window[WINDOW_SIZE]; // Static window for optimization
-static float g_tempo_scale = 1.0f;          // Playback speed multiplier
+    0.0f;                       // Global peak for visualization
+static float g_tempo_scale = 1.0f; // Playback speed multiplier
 
 #if !defined(STRIP_ALL)
 static float g_elapsed_time_sec = 0.0f; // Tracks elapsed time for event hooks
@@ -56,8 +55,6 @@ void synth_init() {
 #if !defined(STRIP_ALL)
   g_elapsed_time_sec = 0.0f;
 #endif /* !defined(STRIP_ALL) */
-  // Initialize the Hamming window once
-  hamming_window_512(g_hamming_window);
 }
 
 void synth_shutdown() {
@@ -214,10 +211,6 @@ void synth_trigger_voice(int spectrogram_id, float volume, float pan) {
 }
 
 void synth_render(float* output_buffer, int num_frames) {
-  // Use the pre-calculated window
-  // float window[WINDOW_SIZE];
-  // hamming_window_512(window);
-
   // Faster decay for more responsive visuals
   g_current_output_peak *= 0.90f;
 
@@ -243,13 +236,9 @@ void synth_render(float* output_buffer, int num_frames) {
         const float* spectral_frame = (const float*)v.active_spectral_data +
                                       (v.current_spectral_frame * DCT_SIZE);
 
-        float windowed_frame[DCT_SIZE];
-        for (int j = 0; j < DCT_SIZE; ++j) {
-          windowed_frame[j] =
-              spectral_frame[j] * g_hamming_window[j]; // Use static window
-        }
-
-        idct_512(windowed_frame, v.time_domain_buffer);
+        // IDCT directly - no windowing needed for synthesis
+        // (Window is only used during analysis, before DCT)
+        idct_512(spectral_frame, v.time_domain_buffer);
 
         v.buffer_pos = 0;
         ++v.current_spectral_frame;