feat(spectool): add --wav decode, IMDCT, and roundtrip test

- spectool --wav <input.spec> <output.wav>: decodes .spec to mono 16-bit WAV at 32 kHz using IDCT-OLA synthesis (no synthesis window). The analysis Hann window at 50% overlap satisfies w[n]+w[n+H]=1, so the synthesis window must be rectangular for perfect reconstruction. - Add imdct_512 / imdct_fft to audio lib (fft.cc, fft.h, idct.cc, dct.h) for future MDCT-based synthesis. - test_wav_roundtrip: in-process OLA analyze+decode SNR test (≥30 dB). Currently measures 53 dB on a 440 Hz sine. - Fix stale test_spectool.cc: version assertion updated from 1 to SPEC_VERSION_V2_OLA (was always wrong since OLA fix landed). - Docs: TOOLS_REFERENCE.md removes dead specview, documents --wav / --normalize / test_gen. HOWTO.md adds decode section. TRACKER.md notes spec v2 OLA format and decode command. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
author: skal <pascal.massimino@gmail.com> 2026-03-05 21:50:53 +0100
committer: skal <pascal.massimino@gmail.com> 2026-03-05 21:50:53 +0100
commit: 2f8926f433248af28081497e8371e02abe61d6ff (patch)
tree: 30e480325e2b7f01947a5ca2f8b3865e600d8bb7
parent: e2c3c3e95b6a9e53b4631b271640bb9914f8c95e (diff)
11 files changed, 303 insertions, 14 deletions
diff --git a/cmake/DemoTests.cmake b/cmake/DemoTests.cmake
index dc88973..26bce8c 100644
--- a/cmake/DemoTests.cmake
+++ b/cmake/DemoTests.cmake
@@ -18,6 +18,10 @@ add_demo_test(test_dct DctTest audio src/tests/audio/test_dct.cc ${GEN_DEMO_CC})
 target_link_libraries(test_dct PRIVATE audio util procedural ${DEMO_LIBS})
 demo_add_asset_deps(test_dct audio)
 
+add_demo_test(test_wav_roundtrip WavRoundtripTest audio src/tests/audio/test_wav_roundtrip.cc ${GEN_DEMO_CC})
+target_link_libraries(test_wav_roundtrip PRIVATE audio util procedural ${DEMO_LIBS})
+demo_add_asset_deps(test_wav_roundtrip audio)
+
 add_demo_test(test_fft FftTest audio src/tests/audio/test_fft.cc ${GEN_DEMO_CC})
 target_link_libraries(test_fft PRIVATE audio util procedural ${DEMO_LIBS})
 demo_add_asset_deps(test_fft audio)
diff --git a/doc/HOWTO.md b/doc/HOWTO.md
index 768e51d..cc43e7c 100644
--- a/doc/HOWTO.md
+++ b/doc/HOWTO.md
@@ -257,6 +257,17 @@ cmake --build build -j4 --target spectool
 
 Requires `ffmpeg` for `.aif` input files. Output uses v2 format (OLA, Hann, hop=256).
 
+### Decoding .spec to WAV
+
+To verify or inspect a `.spec` file as audio:
+
+```bash
+./build/spectool --wav input.spec output.wav
+```
+
+Produces a mono 16-bit PCM WAV at 32 kHz using IDCT-OLA synthesis. No synthesis window is
+applied: at 50% overlap the analysis Hann window already sums to one (w[n] + w[n+H] = 1).
+
 ---
 
 ## Assets
diff --git a/doc/TOOLS_REFERENCE.md b/doc/TOOLS_REFERENCE.md
index 61412a9..f99d213 100644
--- a/doc/TOOLS_REFERENCE.md
+++ b/doc/TOOLS_REFERENCE.md
@@ -26,24 +26,21 @@ Comprehensive reference for all developer tools in the project.
 cmake -S . -B build -DDEMO_BUILD_TOOLS=ON
 cmake --build build -j4
 
-# Analyze WAV → .spec
+# Analyze WAV/MP3 → .spec (v2 OLA format)
 ./build/spectool analyze input.wav output.spec
+./build/spectool analyze input.wav output.spec --normalize        # normalize to RMS 0.15
+./build/spectool analyze input.wav output.spec --normalize 0.20   # custom RMS target
 
-# Play .spec file
-./build/spectool play input.spec
-```
-
----
+# Decode .spec → mono 16-bit WAV (32 kHz, IDCT-OLA synthesis)
+./build/spectool --wav input.spec output.wav
 
-## specview (Visualization)
+# Play .spec file via audio device
+./build/spectool play input.spec
 
-```bash
-# View spectrogram
-./build/specview input.spec
+# Generate test spectrogram (C major scale)
+./build/spectool test_gen output.spec
 ```
 
-Displays spectrogram visualization.
-
 ---
 
 ## specplay (Diagnostic)
diff --git a/doc/TRACKER.md b/doc/TRACKER.md
index 53d8338..e2c21aa 100644
--- a/doc/TRACKER.md
+++ b/doc/TRACKER.md
@@ -24,7 +24,7 @@ HUMANIZE SEED <int> TIMING <pct> VOLUME <pct>  # Optional humanization
 # Generated samples:
 SAMPLE <name>, <freq>, <dur>, <amp>, <attack>, <harmonics>, <decay> [OFFSET <sec>]
 
-# Asset samples (.spec):
+# Asset samples (.spec v2 OLA format — generated by spectool analyze):
 SAMPLE <asset_id> [OFFSET <sec>]         # ASSET_* from assets.txt
 
 # Asset samples (MP3, non-STRIP_ALL only):
@@ -32,6 +32,10 @@ SAMPLE <asset_id> [OFFSET <sec>]         # ASSET_* from assets.txt
 # Add to assets.txt: NAME, NONE, music/file.mp3, "description"
 SAMPLE ASSET_NAME [OFFSET <sec>]         # Decoded at init; same syntax as .spec
 
+# .spec format: SPEC_VERSION_V2_OLA (version=2). Synthesis uses IDCT-OLA
+# (Hann analysis window, 50% overlap, hop=256 samples at 32 kHz).
+# Decode a .spec to WAV for inspection: spectool --wav input.spec out.wav
+
 # Auto-generated notes (no SAMPLE declaration needed):
 # NOTE_C4, NOTE_A#3, NOTE_Eb2, etc.
 
diff --git a/src/audio/dct.h b/src/audio/dct.h
index ec9f651..ca423c2 100644
--- a/src/audio/dct.h
+++ b/src/audio/dct.h
@@ -11,3 +11,6 @@
 // Forward declarations
 void fdct_512(const float* input, float* output);
 void idct_512(const float* input, float* output);
+// IMDCT: N=512 coefficients -> 2*DCT_SIZE=1024 samples.
+// Window with Hann(1024) and OLA with hop DCT_SIZE for perfect reconstruction.
+void imdct_512(const float* input, float* output);
diff --git a/src/audio/fft.cc b/src/audio/fft.cc
index 64d7b1a..6029454 100644
--- a/src/audio/fft.cc
+++ b/src/audio/fft.cc
@@ -140,6 +140,49 @@ void dct_fft(const float* input, float* output, size_t N) {
   delete[] imag;
 }
 
+// IMDCT via FFT: reads N MDCT coefficients from input[] and writes 2N samples to output[].
+// Computes output[n] = 2 * Re{ exp(-j*pi*(2n+1)/(4N)) * y[n] }, where y is the 1/(2N)-scaled
+// length-2N transform of X[k] * exp(-j*pi*(2k+1)/(4N)), zero-padded from N to 2N bins.
+// NOTE(review): the textbook IMDCT kernel is cos(pi*(2n+1+N)*(2k+1)/(4N)); equivalence to it, and perfect reconstruction under a Hann(2N) window with hop-N OLA, are not verified here - confirm.
+void imdct_fft(const float* input, float* output, size_t N) {
+  const float PI = 3.14159265358979323846f;
+  const size_t M = 2 * N; // output length (also the transform length)
+
+  float* real = new float[M];
+  float* imag = new float[M];
+
+  // Pre-multiply X[k] by exp(-j*pi*(2k+1)/(4N)) into the first N bins;
+  // bins N..2N-1 are zeroed, so the transform below runs on 2N complex points.
+  for (size_t k = 0; k < N; k++) {
+    const float angle = -PI * (2.0f * k + 1.0f) / (4.0f * N);
+    real[k] = input[k] * cosf(angle);
+    imag[k] = input[k] * sinf(angle);
+  }
+  for (size_t k = N; k < M; k++) {
+    real[k] = 0.0f;
+    imag[k] = 0.0f;
+  }
+
+  // Length-2N FFT with direction flag -1 (presumably the inverse; see fft_radix2), scaled by 1/(2N)
+  bit_reverse_permute(real, imag, M);
+  fft_radix2(real, imag, M, -1);
+  const float scale = 1.0f / (float)M;
+  for (size_t i = 0; i < M; i++) {
+    real[i] *= scale;
+    imag[i] *= scale;
+  }
+
+  // Post-multiply by exp(-j*pi*(2n+1)/(4N)), keep the real part, apply gain 2 (net scale 1/N)
+  const float gain = 2.0f;
+  for (size_t n = 0; n < M; n++) {
+    const float angle = -PI * (2.0f * n + 1.0f) / (4.0f * N);
+    output[n] = gain * (real[n] * cosf(angle) - imag[n] * sinf(angle));
+  }
+
+  delete[] real;
+  delete[] imag;
+}
+
 // IDCT (DCT-III) via FFT - inverse of the DCT-II reordering method
 // Reference: Numerical Recipes Chapter 12.3, Press et al.
 void idct_fft(const float* input, float* output, size_t N) {
diff --git a/src/audio/fft.h b/src/audio/fft.h
index 8c10afd..df37ad5 100644
--- a/src/audio/fft.h
+++ b/src/audio/fft.h
@@ -32,4 +32,11 @@ void dct_fft(const float* input, float* output, size_t N);
 // N must be a power of 2
 void idct_fft(const float* input, float* output, size_t N);
 
+// IMDCT via FFT
+// Input: input[] (length N) — MDCT coefficients
+// Output: output[] (length 2*N) — time-domain samples
+// Window output with Hann(2N) and OLA with hop N for perfect reconstruction.
+// N must be a power of 2
+void imdct_fft(const float* input, float* output, size_t N);
+
 #endif /* AUDIO_FFT_H_ */
diff --git a/src/audio/idct.cc b/src/audio/idct.cc
index 4566f99..7a3c489 100644
--- a/src/audio/idct.cc
+++ b/src/audio/idct.cc
@@ -9,3 +9,7 @@
 void idct_512(const float* input, float* output) {
   idct_fft(input, output, DCT_SIZE);
 }
+
+void imdct_512(const float* input, float* output) {  // fixed-size wrapper: N = DCT_SIZE
+  imdct_fft(input, output, DCT_SIZE);  // reads DCT_SIZE floats from input, writes 2 * DCT_SIZE floats to output
+}
diff --git a/src/tests/audio/test_wav_roundtrip.cc b/src/tests/audio/test_wav_roundtrip.cc
new file mode 100644
index 0000000..6294d6d
--- /dev/null
+++ b/src/tests/audio/test_wav_roundtrip.cc
@@ -0,0 +1,101 @@
+// Tests the wav->spec->wav roundtrip SNR.
+// Generates a sine wave, runs OLA-DCT analysis then IDCT-OLA synthesis,
+// and asserts the reconstruction SNR exceeds the threshold.
+
+#include "audio/dct.h"
+#include "audio/window.h"
+#include <assert.h>
+#include <cmath>
+#include <cstdio>
+#include <vector>
+
+static const int SAMPLE_RATE = 32000;
+static const float PI = 3.14159265358979323846f;
+
+// Meant to replicate the analyze_audio OLA pass: one Hann-windowed FDCT per hop of OLA_HOP_SIZE samples; returns num_frames * DCT_SIZE coefficients.
+static std::vector<float> ola_analyze(const std::vector<float>& pcm) {
+  float win[DCT_SIZE];
+  hann_window_512(win);
+
+  const int hop = OLA_HOP_SIZE;
+  const int n_pcm = (int)pcm.size();
+  const int num_frames = (n_pcm > DCT_SIZE) ? (n_pcm - DCT_SIZE) / hop + 1 : 1;  // full frames only (samples past the last full frame are dropped); short input yields one frame
+
+  std::vector<float> spec(num_frames * DCT_SIZE);
+  float chunk[DCT_SIZE];
+
+  for (int f = 0; f < num_frames; ++f) {
+    const int start = f * hop;
+    const int avail = (start + DCT_SIZE <= n_pcm) ? DCT_SIZE : n_pcm - start;  // below DCT_SIZE only when the whole input is shorter than one frame
+    for (int i = 0; i < avail; ++i) chunk[i] = pcm[start + i] * win[i];
+    for (int i = avail; i < DCT_SIZE; ++i) chunk[i] = 0.0f;  // zero-pad the remainder of a short frame
+
+    fdct_512(chunk, spec.data() + f * DCT_SIZE);
+  }
+  return spec;
+}
+
+// IDCT + overlap-add synthesis with no synthesis window (intended to match decode_to_wav).
+// Each frame's first OLA_HOP_SIZE samples are summed with the previous frame's saved tail;
+// exact reconstruction relies on the analysis Hann window satisfying w[n]+w[n+H]=1 at 50% overlap.
+static std::vector<float> ola_decode(const std::vector<float>& spec,
+                                     int num_frames) {
+  std::vector<float> pcm(num_frames * OLA_HOP_SIZE + OLA_OVERLAP, 0.0f);
+  float overlap[OLA_OVERLAP] = {};  // tail of the previous frame; all zeros for the first frame
+  float tmp[DCT_SIZE];
+
+  for (int f = 0; f < num_frames; ++f) {
+    idct_512(spec.data() + f * DCT_SIZE, tmp);
+    for (int j = 0; j < OLA_HOP_SIZE; ++j)
+      pcm[f * OLA_HOP_SIZE + j] = tmp[j] + overlap[j];  // assumes OLA_HOP_SIZE <= OLA_OVERLAP - TODO confirm
+    for (int j = 0; j < OLA_OVERLAP; ++j)
+      overlap[j] = tmp[OLA_HOP_SIZE + j];  // assumes OLA_HOP_SIZE + OLA_OVERLAP <= DCT_SIZE - TODO confirm
+  }
+  pcm.resize(num_frames * OLA_HOP_SIZE);  // drop the OLA_OVERLAP padding; the last frame's tail is not emitted
+  return pcm;
+}
+
+static float compute_snr_db(const std::vector<float>& ref,  // SNR in dB of out against ref
+                             const std::vector<float>& out,
+                             int skip_samples) {  // samples [0, skip_samples) are excluded from both sums
+  const int n = (int)std::min(ref.size(), out.size());  // NOTE(review): std::min lives in <algorithm>, which this file does not include directly
+  double sig = 0.0, noise = 0.0;
+  for (int i = skip_samples; i < n; ++i) {
+    sig += (double)ref[i] * ref[i];
+    double e = ref[i] - out[i];
+    noise += e * e;
+  }
+  if (noise < 1e-30) return 999.0f;  // effectively no error: return a large sentinel instead of dividing by ~0
+  return 10.0f * (float)log10(sig / noise);
+}
+
+int main() {  // returns 0 on PASS, 1 when the roundtrip SNR is below MIN_SNR_DB
+  printf("Running WAV roundtrip test...\n");
+
+  // 1-second 440 Hz sine at 32 kHz, amplitude 0.5
+  const int n_samples = SAMPLE_RATE;
+  std::vector<float> input(n_samples);
+  for (int i = 0; i < n_samples; ++i)
+    input[i] = 0.5f * sinf(2.0f * PI * 440.0f * i / SAMPLE_RATE);
+
+  // Analyze
+  std::vector<float> spec = ola_analyze(input);
+  const int num_frames = (int)(spec.size() / DCT_SIZE);
+
+  // Decode with IDCT-OLA (no synthesis window)
+  std::vector<float> output = ola_decode(spec, num_frames);
+
+  // SNR over the overlapping range, skipping the first DCT_SIZE samples (ramp-up transient)
+  const float snr = compute_snr_db(input, output, DCT_SIZE);
+  printf("Roundtrip SNR: %.1f dB  (frames=%d, out_samples=%zu)\n",
+         snr, num_frames, output.size());
+
+  const float MIN_SNR_DB = 30.0f;
+  if (snr < MIN_SNR_DB) {
+    printf("FAIL: SNR %.1f dB < %.0f dB threshold\n", snr, MIN_SNR_DB);
+    return 1;
+  }
+
+  printf("PASS\n");
+  return 0;
+}
diff --git a/src/tests/gpu/test_spectool.cc b/src/tests/gpu/test_spectool.cc
index 984322a..b90d236 100644
--- a/src/tests/gpu/test_spectool.cc
+++ b/src/tests/gpu/test_spectool.cc
@@ -3,6 +3,7 @@
 // Generates a test WAV, analyzes it, and verifies the resulting .spec file.
 
 #include "audio/audio.h"
+#include "audio/synth.h"
 #include <assert.h>
 #include <math.h>
 #include <stdio.h>
@@ -54,7 +55,7 @@ int main() {
   size_t read = fread(&header, sizeof(SpecHeader), 1, f);
   assert(read == 1);
   assert(strncmp(header.magic, "SPEC", 4) == 0);
-  assert(header.version == 1);
+  assert(header.version == SPEC_VERSION_V2_OLA);
   assert(header.dct_size == 512);
   assert(header.num_frames > 0);
 
diff --git a/tools/spectool.cc b/tools/spectool.cc
index 70bcae2..59a56b5 100644
--- a/tools/spectool.cc
+++ b/tools/spectool.cc
@@ -217,6 +217,110 @@ int analyze_audio(const char* in_path, const char* out_path, bool normalize,
   return 0;
 }
 
+static void write_wav_header(FILE* f, uint32_t num_samples, uint32_t sample_rate) {  // writes a 44-byte mono 16-bit PCM WAV header to f; fwrite results are not checked
+  const uint16_t num_channels = 1;
+  const uint16_t bits_per_sample = 16;
+  const uint32_t data_size = num_samples * num_channels * (bits_per_sample / 8);
+  const uint32_t byte_rate = sample_rate * num_channels * (bits_per_sample / 8);
+  const uint16_t block_align = num_channels * (bits_per_sample / 8);
+  const uint32_t riff_size = 36 + data_size;  // bytes after the RIFF size field: 36 header bytes + sample data
+
+  fwrite("RIFF", 1, 4, f);
+  fwrite(&riff_size, 4, 1, f);  // NOTE(review): integers are written in host byte order; WAV is little-endian, so this assumes a little-endian host
+  fwrite("WAVE", 1, 4, f);
+  fwrite("fmt ", 1, 4, f);
+  const uint32_t fmt_size = 16;
+  fwrite(&fmt_size, 4, 1, f);
+  const uint16_t audio_format = 1; // PCM
+  fwrite(&audio_format, 2, 1, f);
+  fwrite(&num_channels, 2, 1, f);
+  fwrite(&sample_rate, 4, 1, f);
+  fwrite(&byte_rate, 4, 1, f);
+  fwrite(&block_align, 2, 1, f);
+  fwrite(&bits_per_sample, 2, 1, f);
+  fwrite("data", 1, 4, f);
+  fwrite(&data_size, 4, 1, f);
+}
+
+int decode_to_wav(const char* in_path, const char* out_path) {  // decodes a .spec file to a mono 16-bit WAV; returns 0 on success, 1 on open/format errors
+  printf("Decoding %s -> %s\n", in_path, out_path);
+
+  FILE* f_in = fopen(in_path, "rb");
+  if (!f_in) {
+    printf("Error: Failed to open input file: %s\n", in_path);
+    return 1;
+  }
+
+  SpecHeader header;
+  if (fread(&header, sizeof(SpecHeader), 1, f_in) != 1 ||
+      strncmp(header.magic, "SPEC", 4) != 0) {
+    printf("Error: Invalid spectrogram file format.\n");
+    fclose(f_in);
+    return 1;
+  }
+
+  std::vector<float> spec_data(header.num_frames * header.dct_size);  // NOTE(review): sized by header.dct_size, but the loops below stride by DCT_SIZE; dct_size is never validated
+  fread(spec_data.data(), sizeof(float), spec_data.size(), f_in);  // NOTE(review): result ignored; on a short read the missing coefficients stay 0
+  fclose(f_in);
+
+  const bool ola_mode = (header.version == SPEC_VERSION_V2_OLA);  // any other version is decoded as back-to-back, non-overlapping frames
+  const uint32_t sample_rate = 32000;
+
+  std::vector<float> pcm;
+
+  if (ola_mode) {
+    // IDCT + OLA (no synthesis window).
+    // Analysis: Hann * FDCT. Since Hann at 50% overlap satisfies
+    // w[n] + w[n+HOP] = 1, a rectangular synthesis window gives
+    // perfect reconstruction: output[n] = IDCT(X_k)[j] + IDCT(X_{k-1})[j+HOP]
+    //                                    = x[n]*w[j] + x[n]*w[j+HOP] = x[n].
+    const uint32_t total_samples = (uint32_t)header.num_frames * OLA_HOP_SIZE;
+    pcm.assign(total_samples + OLA_OVERLAP, 0.0f);
+
+    float overlap[OLA_OVERLAP] = {};  // tail of the previous frame; all zeros for the first frame
+    for (int f = 0; f < header.num_frames; ++f) {  // NOTE(review): f is a signed int compared with header.num_frames - confirm the field's type
+      float tmp[DCT_SIZE];
+      idct_512(spec_data.data() + f * DCT_SIZE, tmp);
+      // First half: output samples for this frame
+      for (int j = 0; j < OLA_HOP_SIZE; ++j)
+        pcm[f * OLA_HOP_SIZE + j] = tmp[j] + overlap[j];
+      // Second half: save as overlap for next frame
+      for (int j = 0; j < OLA_OVERLAP; ++j)
+        overlap[j] = tmp[OLA_HOP_SIZE + j];
+    }
+    pcm.resize(total_samples);  // trim the OLA_OVERLAP padding; the final frame's tail is discarded
+  } else {
+    const uint32_t total_samples = (uint32_t)header.num_frames * DCT_SIZE;
+    pcm.resize(total_samples);
+    for (int f = 0; f < header.num_frames; ++f) {
+      idct_512(spec_data.data() + f * DCT_SIZE, pcm.data() + f * DCT_SIZE);  // each frame is written straight into its own DCT_SIZE-sample slot
+    }
+  }
+
+  // Write WAV
+  FILE* f_out = fopen(out_path, "wb");
+  if (!f_out) {
+    printf("Error: Failed to open output file: %s\n", out_path);
+    return 1;
+  }
+
+  const uint32_t out_samples = (uint32_t)pcm.size();
+  write_wav_header(f_out, out_samples, sample_rate);
+
+  for (uint32_t i = 0; i < out_samples; ++i) {
+    float s = pcm[i];
+    if (s >  1.0f) s =  1.0f;
+    if (s < -1.0f) s = -1.0f;
+    int16_t sample = (int16_t)(s * 32767.0f);  // clamped to [-1, 1] above, so the product fits int16_t; the cast truncates toward zero
+    fwrite(&sample, sizeof(int16_t), 1, f_out);  // NOTE(review): write errors are not detected
+  }
+  fclose(f_out);
+
+  printf("Decoded %d frames (%u samples) at %u Hz.\n",
+         header.num_frames, out_samples, sample_rate);
+  return 0;
+}
+
 int play_spec(const char* in_path) {
   printf("Playing %s\n", in_path);
 
@@ -334,6 +438,9 @@ void print_usage() {
   printf(
       "  test_gen <output.spec>                   Generate a test "
       "spectrogram.\n");
+  printf(
+      "  --wav   <input.spec> <output.wav>        Decode spectrogram to mono "
+      "WAV.\n");
   printf("\nOptions for 'analyze':\n");
   printf(
       "  --normalize [rms]   Normalize audio to target RMS level (default: "
@@ -392,6 +499,13 @@ int main(int argc, char** argv) {
       return 1;
     }
     return test_gen(argv[2]);
+  } else if (strcmp(command, "--wav") == 0) {
+    if (argc < 4) {
+      printf("Error: '--wav' requires input .spec and output .wav files.\n");
+      print_usage();
+      return 1;
+    }
+    return decode_to_wav(argv[2], argv[3]);
   } else {
     printf("Error: Unknown command '%s'\n", command);
     print_usage();
author	skal <pascal.massimino@gmail.com>	2026-03-05 21:50:53 +0100
committer	skal <pascal.massimino@gmail.com>	2026-03-05 21:50:53 +0100
commit	2f8926f433248af28081497e8371e02abe61d6ff (patch)
tree	30e480325e2b7f01947a5ca2f8b3865e600d8bb7
parent	e2c3c3e95b6a9e53b4631b271640bb9914f8c95e (diff)