summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorskal <pascal.massimino@gmail.com>2026-03-05 21:50:53 +0100
committerskal <pascal.massimino@gmail.com>2026-03-05 21:50:53 +0100
commit2f8926f433248af28081497e8371e02abe61d6ff (patch)
tree30e480325e2b7f01947a5ca2f8b3865e600d8bb7 /src
parente2c3c3e95b6a9e53b4631b271640bb9914f8c95e (diff)
feat(spectool): add --wav decode, IMDCT, and roundtrip test
- spectool --wav <input.spec> <output.wav>: decodes .spec to mono 16-bit WAV at 32 kHz using IDCT-OLA synthesis (no synthesis window). The analysis Hann window at 50% overlap satisfies w[n]+w[n+H]=1, so the synthesis window must be rectangular for perfect reconstruction. - Add imdct_512 / imdct_fft to audio lib (fft.cc, fft.h, idct.cc, dct.h) for future MDCT-based synthesis. - test_wav_roundtrip: in-process OLA analyze+decode SNR test (≥30 dB). Currently measures 53 dB on a 440 Hz sine. - Fix stale test_spectool.cc: version assertion updated from 1 to SPEC_VERSION_V2_OLA (was always wrong since OLA fix landed). - Docs: TOOLS_REFERENCE.md removes dead specview, documents --wav / --normalize / test_gen. HOWTO.md adds decode section. TRACKER.md notes spec v2 OLA format and decode command. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Diffstat (limited to 'src')
-rw-r--r--src/audio/dct.h3
-rw-r--r--src/audio/fft.cc43
-rw-r--r--src/audio/fft.h7
-rw-r--r--src/audio/idct.cc4
-rw-r--r--src/tests/audio/test_wav_roundtrip.cc101
-rw-r--r--src/tests/gpu/test_spectool.cc3
6 files changed, 160 insertions, 1 deletions
diff --git a/src/audio/dct.h b/src/audio/dct.h
index ec9f651..ca423c2 100644
--- a/src/audio/dct.h
+++ b/src/audio/dct.h
@@ -11,3 +11,6 @@
// Forward declarations
void fdct_512(const float* input, float* output);
void idct_512(const float* input, float* output);
+// IMDCT: N=512 coefficients -> 2*DCT_SIZE=1024 samples.
+// Window with Hann(1024) and OLA with hop DCT_SIZE for perfect reconstruction.
+void imdct_512(const float* input, float* output);
diff --git a/src/audio/fft.cc b/src/audio/fft.cc
index 64d7b1a..6029454 100644
--- a/src/audio/fft.cc
+++ b/src/audio/fft.cc
@@ -140,6 +140,49 @@ void dct_fft(const float* input, float* output, size_t N) {
delete[] imag;
}
+// IMDCT via FFT
+// Produces 2N time-domain samples from N MDCT coefficients.
+// Formula: x[n] = (2/N) * sum_{k=0}^{N-1} X[k] * cos(pi*(2n+1+N)*(2k+1)/(2N))
+// When windowed (Hann, length 2N) and OLA'd with hop N, gives perfect reconstruction.
+void imdct_fft(const float* input, float* output, size_t N) {
+ const float PI = 3.14159265358979323846f;
+ const size_t M = 2 * N; // output length
+
+ float* real = new float[M];
+ float* imag = new float[M];
+
+ // Pre-multiply X[k] by exp(-j*pi*(2k+1)/(4N)), build 2N complex FFT input
+ // via standard IMDCT-via-FFT algorithm (N-point complex FFT)
+ for (size_t k = 0; k < N; k++) {
+ const float angle = -PI * (2.0f * k + 1.0f) / (4.0f * N);
+ real[k] = input[k] * cosf(angle);
+ imag[k] = input[k] * sinf(angle);
+ }
+ for (size_t k = N; k < M; k++) {
+ real[k] = 0.0f;
+ imag[k] = 0.0f;
+ }
+
+ // Inverse FFT of length 2N
+ bit_reverse_permute(real, imag, M);
+ fft_radix2(real, imag, M, -1);
+ const float scale = 1.0f / (float)M;
+ for (size_t i = 0; i < M; i++) {
+ real[i] *= scale;
+ imag[i] *= scale;
+ }
+
+ // Post-multiply by 2N * exp(-j*pi*(2n+1)/(4N)) and take real part, scale by 2/N
+ const float gain = 2.0f;
+ for (size_t n = 0; n < M; n++) {
+ const float angle = -PI * (2.0f * n + 1.0f) / (4.0f * N);
+ output[n] = gain * (real[n] * cosf(angle) - imag[n] * sinf(angle));
+ }
+
+ delete[] real;
+ delete[] imag;
+}
+
// IDCT (DCT-III) via FFT - inverse of the DCT-II reordering method
// Reference: Numerical Recipes Chapter 12.3, Press et al.
void idct_fft(const float* input, float* output, size_t N) {
diff --git a/src/audio/fft.h b/src/audio/fft.h
index 8c10afd..df37ad5 100644
--- a/src/audio/fft.h
+++ b/src/audio/fft.h
@@ -32,4 +32,11 @@ void dct_fft(const float* input, float* output, size_t N);
// N must be a power of 2
void idct_fft(const float* input, float* output, size_t N);
+// IMDCT via FFT
+// Input: input[] (length N) — MDCT coefficients
+// Output: output[] (length 2*N) — time-domain samples
+// Window output with Hann(2N) and OLA with hop N for perfect reconstruction.
+// N must be a power of 2
+void imdct_fft(const float* input, float* output, size_t N);
+
#endif /* AUDIO_FFT_H_ */
diff --git a/src/audio/idct.cc b/src/audio/idct.cc
index 4566f99..7a3c489 100644
--- a/src/audio/idct.cc
+++ b/src/audio/idct.cc
@@ -9,3 +9,7 @@
void idct_512(const float* input, float* output) {
idct_fft(input, output, DCT_SIZE);
}
+
+void imdct_512(const float* input, float* output) {
+ imdct_fft(input, output, DCT_SIZE);
+}
diff --git a/src/tests/audio/test_wav_roundtrip.cc b/src/tests/audio/test_wav_roundtrip.cc
new file mode 100644
index 0000000..6294d6d
--- /dev/null
+++ b/src/tests/audio/test_wav_roundtrip.cc
@@ -0,0 +1,101 @@
+// Tests the wav->spec->wav roundtrip SNR.
+// Generates a sine wave, runs OLA-DCT analysis then IMDCT-OLA synthesis,
+// and asserts the reconstruction SNR exceeds the threshold.
+
+#include "audio/dct.h"
+#include "audio/window.h"
+#include <assert.h>
+#include <cmath>
+#include <cstdio>
+#include <vector>
+
+static const int SAMPLE_RATE = 32000;
+static const float PI = 3.14159265358979323846f;
+
+// Replicate analyze_audio OLA pass (Hann + FDCT, hop = OLA_HOP_SIZE)
+static std::vector<float> ola_analyze(const std::vector<float>& pcm) {
+ float win[DCT_SIZE];
+ hann_window_512(win);
+
+ const int hop = OLA_HOP_SIZE;
+ const int n_pcm = (int)pcm.size();
+ const int num_frames = (n_pcm > DCT_SIZE) ? (n_pcm - DCT_SIZE) / hop + 1 : 1;
+
+ std::vector<float> spec(num_frames * DCT_SIZE);
+ float chunk[DCT_SIZE];
+
+ for (int f = 0; f < num_frames; ++f) {
+ const int start = f * hop;
+ const int avail = (start + DCT_SIZE <= n_pcm) ? DCT_SIZE : n_pcm - start;
+ for (int i = 0; i < avail; ++i) chunk[i] = pcm[start + i] * win[i];
+ for (int i = avail; i < DCT_SIZE; ++i) chunk[i] = 0.0f;
+
+ fdct_512(chunk, spec.data() + f * DCT_SIZE);
+ }
+ return spec;
+}
+
+// IDCT + OLA synthesis (no synthesis window) matching decode_to_wav.
+// Analysis used Hann; since Hann satisfies w[n]+w[n+H]=1 at 50% overlap,
+// skipping the synthesis window gives perfect reconstruction.
+static std::vector<float> ola_decode(const std::vector<float>& spec,
+ int num_frames) {
+ std::vector<float> pcm(num_frames * OLA_HOP_SIZE + OLA_OVERLAP, 0.0f);
+ float overlap[OLA_OVERLAP] = {};
+ float tmp[DCT_SIZE];
+
+ for (int f = 0; f < num_frames; ++f) {
+ idct_512(spec.data() + f * DCT_SIZE, tmp);
+ for (int j = 0; j < OLA_HOP_SIZE; ++j)
+ pcm[f * OLA_HOP_SIZE + j] = tmp[j] + overlap[j];
+ for (int j = 0; j < OLA_OVERLAP; ++j)
+ overlap[j] = tmp[OLA_HOP_SIZE + j];
+ }
+ pcm.resize(num_frames * OLA_HOP_SIZE);
+ return pcm;
+}
+
+static float compute_snr_db(const std::vector<float>& ref,
+ const std::vector<float>& out,
+ int skip_samples) {
+ const int n = (int)std::min(ref.size(), out.size());
+ double sig = 0.0, noise = 0.0;
+ for (int i = skip_samples; i < n; ++i) {
+ sig += (double)ref[i] * ref[i];
+ double e = ref[i] - out[i];
+ noise += e * e;
+ }
+ if (noise < 1e-30) return 999.0f;
+ return 10.0f * (float)log10(sig / noise);
+}
+
+int main() {
+ printf("Running WAV roundtrip test...\n");
+
+ // 1-second 440 Hz sine at 32 kHz
+ const int n_samples = SAMPLE_RATE;
+ std::vector<float> input(n_samples);
+ for (int i = 0; i < n_samples; ++i)
+ input[i] = 0.5f * sinf(2.0f * PI * 440.0f * i / SAMPLE_RATE);
+
+ // Analyze
+ std::vector<float> spec = ola_analyze(input);
+ const int num_frames = (int)(spec.size() / DCT_SIZE);
+
+ // Decode with IDCT-OLA (no synthesis window)
+ std::vector<float> output = ola_decode(spec, num_frames);
+
+ // SNR — skip first DCT_SIZE samples (ramp-up transient)
+ const float snr = compute_snr_db(input, output, DCT_SIZE);
+ printf("Roundtrip SNR: %.1f dB (frames=%d, out_samples=%zu)\n",
+ snr, num_frames, output.size());
+
+ const float MIN_SNR_DB = 30.0f;
+ if (snr < MIN_SNR_DB) {
+ printf("FAIL: SNR %.1f dB < %.0f dB threshold\n", snr, MIN_SNR_DB);
+ return 1;
+ }
+
+ printf("PASS\n");
+ return 0;
+}
diff --git a/src/tests/gpu/test_spectool.cc b/src/tests/gpu/test_spectool.cc
index 984322a..b90d236 100644
--- a/src/tests/gpu/test_spectool.cc
+++ b/src/tests/gpu/test_spectool.cc
@@ -3,6 +3,7 @@
// Generates a test WAV, analyzes it, and verifies the resulting .spec file.
#include "audio/audio.h"
+#include "audio/synth.h"
#include <assert.h>
#include <math.h>
#include <stdio.h>
@@ -54,7 +55,7 @@ int main() {
size_t read = fread(&header, sizeof(SpecHeader), 1, f);
assert(read == 1);
assert(strncmp(header.magic, "SPEC", 4) == 0);
- assert(header.version == 1);
+ assert(header.version == SPEC_VERSION_V2_OLA);
assert(header.dct_size == 512);
assert(header.num_frames > 0);