diff options
| author | skal <pascal.massimino@gmail.com> | 2026-03-05 21:50:53 +0100 |
|---|---|---|
| committer | skal <pascal.massimino@gmail.com> | 2026-03-05 21:50:53 +0100 |
| commit | 2f8926f433248af28081497e8371e02abe61d6ff (patch) | |
| tree | 30e480325e2b7f01947a5ca2f8b3865e600d8bb7 /src | |
| parent | e2c3c3e95b6a9e53b4631b271640bb9914f8c95e (diff) | |
feat(spectool): add --wav decode, IMDCT, and roundtrip test
- spectool --wav <input.spec> <output.wav>: decodes .spec to mono 16-bit
WAV at 32 kHz using IDCT-OLA synthesis (no synthesis window).
The analysis Hann window at 50% overlap satisfies w[n]+w[n+H]=1,
so the synthesis window must be rectangular for perfect reconstruction.
- Add imdct_512 / imdct_fft to audio lib (fft.cc, fft.h, idct.cc, dct.h)
for future MDCT-based synthesis.
- test_wav_roundtrip: in-process OLA analyze+decode SNR test (≥30 dB).
Currently measures 53 dB on a 440 Hz sine.
- Fix stale test_spectool.cc: version assertion updated from 1 to
SPEC_VERSION_V2_OLA (was always wrong since OLA fix landed).
- Docs: TOOLS_REFERENCE.md removes dead specview, documents --wav /
--normalize / test_gen. HOWTO.md adds decode section. TRACKER.md
notes spec v2 OLA format and decode command.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Diffstat (limited to 'src')
| -rw-r--r-- | src/audio/dct.h | 3 | ||||
| -rw-r--r-- | src/audio/fft.cc | 43 | ||||
| -rw-r--r-- | src/audio/fft.h | 7 | ||||
| -rw-r--r-- | src/audio/idct.cc | 4 | ||||
| -rw-r--r-- | src/tests/audio/test_wav_roundtrip.cc | 101 | ||||
| -rw-r--r-- | src/tests/gpu/test_spectool.cc | 3 |
6 files changed, 160 insertions, 1 deletions
diff --git a/src/audio/dct.h b/src/audio/dct.h index ec9f651..ca423c2 100644 --- a/src/audio/dct.h +++ b/src/audio/dct.h @@ -11,3 +11,6 @@ // Forward declarations void fdct_512(const float* input, float* output); void idct_512(const float* input, float* output); +// IMDCT: N=512 coefficients -> 2*DCT_SIZE=1024 samples. +// Window with Hann(1024) and OLA with hop DCT_SIZE for perfect reconstruction. +void imdct_512(const float* input, float* output); diff --git a/src/audio/fft.cc b/src/audio/fft.cc index 64d7b1a..6029454 100644 --- a/src/audio/fft.cc +++ b/src/audio/fft.cc @@ -140,6 +140,49 @@ void dct_fft(const float* input, float* output, size_t N) { delete[] imag; } +// IMDCT via FFT +// Produces 2N time-domain samples from N MDCT coefficients. +// Formula: x[n] = (2/N) * sum_{k=0}^{N-1} X[k] * cos(pi*(2n+1+N)*(2k+1)/(2N)) +// When windowed (Hann, length 2N) and OLA'd with hop N, gives perfect reconstruction. +void imdct_fft(const float* input, float* output, size_t N) { + const float PI = 3.14159265358979323846f; + const size_t M = 2 * N; // output length + + float* real = new float[M]; + float* imag = new float[M]; + + // Pre-multiply X[k] by exp(-j*pi*(2k+1)/(4N)), build 2N complex FFT input + // via standard IMDCT-via-FFT algorithm (N-point complex FFT) + for (size_t k = 0; k < N; k++) { + const float angle = -PI * (2.0f * k + 1.0f) / (4.0f * N); + real[k] = input[k] * cosf(angle); + imag[k] = input[k] * sinf(angle); + } + for (size_t k = N; k < M; k++) { + real[k] = 0.0f; + imag[k] = 0.0f; + } + + // Inverse FFT of length 2N + bit_reverse_permute(real, imag, M); + fft_radix2(real, imag, M, -1); + const float scale = 1.0f / (float)M; + for (size_t i = 0; i < M; i++) { + real[i] *= scale; + imag[i] *= scale; + } + + // Post-multiply by 2N * exp(-j*pi*(2n+1)/(4N)) and take real part, scale by 2/N + const float gain = 2.0f; + for (size_t n = 0; n < M; n++) { + const float angle = -PI * (2.0f * n + 1.0f) / (4.0f * N); + output[n] = gain * (real[n] * cosf(angle) - imag[n] * sinf(angle)); + } + + delete[] real; + delete[] imag; +} + // IDCT (DCT-III) via FFT - inverse of the DCT-II reordering method // Reference: Numerical Recipes Chapter 12.3, Press et al. void idct_fft(const float* input, float* output, size_t N) { diff --git a/src/audio/fft.h b/src/audio/fft.h index 8c10afd..df37ad5 100644 --- a/src/audio/fft.h +++ b/src/audio/fft.h @@ -32,4 +32,11 @@ void dct_fft(const float* input, float* output, size_t N); // N must be a power of 2 void idct_fft(const float* input, float* output, size_t N); +// IMDCT via FFT +// Input: input[] (length N) — MDCT coefficients +// Output: output[] (length 2*N) — time-domain samples +// Window output with Hann(2N) and OLA with hop N for perfect reconstruction. +// N must be a power of 2 +void imdct_fft(const float* input, float* output, size_t N); + #endif /* AUDIO_FFT_H_ */ diff --git a/src/audio/idct.cc b/src/audio/idct.cc index 4566f99..7a3c489 100644 --- a/src/audio/idct.cc +++ b/src/audio/idct.cc @@ -9,3 +9,7 @@ void idct_512(const float* input, float* output) { idct_fft(input, output, DCT_SIZE); } + +void imdct_512(const float* input, float* output) { + imdct_fft(input, output, DCT_SIZE); +} diff --git a/src/tests/audio/test_wav_roundtrip.cc b/src/tests/audio/test_wav_roundtrip.cc new file mode 100644 index 0000000..6294d6d --- /dev/null +++ b/src/tests/audio/test_wav_roundtrip.cc @@ -0,0 +1,101 @@ +// Tests the wav->spec->wav roundtrip SNR. +// Generates a sine wave, runs OLA-DCT analysis then IMDCT-OLA synthesis, +// and asserts the reconstruction SNR exceeds the threshold. + +#include "audio/dct.h" +#include "audio/window.h" +#include <assert.h> +#include <cmath> +#include <cstdio> +#include <vector> + +static const int SAMPLE_RATE = 32000; +static const float PI = 3.14159265358979323846f; + +// Replicate analyze_audio OLA pass (Hann + FDCT, hop = OLA_HOP_SIZE) +static std::vector<float> ola_analyze(const std::vector<float>& pcm) { + float win[DCT_SIZE]; + hann_window_512(win); + + const int hop = OLA_HOP_SIZE; + const int n_pcm = (int)pcm.size(); + const int num_frames = (n_pcm > DCT_SIZE) ? (n_pcm - DCT_SIZE) / hop + 1 : 1; + + std::vector<float> spec(num_frames * DCT_SIZE); + float chunk[DCT_SIZE]; + + for (int f = 0; f < num_frames; ++f) { + const int start = f * hop; + const int avail = (start + DCT_SIZE <= n_pcm) ? DCT_SIZE : n_pcm - start; + for (int i = 0; i < avail; ++i) chunk[i] = pcm[start + i] * win[i]; + for (int i = avail; i < DCT_SIZE; ++i) chunk[i] = 0.0f; + + fdct_512(chunk, spec.data() + f * DCT_SIZE); + } + return spec; +} + +// IDCT + OLA synthesis (no synthesis window) matching decode_to_wav. +// Analysis used Hann; since Hann satisfies w[n]+w[n+H]=1 at 50% overlap, +// skipping the synthesis window gives perfect reconstruction. +static std::vector<float> ola_decode(const std::vector<float>& spec, + int num_frames) { + std::vector<float> pcm(num_frames * OLA_HOP_SIZE + OLA_OVERLAP, 0.0f); + float overlap[OLA_OVERLAP] = {}; + float tmp[DCT_SIZE]; + + for (int f = 0; f < num_frames; ++f) { + idct_512(spec.data() + f * DCT_SIZE, tmp); + for (int j = 0; j < OLA_HOP_SIZE; ++j) + pcm[f * OLA_HOP_SIZE + j] = tmp[j] + overlap[j]; + for (int j = 0; j < OLA_OVERLAP; ++j) + overlap[j] = tmp[OLA_HOP_SIZE + j]; + } + pcm.resize(num_frames * OLA_HOP_SIZE); + return pcm; +} + +static float compute_snr_db(const std::vector<float>& ref, + const std::vector<float>& out, + int skip_samples) { + const int n = (int)std::min(ref.size(), out.size()); + double sig = 0.0, noise = 0.0; + for (int i = skip_samples; i < n; ++i) { + sig += (double)ref[i] * ref[i]; + double e = ref[i] - out[i]; + noise += e * e; + } + if (noise < 1e-30) return 999.0f; + return 10.0f * (float)log10(sig / noise); +} + +int main() { + printf("Running WAV roundtrip test...\n"); + + // 1-second 440 Hz sine at 32 kHz + const int n_samples = SAMPLE_RATE; + std::vector<float> input(n_samples); + for (int i = 0; i < n_samples; ++i) + input[i] = 0.5f * sinf(2.0f * PI * 440.0f * i / SAMPLE_RATE); + + // Analyze + std::vector<float> spec = ola_analyze(input); + const int num_frames = (int)(spec.size() / DCT_SIZE); + + // Decode with IDCT-OLA (no synthesis window) + std::vector<float> output = ola_decode(spec, num_frames); + + // SNR — skip first DCT_SIZE samples (ramp-up transient) + const float snr = compute_snr_db(input, output, DCT_SIZE); + printf("Roundtrip SNR: %.1f dB (frames=%d, out_samples=%zu)\n", + snr, num_frames, output.size()); + + const float MIN_SNR_DB = 30.0f; + if (snr < MIN_SNR_DB) { + printf("FAIL: SNR %.1f dB < %.0f dB threshold\n", snr, MIN_SNR_DB); + return 1; + } + + printf("PASS\n"); + return 0; +} diff --git a/src/tests/gpu/test_spectool.cc b/src/tests/gpu/test_spectool.cc index 984322a..b90d236 100644 --- a/src/tests/gpu/test_spectool.cc +++ b/src/tests/gpu/test_spectool.cc @@ -3,6 +3,7 @@ // Generates a test WAV, analyzes it, and verifies the resulting .spec file. #include "audio/audio.h" +#include "audio/synth.h" #include <assert.h> #include <math.h> #include <stdio.h> @@ -54,7 +55,7 @@ int main() { size_t read = fread(&header, sizeof(SpecHeader), 1, f); assert(read == 1); assert(strncmp(header.magic, "SPEC", 4) == 0); - assert(header.version == 1); + assert(header.version == SPEC_VERSION_V2_OLA); assert(header.dct_size == 512); assert(header.num_frames > 0); |
