diff options
| author | skal <pascal.massimino@gmail.com> | 2026-03-05 21:50:53 +0100 |
|---|---|---|
| committer | skal <pascal.massimino@gmail.com> | 2026-03-05 21:50:53 +0100 |
| commit | 2f8926f433248af28081497e8371e02abe61d6ff (patch) | |
| tree | 30e480325e2b7f01947a5ca2f8b3865e600d8bb7 | |
| parent | e2c3c3e95b6a9e53b4631b271640bb9914f8c95e (diff) | |
feat(spectool): add --wav decode, IMDCT, and roundtrip test
- spectool --wav <input.spec> <output.wav>: decodes .spec to mono 16-bit
WAV at 32 kHz using IDCT-OLA synthesis (no synthesis window).
The analysis Hann window at 50% overlap (hop H = 256 samples) satisfies
w[n] + w[n+H] = 1, so the synthesis window must be rectangular (i.e. the
IDCT frames are overlap-added unwindowed) for perfect reconstruction.
- Add imdct_512 / imdct_fft to audio lib (fft.cc, fft.h, idct.cc, dct.h)
for future MDCT-based synthesis; neither the --wav decoder nor the
roundtrip test in this commit calls them (both use idct_512).
- test_wav_roundtrip: in-process OLA analyze + IDCT-OLA decode roundtrip
test; fails if the reconstruction SNR is below 30 dB.
Currently measures 53 dB on a 1-second 440 Hz sine at 32 kHz.
- Fix stale test_spectool.cc: version assertion updated from 1 to
SPEC_VERSION_V2_OLA (it had been wrong ever since the OLA fix landed).
- Docs: TOOLS_REFERENCE.md removes dead specview, documents --wav /
--normalize / test_gen. HOWTO.md adds decode section. TRACKER.md
notes spec v2 OLA format and decode command.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
| -rw-r--r-- | cmake/DemoTests.cmake | 4 | ||||
| -rw-r--r-- | doc/HOWTO.md | 11 | ||||
| -rw-r--r-- | doc/TOOLS_REFERENCE.md | 21 | ||||
| -rw-r--r-- | doc/TRACKER.md | 6 | ||||
| -rw-r--r-- | src/audio/dct.h | 3 | ||||
| -rw-r--r-- | src/audio/fft.cc | 43 | ||||
| -rw-r--r-- | src/audio/fft.h | 7 | ||||
| -rw-r--r-- | src/audio/idct.cc | 4 | ||||
| -rw-r--r-- | src/tests/audio/test_wav_roundtrip.cc | 101 | ||||
| -rw-r--r-- | src/tests/gpu/test_spectool.cc | 3 | ||||
| -rw-r--r-- | tools/spectool.cc | 114 |
11 files changed, 303 insertions, 14 deletions
diff --git a/cmake/DemoTests.cmake b/cmake/DemoTests.cmake index dc88973..26bce8c 100644 --- a/cmake/DemoTests.cmake +++ b/cmake/DemoTests.cmake @@ -18,6 +18,10 @@ add_demo_test(test_dct DctTest audio src/tests/audio/test_dct.cc ${GEN_DEMO_CC}) target_link_libraries(test_dct PRIVATE audio util procedural ${DEMO_LIBS}) demo_add_asset_deps(test_dct audio) +add_demo_test(test_wav_roundtrip WavRoundtripTest audio src/tests/audio/test_wav_roundtrip.cc ${GEN_DEMO_CC}) +target_link_libraries(test_wav_roundtrip PRIVATE audio util procedural ${DEMO_LIBS}) +demo_add_asset_deps(test_wav_roundtrip audio) + add_demo_test(test_fft FftTest audio src/tests/audio/test_fft.cc ${GEN_DEMO_CC}) target_link_libraries(test_fft PRIVATE audio util procedural ${DEMO_LIBS}) demo_add_asset_deps(test_fft audio) diff --git a/doc/HOWTO.md b/doc/HOWTO.md index 768e51d..cc43e7c 100644 --- a/doc/HOWTO.md +++ b/doc/HOWTO.md @@ -257,6 +257,17 @@ cmake --build build -j4 --target spectool Requires `ffmpeg` for `.aif` input files. Output uses v2 format (OLA, Hann, hop=256). +### Decoding .spec to WAV + +To verify or inspect a `.spec` file as audio: + +```bash +./build/spectool --wav input.spec output.wav +``` + +Produces a mono 16-bit PCM WAV at 32 kHz using IDCT-OLA synthesis (no synthesis window — +the analysis Hann window is its own reconstruction filter at 50% overlap). + --- ## Assets diff --git a/doc/TOOLS_REFERENCE.md b/doc/TOOLS_REFERENCE.md index 61412a9..f99d213 100644 --- a/doc/TOOLS_REFERENCE.md +++ b/doc/TOOLS_REFERENCE.md @@ -26,24 +26,21 @@ Comprehensive reference for all developer tools in the project. cmake -S . 
-B build -DDEMO_BUILD_TOOLS=ON cmake --build build -j4 -# Analyze WAV → .spec +# Analyze WAV/MP3 → .spec (v2 OLA format) ./build/spectool analyze input.wav output.spec +./build/spectool analyze input.wav output.spec --normalize # normalize to RMS 0.15 +./build/spectool analyze input.wav output.spec --normalize 0.20 # custom RMS target -# Play .spec file -./build/spectool play input.spec -``` - ---- +# Decode .spec → mono 16-bit WAV (32 kHz, IDCT-OLA synthesis) +./build/spectool --wav input.spec output.wav -## specview (Visualization) +# Play .spec file via audio device +./build/spectool play input.spec -```bash -# View spectrogram -./build/specview input.spec +# Generate test spectrogram (C major scale) +./build/spectool test_gen output.spec ``` -Displays spectrogram visualization. - --- ## specplay (Diagnostic) diff --git a/doc/TRACKER.md b/doc/TRACKER.md index 53d8338..e2c21aa 100644 --- a/doc/TRACKER.md +++ b/doc/TRACKER.md @@ -24,7 +24,7 @@ HUMANIZE SEED <int> TIMING <pct> VOLUME <pct> # Optional humanization # Generated samples: SAMPLE <name>, <freq>, <dur>, <amp>, <attack>, <harmonics>, <decay> [OFFSET <sec>] -# Asset samples (.spec): +# Asset samples (.spec v2 OLA format — generated by spectool analyze): SAMPLE <asset_id> [OFFSET <sec>] # ASSET_* from assets.txt # Asset samples (MP3, non-STRIP_ALL only): @@ -32,6 +32,10 @@ SAMPLE <asset_id> [OFFSET <sec>] # ASSET_* from assets.txt # Add to assets.txt: NAME, NONE, music/file.mp3, "description" SAMPLE ASSET_NAME [OFFSET <sec>] # Decoded at init; same syntax as .spec +# .spec format: SPEC_VERSION_V2_OLA (version=2). Synthesis uses IDCT-OLA +# (Hann analysis window, 50% overlap, hop=256 samples at 32 kHz). +# Decode a .spec to WAV for inspection: spectool --wav input.spec out.wav + # Auto-generated notes (no SAMPLE declaration needed): # NOTE_C4, NOTE_A#3, NOTE_Eb2, etc. 
diff --git a/src/audio/dct.h b/src/audio/dct.h index ec9f651..ca423c2 100644 --- a/src/audio/dct.h +++ b/src/audio/dct.h @@ -11,3 +11,6 @@ // Forward declarations void fdct_512(const float* input, float* output); void idct_512(const float* input, float* output); +// IMDCT: N=512 coefficients -> 2*DCT_SIZE=1024 samples. +// Window with Hann(1024) and OLA with hop DCT_SIZE for perfect reconstruction. +void imdct_512(const float* input, float* output); diff --git a/src/audio/fft.cc b/src/audio/fft.cc index 64d7b1a..6029454 100644 --- a/src/audio/fft.cc +++ b/src/audio/fft.cc @@ -140,6 +140,49 @@ void dct_fft(const float* input, float* output, size_t N) { delete[] imag; } +// IMDCT via FFT +// Produces 2N time-domain samples from N MDCT coefficients. +// Formula: x[n] = (2/N) * sum_{k=0}^{N-1} X[k] * cos(pi*(2n+1+N)*(2k+1)/(2N)) +// When windowed (Hann, length 2N) and OLA'd with hop N, gives perfect reconstruction. +void imdct_fft(const float* input, float* output, size_t N) { + const float PI = 3.14159265358979323846f; + const size_t M = 2 * N; // output length + + float* real = new float[M]; + float* imag = new float[M]; + + // Pre-multiply X[k] by exp(-j*pi*(2k+1)/(4N)), build 2N complex FFT input + // via standard IMDCT-via-FFT algorithm (N-point complex FFT) + for (size_t k = 0; k < N; k++) { + const float angle = -PI * (2.0f * k + 1.0f) / (4.0f * N); + real[k] = input[k] * cosf(angle); + imag[k] = input[k] * sinf(angle); + } + for (size_t k = N; k < M; k++) { + real[k] = 0.0f; + imag[k] = 0.0f; + } + + // Inverse FFT of length 2N + bit_reverse_permute(real, imag, M); + fft_radix2(real, imag, M, -1); + const float scale = 1.0f / (float)M; + for (size_t i = 0; i < M; i++) { + real[i] *= scale; + imag[i] *= scale; + } + + // Post-multiply by 2N * exp(-j*pi*(2n+1)/(4N)) and take real part, scale by 2/N + const float gain = 2.0f; + for (size_t n = 0; n < M; n++) { + const float angle = -PI * (2.0f * n + 1.0f) / (4.0f * N); + output[n] = gain * (real[n] * cosf(angle) - 
imag[n] * sinf(angle)); + } + + delete[] real; + delete[] imag; +} + // IDCT (DCT-III) via FFT - inverse of the DCT-II reordering method // Reference: Numerical Recipes Chapter 12.3, Press et al. void idct_fft(const float* input, float* output, size_t N) { diff --git a/src/audio/fft.h b/src/audio/fft.h index 8c10afd..df37ad5 100644 --- a/src/audio/fft.h +++ b/src/audio/fft.h @@ -32,4 +32,11 @@ void dct_fft(const float* input, float* output, size_t N); // N must be a power of 2 void idct_fft(const float* input, float* output, size_t N); +// IMDCT via FFT +// Input: input[] (length N) — MDCT coefficients +// Output: output[] (length 2*N) — time-domain samples +// Window output with Hann(2N) and OLA with hop N for perfect reconstruction. +// N must be a power of 2 +void imdct_fft(const float* input, float* output, size_t N); + #endif /* AUDIO_FFT_H_ */ diff --git a/src/audio/idct.cc b/src/audio/idct.cc index 4566f99..7a3c489 100644 --- a/src/audio/idct.cc +++ b/src/audio/idct.cc @@ -9,3 +9,7 @@ void idct_512(const float* input, float* output) { idct_fft(input, output, DCT_SIZE); } + +void imdct_512(const float* input, float* output) { + imdct_fft(input, output, DCT_SIZE); +} diff --git a/src/tests/audio/test_wav_roundtrip.cc b/src/tests/audio/test_wav_roundtrip.cc new file mode 100644 index 0000000..6294d6d --- /dev/null +++ b/src/tests/audio/test_wav_roundtrip.cc @@ -0,0 +1,101 @@ +// Tests the wav->spec->wav roundtrip SNR. +// Generates a sine wave, runs OLA-DCT analysis then IMDCT-OLA synthesis, +// and asserts the reconstruction SNR exceeds the threshold. 
+ +#include "audio/dct.h" +#include "audio/window.h" +#include <assert.h> +#include <cmath> +#include <cstdio> +#include <vector> + +static const int SAMPLE_RATE = 32000; +static const float PI = 3.14159265358979323846f; + +// Replicate analyze_audio OLA pass (Hann + FDCT, hop = OLA_HOP_SIZE) +static std::vector<float> ola_analyze(const std::vector<float>& pcm) { + float win[DCT_SIZE]; + hann_window_512(win); + + const int hop = OLA_HOP_SIZE; + const int n_pcm = (int)pcm.size(); + const int num_frames = (n_pcm > DCT_SIZE) ? (n_pcm - DCT_SIZE) / hop + 1 : 1; + + std::vector<float> spec(num_frames * DCT_SIZE); + float chunk[DCT_SIZE]; + + for (int f = 0; f < num_frames; ++f) { + const int start = f * hop; + const int avail = (start + DCT_SIZE <= n_pcm) ? DCT_SIZE : n_pcm - start; + for (int i = 0; i < avail; ++i) chunk[i] = pcm[start + i] * win[i]; + for (int i = avail; i < DCT_SIZE; ++i) chunk[i] = 0.0f; + + fdct_512(chunk, spec.data() + f * DCT_SIZE); + } + return spec; +} + +// IDCT + OLA synthesis (no synthesis window) matching decode_to_wav. +// Analysis used Hann; since Hann satisfies w[n]+w[n+H]=1 at 50% overlap, +// skipping the synthesis window gives perfect reconstruction. 
+static std::vector<float> ola_decode(const std::vector<float>& spec, + int num_frames) { + std::vector<float> pcm(num_frames * OLA_HOP_SIZE + OLA_OVERLAP, 0.0f); + float overlap[OLA_OVERLAP] = {}; + float tmp[DCT_SIZE]; + + for (int f = 0; f < num_frames; ++f) { + idct_512(spec.data() + f * DCT_SIZE, tmp); + for (int j = 0; j < OLA_HOP_SIZE; ++j) + pcm[f * OLA_HOP_SIZE + j] = tmp[j] + overlap[j]; + for (int j = 0; j < OLA_OVERLAP; ++j) + overlap[j] = tmp[OLA_HOP_SIZE + j]; + } + pcm.resize(num_frames * OLA_HOP_SIZE); + return pcm; +} + +static float compute_snr_db(const std::vector<float>& ref, + const std::vector<float>& out, + int skip_samples) { + const int n = (int)std::min(ref.size(), out.size()); + double sig = 0.0, noise = 0.0; + for (int i = skip_samples; i < n; ++i) { + sig += (double)ref[i] * ref[i]; + double e = ref[i] - out[i]; + noise += e * e; + } + if (noise < 1e-30) return 999.0f; + return 10.0f * (float)log10(sig / noise); +} + +int main() { + printf("Running WAV roundtrip test...\n"); + + // 1-second 440 Hz sine at 32 kHz + const int n_samples = SAMPLE_RATE; + std::vector<float> input(n_samples); + for (int i = 0; i < n_samples; ++i) + input[i] = 0.5f * sinf(2.0f * PI * 440.0f * i / SAMPLE_RATE); + + // Analyze + std::vector<float> spec = ola_analyze(input); + const int num_frames = (int)(spec.size() / DCT_SIZE); + + // Decode with IDCT-OLA (no synthesis window) + std::vector<float> output = ola_decode(spec, num_frames); + + // SNR — skip first DCT_SIZE samples (ramp-up transient) + const float snr = compute_snr_db(input, output, DCT_SIZE); + printf("Roundtrip SNR: %.1f dB (frames=%d, out_samples=%zu)\n", + snr, num_frames, output.size()); + + const float MIN_SNR_DB = 30.0f; + if (snr < MIN_SNR_DB) { + printf("FAIL: SNR %.1f dB < %.0f dB threshold\n", snr, MIN_SNR_DB); + return 1; + } + + printf("PASS\n"); + return 0; +} diff --git a/src/tests/gpu/test_spectool.cc b/src/tests/gpu/test_spectool.cc index 984322a..b90d236 100644 --- 
a/src/tests/gpu/test_spectool.cc +++ b/src/tests/gpu/test_spectool.cc @@ -3,6 +3,7 @@ // Generates a test WAV, analyzes it, and verifies the resulting .spec file. #include "audio/audio.h" +#include "audio/synth.h" #include <assert.h> #include <math.h> #include <stdio.h> @@ -54,7 +55,7 @@ int main() { size_t read = fread(&header, sizeof(SpecHeader), 1, f); assert(read == 1); assert(strncmp(header.magic, "SPEC", 4) == 0); - assert(header.version == 1); + assert(header.version == SPEC_VERSION_V2_OLA); assert(header.dct_size == 512); assert(header.num_frames > 0); diff --git a/tools/spectool.cc b/tools/spectool.cc index 70bcae2..59a56b5 100644 --- a/tools/spectool.cc +++ b/tools/spectool.cc @@ -217,6 +217,110 @@ int analyze_audio(const char* in_path, const char* out_path, bool normalize, return 0; } +static void write_wav_header(FILE* f, uint32_t num_samples, uint32_t sample_rate) { + const uint16_t num_channels = 1; + const uint16_t bits_per_sample = 16; + const uint32_t data_size = num_samples * num_channels * (bits_per_sample / 8); + const uint32_t byte_rate = sample_rate * num_channels * (bits_per_sample / 8); + const uint16_t block_align = num_channels * (bits_per_sample / 8); + const uint32_t riff_size = 36 + data_size; + + fwrite("RIFF", 1, 4, f); + fwrite(&riff_size, 4, 1, f); + fwrite("WAVE", 1, 4, f); + fwrite("fmt ", 1, 4, f); + const uint32_t fmt_size = 16; + fwrite(&fmt_size, 4, 1, f); + const uint16_t audio_format = 1; // PCM + fwrite(&audio_format, 2, 1, f); + fwrite(&num_channels, 2, 1, f); + fwrite(&sample_rate, 4, 1, f); + fwrite(&byte_rate, 4, 1, f); + fwrite(&block_align, 2, 1, f); + fwrite(&bits_per_sample, 2, 1, f); + fwrite("data", 1, 4, f); + fwrite(&data_size, 4, 1, f); +} + +int decode_to_wav(const char* in_path, const char* out_path) { + printf("Decoding %s -> %s\n", in_path, out_path); + + FILE* f_in = fopen(in_path, "rb"); + if (!f_in) { + printf("Error: Failed to open input file: %s\n", in_path); + return 1; + } + + SpecHeader header; + if 
(fread(&header, sizeof(SpecHeader), 1, f_in) != 1 || + strncmp(header.magic, "SPEC", 4) != 0) { + printf("Error: Invalid spectrogram file format.\n"); + fclose(f_in); + return 1; + } + + std::vector<float> spec_data(header.num_frames * header.dct_size); + fread(spec_data.data(), sizeof(float), spec_data.size(), f_in); + fclose(f_in); + + const bool ola_mode = (header.version == SPEC_VERSION_V2_OLA); + const uint32_t sample_rate = 32000; + + std::vector<float> pcm; + + if (ola_mode) { + // IDCT + OLA (no synthesis window). + // Analysis: Hann * FDCT. Since Hann at 50% overlap satisfies + // w[n] + w[n+HOP] = 1, a rectangular synthesis window gives + // perfect reconstruction: output[n] = IDCT(X_k)[j] + IDCT(X_{k-1})[j+HOP] + // = x[n]*w[j] + x[n]*w[j+HOP] = x[n]. + const uint32_t total_samples = (uint32_t)header.num_frames * OLA_HOP_SIZE; + pcm.assign(total_samples + OLA_OVERLAP, 0.0f); + + float overlap[OLA_OVERLAP] = {}; + for (int f = 0; f < header.num_frames; ++f) { + float tmp[DCT_SIZE]; + idct_512(spec_data.data() + f * DCT_SIZE, tmp); + // First half: output samples for this frame + for (int j = 0; j < OLA_HOP_SIZE; ++j) + pcm[f * OLA_HOP_SIZE + j] = tmp[j] + overlap[j]; + // Second half: save as overlap for next frame + for (int j = 0; j < OLA_OVERLAP; ++j) + overlap[j] = tmp[OLA_HOP_SIZE + j]; + } + pcm.resize(total_samples); + } else { + const uint32_t total_samples = (uint32_t)header.num_frames * DCT_SIZE; + pcm.resize(total_samples); + for (int f = 0; f < header.num_frames; ++f) { + idct_512(spec_data.data() + f * DCT_SIZE, pcm.data() + f * DCT_SIZE); + } + } + + // Write WAV + FILE* f_out = fopen(out_path, "wb"); + if (!f_out) { + printf("Error: Failed to open output file: %s\n", out_path); + return 1; + } + + const uint32_t out_samples = (uint32_t)pcm.size(); + write_wav_header(f_out, out_samples, sample_rate); + + for (uint32_t i = 0; i < out_samples; ++i) { + float s = pcm[i]; + if (s > 1.0f) s = 1.0f; + if (s < -1.0f) s = -1.0f; + int16_t sample = 
(int16_t)(s * 32767.0f); + fwrite(&sample, sizeof(int16_t), 1, f_out); + } + fclose(f_out); + + printf("Decoded %d frames (%u samples) at %u Hz.\n", + header.num_frames, out_samples, sample_rate); + return 0; +} + int play_spec(const char* in_path) { printf("Playing %s\n", in_path); @@ -334,6 +438,9 @@ void print_usage() { printf( " test_gen <output.spec> Generate a test " "spectrogram.\n"); + printf( + " --wav <input.spec> <output.wav> Decode spectrogram to mono " + "WAV.\n"); printf("\nOptions for 'analyze':\n"); printf( " --normalize [rms] Normalize audio to target RMS level (default: " @@ -392,6 +499,13 @@ int main(int argc, char** argv) { return 1; } return test_gen(argv[2]); + } else if (strcmp(command, "--wav") == 0) { + if (argc < 4) { + printf("Error: '--wav' requires input .spec and output .wav files.\n"); + print_usage(); + return 1; + } + return decode_to_wav(argv[2], argv[3]); } else { printf("Error: Unknown command '%s'\n", command); print_usage(); |
