diff options
Diffstat (limited to 'tools/spectool.cc')
| -rw-r--r-- | tools/spectool.cc | 121 |
1 file changed, 107 insertions, 14 deletions
diff --git a/tools/spectool.cc b/tools/spectool.cc index 67e9ff3..4cd98c7 100644 --- a/tools/spectool.cc +++ b/tools/spectool.cc @@ -26,8 +26,13 @@ // float[num_frames * dct_size] data // struct SpecHeader { ... } -> now in audio.h -int analyze_audio(const char* in_path, const char* out_path) { - printf("Analyzing %s -> %s\n", in_path, out_path); +int analyze_audio(const char* in_path, const char* out_path, bool normalize, + float target_rms) { + printf("Analyzing %s -> %s", in_path, out_path); + if (normalize) { + printf(" (normalizing to RMS=%.3f)", target_rms); + } + printf("\n"); // Use higher quality resampling for better audio quality // Source files are typically 44.1kHz or 96kHz, 16/24-bit, mono/stereo @@ -46,19 +51,84 @@ int analyze_audio(const char* in_path, const char* out_path) { return 1; } - std::vector<float> spec_data; + // First pass: Load all PCM data (needed for normalization) + std::vector<float> pcm_data; float pcm_chunk[DCT_SIZE]; - float window[WINDOW_SIZE]; - hamming_window_512(window); - ma_uint64 frames_read; while (ma_decoder_read_pcm_frames(&decoder, pcm_chunk, DCT_SIZE, &frames_read) == MA_SUCCESS && frames_read > 0) { - if (frames_read < DCT_SIZE) { - // Zero-pad the last chunk if it's smaller - memset(pcm_chunk + frames_read, 0, - (DCT_SIZE - frames_read) * sizeof(float)); + pcm_data.insert(pcm_data.end(), pcm_chunk, pcm_chunk + frames_read); + } + ma_decoder_uninit(&decoder); + + if (pcm_data.empty()) { + printf("Error: No audio data read from file.\n"); + return 1; + } + + // Calculate RMS and peak + float rms_sum = 0.0f; + float peak = 0.0f; + for (size_t i = 0; i < pcm_data.size(); ++i) { + const float abs_val = fabsf(pcm_data[i]); + if (abs_val > peak) { + peak = abs_val; + } + rms_sum += pcm_data[i] * pcm_data[i]; + } + const float original_rms = sqrtf(rms_sum / pcm_data.size()); + printf("Original: Peak=%.3f, RMS=%.3f\n", peak, original_rms); + + // Normalize if requested + float scale_factor = 1.0f; + if (normalize && 
original_rms > 1e-6f) { + // Calculate scale factor to reach target RMS + scale_factor = target_rms / original_rms; + + // Check if this would cause clipping (peak > 1.0 after synthesis) + // Peak amplification varies by sample (windowing + IDCT effects) + // Use conservative limit: input peak ≤ 1.0 to guarantee output peak ≤ 1.0 + const float max_safe_peak = 1.0f; + const float predicted_peak = peak * scale_factor; + + if (predicted_peak > max_safe_peak) { + // Reduce scale factor to prevent clipping + const float peak_scale = max_safe_peak / peak; + printf("Warning: RMS normalization would cause clipping (peak=%.3f)\n", + predicted_peak); + printf(" Reducing scale to prevent clipping.\n"); + scale_factor = peak_scale; + } + + printf("Normalizing: scale factor = %.3f\n", scale_factor); + printf(" RMS: %.3f -> %.3f\n", original_rms, original_rms * scale_factor); + printf(" Peak: %.3f -> %.3f\n", peak, peak * scale_factor); + + for (size_t i = 0; i < pcm_data.size(); ++i) { + pcm_data[i] *= scale_factor; + } + } + + // Second pass: Windowing + DCT + std::vector<float> spec_data; + float window[WINDOW_SIZE]; + hamming_window_512(window); + + // Process PCM data in DCT_SIZE chunks + const size_t num_chunks = (pcm_data.size() + DCT_SIZE - 1) / DCT_SIZE; + for (size_t chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) { + const size_t chunk_start = chunk_idx * DCT_SIZE; + const size_t chunk_end = + (chunk_start + DCT_SIZE < pcm_data.size()) ? 
chunk_start + DCT_SIZE + : pcm_data.size(); + const size_t chunk_size = chunk_end - chunk_start; + + // Copy chunk (with zero-padding if needed) + memcpy(pcm_chunk, pcm_data.data() + chunk_start, + chunk_size * sizeof(float)); + if (chunk_size < DCT_SIZE) { + memset(pcm_chunk + chunk_size, 0, (DCT_SIZE - chunk_size) * sizeof(float)); } // Apply window @@ -74,8 +144,6 @@ int analyze_audio(const char* in_path, const char* out_path) { spec_data.insert(spec_data.end(), dct_chunk, dct_chunk + DCT_SIZE); } - ma_decoder_uninit(&decoder); - // --- Trim Silent Frames --- const float epsilon = 1e-6f; int num_frames = spec_data.size() / DCT_SIZE; @@ -248,7 +316,7 @@ int test_gen(const char* out_path) { } void print_usage() { - printf("Usage: spectool <command> <input> [output]\n"); + printf("Usage: spectool <command> <input> [output] [options]\n"); printf("Commands:\n"); printf( " analyze <input.wav|.mp3> <output.spec> Analyze an audio file and " @@ -258,6 +326,11 @@ void print_usage() { printf( " test_gen <output.spec> Generate a test " "spectrogram.\n"); + printf("\nOptions for 'analyze':\n"); + printf(" --normalize [rms] Normalize audio to target RMS level (default: " + "0.15)\n"); + printf( + " Ensures consistent loudness across all samples.\n"); } int main(int argc, char** argv) { @@ -274,7 +347,27 @@ int main(int argc, char** argv) { print_usage(); return 1; } - return analyze_audio(argv[2], argv[3]); + + // Parse optional flags + bool normalize = false; + float target_rms = 0.15f; // Default target RMS + + for (int i = 4; i < argc; ++i) { + if (strcmp(argv[i], "--normalize") == 0) { + normalize = true; + // Check if next arg is a number (custom target RMS) + if (i + 1 < argc) { + char* endptr; + float custom_rms = strtof(argv[i + 1], &endptr); + if (endptr != argv[i + 1] && custom_rms > 0.0f && custom_rms < 1.0f) { + target_rms = custom_rms; + ++i; // Consume the RMS value + } + } + } + } + + return analyze_audio(argv[2], argv[3], normalize, target_rms); } else if 
(strcmp(command, "play") == 0) { if (argc < 3) { printf("Error: 'play' command requires an input file.\n"); |
