summary | refs | log | tree | commit | diff
path: root/tools/spectool.cc
diff options
context:
space:
mode:
Diffstat (limited to 'tools/spectool.cc')
-rw-r--r-- tools/spectool.cc 121
1 files changed, 107 insertions, 14 deletions
diff --git a/tools/spectool.cc b/tools/spectool.cc
index 67e9ff3..4cd98c7 100644
--- a/tools/spectool.cc
+++ b/tools/spectool.cc
@@ -26,8 +26,13 @@
// float[num_frames * dct_size] data
// struct SpecHeader { ... } -> now in audio.h
-int analyze_audio(const char* in_path, const char* out_path) {
- printf("Analyzing %s -> %s\n", in_path, out_path);
+int analyze_audio(const char* in_path, const char* out_path, bool normalize,
+ float target_rms) {
+ printf("Analyzing %s -> %s", in_path, out_path);
+ if (normalize) {
+ printf(" (normalizing to RMS=%.3f)", target_rms);
+ }
+ printf("\n");
// Use higher quality resampling for better audio quality
// Source files are typically 44.1kHz or 96kHz, 16/24-bit, mono/stereo
@@ -46,19 +51,84 @@ int analyze_audio(const char* in_path, const char* out_path) {
return 1;
}
- std::vector<float> spec_data;
+ // First pass: Load all PCM data (needed for normalization)
+ std::vector<float> pcm_data;
float pcm_chunk[DCT_SIZE];
- float window[WINDOW_SIZE];
- hamming_window_512(window);
-
ma_uint64 frames_read;
while (ma_decoder_read_pcm_frames(&decoder, pcm_chunk, DCT_SIZE,
&frames_read) == MA_SUCCESS &&
frames_read > 0) {
- if (frames_read < DCT_SIZE) {
- // Zero-pad the last chunk if it's smaller
- memset(pcm_chunk + frames_read, 0,
- (DCT_SIZE - frames_read) * sizeof(float));
+ pcm_data.insert(pcm_data.end(), pcm_chunk, pcm_chunk + frames_read);
+ }
+ ma_decoder_uninit(&decoder);
+
+ if (pcm_data.empty()) {
+ printf("Error: No audio data read from file.\n");
+ return 1;
+ }
+
+ // Calculate RMS and peak
+ float rms_sum = 0.0f;
+ float peak = 0.0f;
+ for (size_t i = 0; i < pcm_data.size(); ++i) {
+ const float abs_val = fabsf(pcm_data[i]);
+ if (abs_val > peak) {
+ peak = abs_val;
+ }
+ rms_sum += pcm_data[i] * pcm_data[i];
+ }
+ const float original_rms = sqrtf(rms_sum / pcm_data.size());
+ printf("Original: Peak=%.3f, RMS=%.3f\n", peak, original_rms);
+
+ // Normalize if requested
+ float scale_factor = 1.0f;
+ if (normalize && original_rms > 1e-6f) {
+ // Calculate scale factor to reach target RMS
+ scale_factor = target_rms / original_rms;
+
+ // Check if this would cause clipping (peak > 1.0 after synthesis)
+ // Peak amplification varies by sample (windowing + IDCT effects)
+ // Use conservative limit: input peak ≤ 1.0 to guarantee output peak ≤ 1.0
+ const float max_safe_peak = 1.0f;
+ const float predicted_peak = peak * scale_factor;
+
+ if (predicted_peak > max_safe_peak) {
+ // Reduce scale factor to prevent clipping
+ const float peak_scale = max_safe_peak / peak;
+ printf("Warning: RMS normalization would cause clipping (peak=%.3f)\n",
+ predicted_peak);
+ printf(" Reducing scale to prevent clipping.\n");
+ scale_factor = peak_scale;
+ }
+
+ printf("Normalizing: scale factor = %.3f\n", scale_factor);
+ printf(" RMS: %.3f -> %.3f\n", original_rms, original_rms * scale_factor);
+ printf(" Peak: %.3f -> %.3f\n", peak, peak * scale_factor);
+
+ for (size_t i = 0; i < pcm_data.size(); ++i) {
+ pcm_data[i] *= scale_factor;
+ }
+ }
+
+ // Second pass: Windowing + DCT
+ std::vector<float> spec_data;
+ float window[WINDOW_SIZE];
+ hamming_window_512(window);
+
+ // Process PCM data in DCT_SIZE chunks
+ const size_t num_chunks = (pcm_data.size() + DCT_SIZE - 1) / DCT_SIZE;
+ for (size_t chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
+ const size_t chunk_start = chunk_idx * DCT_SIZE;
+ const size_t chunk_end =
+ (chunk_start + DCT_SIZE < pcm_data.size()) ? chunk_start + DCT_SIZE
+ : pcm_data.size();
+ const size_t chunk_size = chunk_end - chunk_start;
+
+ // Copy chunk (with zero-padding if needed)
+ memcpy(pcm_chunk, pcm_data.data() + chunk_start,
+ chunk_size * sizeof(float));
+ if (chunk_size < DCT_SIZE) {
+ memset(pcm_chunk + chunk_size, 0, (DCT_SIZE - chunk_size) * sizeof(float));
}
// Apply window
@@ -74,8 +144,6 @@ int analyze_audio(const char* in_path, const char* out_path) {
spec_data.insert(spec_data.end(), dct_chunk, dct_chunk + DCT_SIZE);
}
- ma_decoder_uninit(&decoder);
-
// --- Trim Silent Frames ---
const float epsilon = 1e-6f;
int num_frames = spec_data.size() / DCT_SIZE;
@@ -248,7 +316,7 @@ int test_gen(const char* out_path) {
}
void print_usage() {
- printf("Usage: spectool <command> <input> [output]\n");
+ printf("Usage: spectool <command> <input> [output] [options]\n");
printf("Commands:\n");
printf(
" analyze <input.wav|.mp3> <output.spec> Analyze an audio file and "
@@ -258,6 +326,11 @@ void print_usage() {
printf(
" test_gen <output.spec> Generate a test "
"spectrogram.\n");
+ printf("\nOptions for 'analyze':\n");
+ printf(" --normalize [rms] Normalize audio to target RMS level (default: "
+ "0.15)\n");
+ printf(
+ " Ensures consistent loudness across all samples.\n");
}
int main(int argc, char** argv) {
@@ -274,7 +347,27 @@ int main(int argc, char** argv) {
print_usage();
return 1;
}
- return analyze_audio(argv[2], argv[3]);
+
+ // Parse optional flags
+ bool normalize = false;
+ float target_rms = 0.15f; // Default target RMS
+
+ for (int i = 4; i < argc; ++i) {
+ if (strcmp(argv[i], "--normalize") == 0) {
+ normalize = true;
+ // Check if next arg is a number (custom target RMS)
+ if (i + 1 < argc) {
+ char* endptr;
+ float custom_rms = strtof(argv[i + 1], &endptr);
+ if (endptr != argv[i + 1] && custom_rms > 0.0f && custom_rms < 1.0f) {
+ target_rms = custom_rms;
+ ++i; // Consume the RMS value
+ }
+ }
+ }
+ }
+
+ return analyze_audio(argv[2], argv[3], normalize, target_rms);
} else if (strcmp(command, "play") == 0) {
if (argc < 3) {
printf("Error: 'play' command requires an input file.\n");