src/audio/gen.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192

// This file is part of the 64k demo project.
// It implements the procedural audio generation logic.
// Uses IDCT/FDCT to synthesize notes in the frequency domain.

#include "audio/gen.h"
#include "audio/dct.h"
#include "audio/window.h"
#include <math.h>
#include <stdlib.h>
#include <string.h>

std::vector<float> generate_note_spectrogram(const NoteParams& params,
                                             int* out_num_frames) {
  int num_frames = (int)(params.duration_sec * 32000.0f / DCT_SIZE);
  if (num_frames < 1)
    num_frames = 1;
  *out_num_frames = num_frames;

  std::vector<float> spec_data(num_frames * DCT_SIZE, 0.0f);
  float window[WINDOW_SIZE];
  hamming_window_512(window);

  float phase = 0.0f;
  float time_step = 1.0f / 32000.0f;

  for (int f = 0; f < num_frames; ++f) {
    float pcm_chunk[DCT_SIZE] = {0};
    float frame_time = f * DCT_SIZE * time_step;

    // Generate PCM for this frame
    for (int i = 0; i < DCT_SIZE; ++i) {
      float t = frame_time + i * time_step;

      // Envelope (Simple Attack)
      float env = 1.0f;
      if (t < params.attack_sec) {
        env = t / params.attack_sec;
      }

      // Vibrato
      float vib = sinf(t * params.vibrato_rate * 2.0f * 3.14159f) *
                  params.vibrato_depth;

      // Randomness
      float pitch_rnd =
          ((float)rand() / RAND_MAX - 0.5f) * params.pitch_randomness;
      float amp_rnd = ((float)rand() / RAND_MAX - 0.5f) * params.amp_randomness;

      float sample = 0.0f;
      for (int h = 1; h <= params.num_harmonics; ++h) {
        float h_amp = powf(params.harmonic_decay, h - 1);
        float freq = (params.base_freq + vib + pitch_rnd) * h;
        sample += sinf(phase * h) * h_amp;
      }

      // Update phase for fundamental (approximate, since freq changes)
      phase +=
          (params.base_freq + vib + pitch_rnd) * 2.0f * 3.14159f * time_step;

      pcm_chunk[i] = sample * params.amplitude * env * (1.0f + amp_rnd);
    }

    // Apply window
    for (int i = 0; i < DCT_SIZE; ++i) {
      pcm_chunk[i] *= window[i];
    }

    // Apply FDCT
    float dct_chunk[DCT_SIZE];
    fdct_512(pcm_chunk, dct_chunk);

    // Scale up to compensate for orthonormal normalization
    // Old non-orthonormal DCT had no sqrt scaling, so output was ~sqrt(N/2) larger
    // Scale factor: sqrt(DCT_SIZE / 2) = sqrt(256) = 16
    //
    // HOWEVER: After removing synthesis windowing (commit f998bfc), audio is louder.
    // The old synthesis incorrectly applied Hamming window to spectrum (reducing energy by 0.63x).
    // New synthesis is correct (no window), but procedural notes with 16x scaling are too loud.
    //
    // Analysis applies Hamming window (0.63x energy). With 16x scaling: 0.63 × 16 ≈ 10x.
    // Divide by 2.5 to match the relative loudness increase: 16 / 2.5 = 6.4
    const float scale_factor = sqrtf(DCT_SIZE / 2.0f) / 2.5f;

    // Copy to buffer with scaling
    for (int i = 0; i < DCT_SIZE; ++i) {
      spec_data[f * DCT_SIZE + i] = dct_chunk[i] * scale_factor;
    }
  }

  // Normalize to consistent RMS level (matching spectool --normalize behavior)
  // 1. Synthesize PCM to measure actual output levels
  std::vector<float> pcm_data(num_frames * DCT_SIZE);
  for (int f = 0; f < num_frames; ++f) {
    const float* spectral_frame = spec_data.data() + (f * DCT_SIZE);
    float* time_frame = pcm_data.data() + (f * DCT_SIZE);
    idct_512(spectral_frame, time_frame);
  }

  // 2. Calculate RMS and peak
  float rms_sum = 0.0f;
  float peak = 0.0f;
  for (size_t i = 0; i < pcm_data.size(); ++i) {
    const float abs_val = fabsf(pcm_data[i]);
    if (abs_val > peak) {
      peak = abs_val;
    }
    rms_sum += pcm_data[i] * pcm_data[i];
  }
  const float rms = sqrtf(rms_sum / pcm_data.size());

  // 3. Normalize to target RMS (0.15, matching spectool default)
  const float target_rms = 0.15f;
  const float max_safe_peak = 1.0f; // Conservative: ensure output peak ≤ 1.0

  if (rms > 1e-6f) {
    // Calculate scale factor to reach target RMS
    float norm_scale = target_rms / rms;

    // Check if this would cause clipping
    const float predicted_peak = peak * norm_scale;
    if (predicted_peak > max_safe_peak) {
      // Reduce scale to prevent clipping
      norm_scale = max_safe_peak / peak;
    }

    // Apply normalization scale to spectrogram
    for (size_t i = 0; i < spec_data.size(); ++i) {
      spec_data[i] *= norm_scale;
    }
  }

  return spec_data;
}

void paste_spectrogram(std::vector<float>& dest_data, int* dest_num_frames,
                       const std::vector<float>& src_data, int src_num_frames,
                       int frame_offset) {
  if (src_num_frames <= 0)
    return;

  int needed_frames = frame_offset + src_num_frames;
  if (needed_frames > *dest_num_frames) {
    dest_data.resize(needed_frames * DCT_SIZE, 0.0f);
    *dest_num_frames = needed_frames;
  }

  for (int f = 0; f < src_num_frames; ++f) {
    int dst_frame_idx = frame_offset + f;
    if (dst_frame_idx < 0)
      continue;

    for (int i = 0; i < DCT_SIZE; ++i) {
      dest_data[dst_frame_idx * DCT_SIZE + i] += src_data[f * DCT_SIZE + i];
    }
  }
}

void apply_spectral_noise(std::vector<float>& data, int num_frames,
                          float amount) {
  for (int f = 0; f < num_frames; ++f) {
    for (int i = 0; i < DCT_SIZE; ++i) {
      float rnd = ((float)rand() / RAND_MAX - 0.5f) * 2.0f * amount;
      data[f * DCT_SIZE + i] *= (1.0f + rnd);
    }
  }
}

void apply_spectral_lowpass(std::vector<float>& data, int num_frames,
                            float cutoff_ratio) {
  int cutoff_bin = (int)(cutoff_ratio * DCT_SIZE);
  if (cutoff_bin < 0)
    cutoff_bin = 0;
  if (cutoff_bin >= DCT_SIZE)
    return;

  for (int f = 0; f < num_frames; ++f) {
    for (int i = cutoff_bin; i < DCT_SIZE; ++i) {
      data[f * DCT_SIZE + i] = 0.0f;
    }
  }
}

void apply_spectral_comb(std::vector<float>& data, int num_frames,
                         float period_bins, float depth) {
  for (int i = 0; i < DCT_SIZE; ++i) {
    float mod =
        1.0f - depth * (0.5f + 0.5f * cosf((float)i / period_bins * 6.28318f));
    for (int f = 0; f < num_frames; ++f) {
      data[f * DCT_SIZE + i] *= mod;
    }
  }
}