Rthur2003 committed on
Commit
86b258b
·
1 Parent(s): 9afcc8e

feat: add vocal analysis module for AI music detection

Browse files
Files changed (1) hide show
  1. app/services/vocal_analyzer.py +646 -0
app/services/vocal_analyzer.py ADDED
@@ -0,0 +1,646 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Vocal analysis for AI music detection.
3
+
4
+ Separates vocals from instruments and analyzes vocal characteristics
5
+ that distinguish AI-generated singing from real human vocals.
6
+
7
+ Key detection signals:
8
+ - Formant consistency (AI has unnaturally smooth or irregular formants)
9
+ - Pitch micro-variation (humans have 5-20 cent natural jitter)
10
+ - Breath patterns (AI either omits or over-regularizes breath sounds)
11
+ - Vibrato regularity (AI vibrato is mathematically perfect)
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import io
17
+ from dataclasses import dataclass, field
18
+ from pathlib import Path
19
+ from typing import Optional, Union
20
+
21
+ import numpy as np
22
+ import librosa
23
+
24
+ from .logging_config import get_logger
25
+
26
+ logger = get_logger(__name__)
27
+
28
+ # ── Constants ────────────────────────────────────────────────────────────
29
+ _TARGET_SR = 22050
30
+ _HOP_LENGTH = 512
31
+ _DURATION_LIMIT = 120.0
32
+ _MIN_VOCAL_ENERGY = 1e-5 # Threshold for "vocals present"
33
+
34
+
35
@dataclass
class VocalFeatures:
    """Result record produced by one vocal analysis run.

    Attributes:
        has_vocals: Whether significant vocal content was found.
        vocal_confidence: 0.0-1.0, how confident we are that vocals exist.
        vocal_ai_score: 0.0-1.0, overall likelihood the vocal is AI-made.
        pitch_stability_score: Sub-score; high means unnaturally stable
            pitch, which is treated as AI-like.
        vibrato_regularity_score: Sub-score for vibrato regularity.
        formant_consistency_score: Sub-score for formant consistency.
        breath_pattern_score: Sub-score for breath patterns.
        vocal_texture_score: Sub-score for vocal texture.
        pitch_mean_hz: Raw metric, mean pitch in Hz.
        pitch_std_cents: Raw metric, standard deviation of pitch in cents.
        vibrato_rate_hz: Raw metric, vibrato rate in Hz.
        vibrato_extent_cents: Raw metric, vibrato extent in cents.
        vocal_harmonic_ratio: Raw metric, harmonic share of the vocal.
        vocal_energy_ratio: Raw metric, vocal energy / total energy.
        indicators: Human-readable findings; a fresh empty list by default.
    """

    # Presence and overall verdict
    has_vocals: bool
    vocal_confidence: float
    vocal_ai_score: float

    # Per-signal sub-scores
    pitch_stability_score: float
    vibrato_regularity_score: float
    formant_consistency_score: float
    breath_pattern_score: float
    vocal_texture_score: float

    # Underlying raw measurements
    pitch_mean_hz: float
    pitch_std_cents: float
    vibrato_rate_hz: float
    vibrato_extent_cents: float
    vocal_harmonic_ratio: float
    vocal_energy_ratio: float

    # default_factory keeps instances from sharing one list object.
    indicators: list[str] = field(default_factory=list)
59
+
60
+
61
def analyze_vocals(
    source: Union[Path, bytes, io.BytesIO],
    *,
    sr: Optional[int] = None,
) -> VocalFeatures:
    """
    Score how AI-like the vocal content of an audio source is.

    The audio is loaded, split into a rough vocal estimate plus a remainder,
    and -- when enough vocal energy is found -- run through pitch, vibrato,
    formant, breath, and texture analysis. The five sub-scores are blended
    with fixed weights into ``vocal_ai_score``.

    Args:
        source: Audio file path, raw bytes, or a BytesIO buffer.
        sr: Sample rate to load at; a falsy value selects ``_TARGET_SR``.

    Returns:
        VocalFeatures with scores, raw metrics, and indicator text. When the
        vocal estimate does not exceed 5% of the total energy, ``has_vocals``
        is False and every score and pitch/vibrato/texture metric is 0.0.

    Raises:
        ValueError: Raised by ``_load_audio`` when the clip is under 1 second.
    """
    audio, rate = _load_audio(source, sr or _TARGET_SR)
    duration_sec = len(audio) / rate

    logger.info(f"Vocal analysis: {duration_sec:.1f}s audio @ {rate}Hz")

    # Only the vocal estimate is analyzed; the accompaniment is not used.
    vocal_track, _accompaniment = _separate_vocals(audio, rate)

    # Share of total signal energy carried by the vocal estimate.
    vocal_power = float(np.sum(vocal_track ** 2))
    full_power = float(np.sum(audio ** 2))
    energy_share = vocal_power / (full_power + 1e-10)

    # Written as "not >" so a NaN share also takes the no-vocals path.
    if not energy_share > 0.05:
        logger.info("No significant vocals detected")
        zeroed = dict.fromkeys(
            (
                "vocal_ai_score",
                "pitch_stability_score",
                "vibrato_regularity_score",
                "formant_consistency_score",
                "breath_pattern_score",
                "vocal_texture_score",
                "pitch_mean_hz",
                "pitch_std_cents",
                "vibrato_rate_hz",
                "vibrato_extent_cents",
                "vocal_harmonic_ratio",
            ),
            0.0,
        )
        return VocalFeatures(
            has_vocals=False,
            vocal_confidence=energy_share,
            vocal_energy_ratio=energy_share,
            indicators=["No significant vocal content detected in audio."],
            **zeroed,
        )

    # Raw measurements taken from the vocal estimate.
    pitch_data = _analyze_pitch(vocal_track, rate)
    vibrato_data = _analyze_vibrato(pitch_data["f0_hz"], rate)
    formant_data = _analyze_formants(vocal_track, rate)
    breath_data = _analyze_breath_patterns(vocal_track, rate)
    texture_data = _analyze_vocal_texture(vocal_track, rate)

    # Each measurement set is turned into a sub-score.
    pitch_score = _score_pitch_stability(pitch_data)
    vibrato_score = _score_vibrato_regularity(vibrato_data)
    formant_score = _score_formant_consistency(formant_data)
    breath_score = _score_breath_patterns(breath_data)
    texture_score = _score_vocal_texture(texture_data)

    # Fixed-weight blend (weights sum to 1.0), clamped to [0.0, 0.99].
    blended = (
        pitch_score * 0.25
        + vibrato_score * 0.20
        + formant_score * 0.25
        + breath_score * 0.15
        + texture_score * 0.15
    )
    vocal_ai_score = round(max(0.0, min(0.99, blended)), 3)

    indicators = _build_vocal_indicators(
        vocal_ai_score,
        pitch_score,
        vibrato_score,
        formant_score,
        breath_score,
        pitch_data,
    )

    return VocalFeatures(
        has_vocals=True,
        # Energy share scaled by 5 and capped at 1.0.
        vocal_confidence=min(1.0, energy_share * 5),
        vocal_ai_score=vocal_ai_score,
        pitch_stability_score=round(pitch_score, 3),
        vibrato_regularity_score=round(vibrato_score, 3),
        formant_consistency_score=round(formant_score, 3),
        breath_pattern_score=round(breath_score, 3),
        vocal_texture_score=round(texture_score, 3),
        pitch_mean_hz=pitch_data["f0_mean"],
        pitch_std_cents=pitch_data["f0_std_cents"],
        vibrato_rate_hz=vibrato_data["rate_hz"],
        vibrato_extent_cents=vibrato_data["extent_cents"],
        vocal_harmonic_ratio=texture_data["vocal_harmonic_ratio"],
        vocal_energy_ratio=energy_share,
        indicators=indicators,
    )
170
+
171
+
172
+ # ═══════════════════════════════════════════════════════════════════════
173
+ # PRIVATE — Audio loading
174
+ # ═══════════════════════════════════════════════════════════════════════
175
+
176
def _load_audio(
    source: Union[Path, bytes, io.BytesIO], target_sr: int
) -> tuple[np.ndarray, int]:
    """
    Load audio as a mono signal at ``target_sr``.

    Raw ``bytes`` are wrapped in a BytesIO first; other sources are handed
    to ``librosa.load`` unchanged. At most ``_DURATION_LIMIT`` of audio is
    requested from the loader.

    Returns:
        ``(samples, sample_rate)`` as returned by ``librosa.load``.

    Raises:
        ValueError: If fewer samples than one second's worth were loaded.
    """
    stream = io.BytesIO(source) if isinstance(source, bytes) else source
    samples, rate = librosa.load(
        stream, sr=target_sr, mono=True, duration=_DURATION_LIMIT
    )
    if len(samples) >= rate:
        return samples, rate
    raise ValueError("Audio too short for vocal analysis (< 1s)")
185
+
186
+
187
+ # ═══════════════════════════════════════════════════════════════════════
188
+ # PRIVATE — Vocal separation
189
+ # ═══════════════════════════════════════════════════════════════════════
190
+
191
def _separate_vocals(y: np.ndarray, sr: int) -> tuple[np.ndarray, np.ndarray]:
    """
    Split a signal into a rough vocal estimate and the remainder.

    The harmonic component from HPSS is band-limited to 80 Hz-4 kHz with a
    smoothed spectral mask; the complement of that mask, applied to the same
    harmonic spectrum, forms the "accompaniment".

    This is a lightweight stand-in for Demucs/Spleeter that needs no GPU or
    model download. For production, swap in Demucs for better quality.

    Returns:
        ``(y_vocal, y_accompaniment)``, both the same length as ``y``.
    """
    from scipy.ndimage import gaussian_filter1d

    n_fft = 2048

    # Harmonic part holds vocals plus melodic instruments; the percussive
    # part is dropped.
    harmonic, _percussive = librosa.effects.hpss(y, margin=3.0)

    spectrum = librosa.stft(harmonic, n_fft=n_fft, hop_length=_HOP_LENGTH)
    bin_freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)

    # 1.0 inside the vocal band, 0.0 outside, then blurred so the band
    # edges roll off instead of cutting hard.
    in_band = (bin_freqs >= 80) & (bin_freqs <= 4000)
    band_mask = gaussian_filter1d(in_band.astype(bin_freqs.dtype), sigma=3)

    # Add a trailing axis so the per-bin mask broadcasts over all frames.
    per_bin = band_mask[:, np.newaxis]
    vocal_spec = spectrum * per_bin
    accomp_spec = spectrum * (1.0 - per_bin)

    n_samples = len(y)
    y_vocal = librosa.istft(vocal_spec, hop_length=_HOP_LENGTH, length=n_samples)
    y_accomp = librosa.istft(accomp_spec, hop_length=_HOP_LENGTH, length=n_samples)

    return y_vocal, y_accomp
224
+
225
+
226
+ # ═══════════════════════════════════════════════════════════════════════
227
+ # PRIVATE — Pitch analysis
228
+ # ═══════════════════════════════════════════════════════════════════════
229
+
230
def _analyze_pitch(y_vocal: np.ndarray, sr: int) -> dict:
    """
    Track the fundamental frequency (f0) of the vocal signal with pYIN and
    summarize it.

    Args:
        y_vocal: Vocal estimate to analyze.
        sr: Sample rate of ``y_vocal``.

    Returns:
        Dict with keys:
            f0_hz: Per-frame f0 array exactly as returned by ``librosa.pyin``.
            f0_mean: Mean f0 over voiced frames (Hz).
            f0_std_hz: Standard deviation of voiced f0 (Hz).
            f0_std_cents: Standard deviation of voiced f0 around the mean,
                in cents.
            voiced_ratio: Fraction of frames flagged as voiced.
            pitch_jitter: Mean absolute pitch step, in cents, between
                consecutive frames that are both voiced.
            pitch_range_semitones: Span between lowest and highest voiced f0.
        With fewer than 10 voiced frames every statistic is 0.0.
    """
    f0, voiced_flag, _voiced_probs = librosa.pyin(
        y_vocal,
        fmin=librosa.note_to_hz('C2'),  # ~65 Hz
        fmax=librosa.note_to_hz('C7'),  # ~2093 Hz
        sr=sr,
        hop_length=_HOP_LENGTH,
    )

    # Keep only frames the tracker marked as voiced.
    voiced_flag = np.asarray(voiced_flag, dtype=bool)
    voiced_f0 = f0[voiced_flag]

    if len(voiced_f0) < 10:
        return {
            "f0_hz": f0,
            "f0_mean": 0.0,
            "f0_std_hz": 0.0,
            "f0_std_cents": 0.0,
            "voiced_ratio": 0.0,
            "pitch_jitter": 0.0,
            "pitch_range_semitones": 0.0,
        }

    f0_mean = float(np.mean(voiced_f0))
    f0_std = float(np.std(voiced_f0))

    # Cents (1/100 of a semitone) are a perceptual scale, unlike raw Hz.
    cents = 1200 * np.log2(voiced_f0 / f0_mean)
    f0_std_cents = float(np.std(cents))

    # Jitter is a frame-to-frame measure, so only pairs of frames that are
    # adjacent in time AND both voiced are compared. Differencing the
    # concatenated voiced frames would also count the pitch jump across every
    # unvoiced gap (rests, consonants) and inflate the result.
    adjacent_voiced = voiced_flag[1:] & voiced_flag[:-1]
    if np.any(adjacent_voiced):
        later = f0[1:][adjacent_voiced]
        earlier = f0[:-1][adjacent_voiced]
        pitch_jitter = float(np.mean(1200 * np.abs(np.log2(later / earlier))))
    else:
        # Every voiced frame is isolated; no frame-to-frame step exists.
        pitch_jitter = 0.0

    # Pitch range in semitones
    pitch_range = float(12 * np.log2(np.max(voiced_f0) / np.min(voiced_f0)))

    voiced_ratio = float(np.sum(voiced_flag) / len(voiced_flag))

    return {
        "f0_hz": f0,
        "f0_mean": f0_mean,
        "f0_std_hz": f0_std,
        "f0_std_cents": f0_std_cents,
        "voiced_ratio": voiced_ratio,
        "pitch_jitter": pitch_jitter,
        "pitch_range_semitones": pitch_range,
    }
283
+
284
+
285
+ # ═══════════════════════════════════════════════════════════════════════
286
+ # PRIVATE — Vibrato analysis
287
+ # ═══════════════════════════════════════════════════════════════════════
288
+
289
def _analyze_vibrato(f0_hz: np.ndarray, sr: int) -> dict:
    """
    Estimate vibrato rate, extent, and regularity from a pitch contour.

    Args:
        f0_hz: Per-frame f0 in Hz; NaN entries are dropped before analysis.
        sr: Audio sample rate, used with ``_HOP_LENGTH`` to get the frame rate.

    Returns:
        Dict with keys:
            rate_hz: Frequency of the strongest 3-10 Hz component of the
                detrended contour.
            extent_cents: Twice the standard deviation of the detrended
                contour, in cents (~peak-to-peak).
            regularity: Share of 3-10 Hz spectral power held by that peak.
        All three are 0.0 when fewer than 20 usable frames exist, the mean
        f0 is below 1 Hz, or no FFT bin falls inside 3-10 Hz.
    """
    def _no_vibrato() -> dict:
        return {"rate_hz": 0.0, "extent_cents": 0.0, "regularity": 0.0}

    # NOTE: dropping NaN frames concatenates the voiced stretches, so gaps
    # between phrases are collapsed in the contour analyzed below.
    voiced = f0_hz[~np.isnan(f0_hz)]

    if len(voiced) < 20:
        return _no_vibrato()

    from scipy.signal import detrend

    mean_f0 = np.mean(voiced)
    if mean_f0 < 1:
        return _no_vibrato()

    # Work in cents around the mean and strip the linear trend so only the
    # oscillation remains.
    cents_deviation = 1200 * np.log2(voiced / mean_f0)
    cents_detrended = detrend(cents_deviation)

    # Magnitude spectrum of the contour, sampled at the analysis frame rate.
    hop_rate = sr / _HOP_LENGTH  # frames per second
    spectrum = np.abs(np.fft.rfft(cents_detrended))
    freqs = np.fft.rfftfreq(len(cents_detrended), d=1.0 / hop_rate)

    # Vibrato typically sits at 4-8 Hz; search a slightly wider band.
    vibrato_range = (freqs >= 3) & (freqs <= 10)
    if not np.any(vibrato_range):
        return _no_vibrato()

    # Zero everything outside the band so argmax can only pick an in-band bin.
    in_band_spectrum = np.where(vibrato_range, spectrum, 0)
    peak_idx = int(np.argmax(in_band_spectrum))
    vibrato_rate = float(freqs[peak_idx])

    # Extent: 2 * std of the oscillation, ~peak-to-peak.
    extent_cents = float(np.std(cents_detrended)) * 2

    # Regularity: how much of the in-band power sits in the single peak bin.
    total_power = float(np.sum(spectrum[vibrato_range] ** 2))
    peak_power = float(spectrum[peak_idx] ** 2)
    regularity = peak_power / (total_power + 1e-10)

    return {
        "rate_hz": vibrato_rate,
        "extent_cents": extent_cents,
        "regularity": float(regularity),
    }
342
+
343
+
344
+ # ═══════════════════════════════════════════════════════════════════════
345
+ # PRIVATE — Formant analysis
346
+ # ═══════════════════════════════════════════════════════════════════════
347
+
348
def _analyze_formants(y_vocal: np.ndarray, sr: int) -> dict:
    """
    Measure how much the first two formants move over time.

    Each analyzed frame is windowed, fitted with a 16th-order LPC model, and
    the angles of the LPC polynomial roots are converted to frequencies. The
    three lowest candidates between 200 Hz and 5 kHz are taken as formants;
    frames yielding fewer than three are ignored.

    Args:
        y_vocal: Vocal estimate to analyze.
        sr: Sample rate of ``y_vocal``.

    Returns:
        Dict with keys:
            f1_std: Standard deviation of the first formant track (Hz).
            f2_std: Standard deviation of the second formant track (Hz).
            formant_stability: Mean coefficient of variation of the two
                tracks; lower means more stable, potentially more AI-like.
        All values are 0.0 when fewer than 5 frames are available or fewer
        than 5 frames produced usable formants.
    """
    frame_length = 2048
    hop = _HOP_LENGTH

    def _unavailable() -> dict:
        return {"f1_std": 0.0, "f2_std": 0.0, "formant_stability": 0.0}

    n_frames = (len(y_vocal) - frame_length) // hop
    if n_frames < 5:
        return _unavailable()

    # Loop invariants: every frame has exactly frame_length samples.
    window = np.hamming(frame_length)
    lpc_order = min(16, frame_length - 1)  # order 12-16 suits formants

    f1_track: list = []
    f2_track: list = []
    failed_frames = 0

    # Only the first 200 frames are analyzed to bound the cost.
    for i in range(min(n_frames, 200)):
        start = i * hop
        frame = y_vocal[start: start + frame_length]

        # Skip effectively silent frames.
        if np.max(np.abs(frame)) < 1e-6:
            continue

        try:
            coeffs = librosa.lpc(frame * window, order=lpc_order)
            roots = np.roots(coeffs)
        except Exception:
            # Best-effort: a frame whose LPC fit fails is skipped, but the
            # failure is counted so it can be reported instead of vanishing.
            failed_frames += 1
            continue

        # Roots come in conjugate pairs; keep the non-negative-frequency half.
        roots = roots[np.imag(roots) >= 0]
        freqs_hz = np.angle(roots) * (sr / (2 * np.pi))

        # Keep candidates in a plausible formant range, lowest first.
        formants = sorted(f for f in freqs_hz if 200 < f < 5000)

        if len(formants) >= 3:
            f1_track.append(formants[0])
            f2_track.append(formants[1])

    if failed_frames:
        logger.debug(
            "Formant analysis skipped %d frame(s) after LPC failures",
            failed_frames,
        )

    if len(f1_track) < 5:
        return _unavailable()

    f1_std = float(np.std(f1_track))
    f2_std = float(np.std(f2_track))

    # Formant stability: lower = more stable = potentially more AI-like
    formant_stability = float(np.mean([
        np.std(f1_track) / (np.mean(f1_track) + 1e-10),
        np.std(f2_track) / (np.mean(f2_track) + 1e-10),
    ]))

    return {
        "f1_std": f1_std,
        "f2_std": f2_std,
        "formant_stability": formant_stability,
    }
413
+
414
+
415
+ # ═══════════════════════════════════════════════════════════════════════
416
+ # PRIVATE — Breath pattern analysis
417
+ # ═══════════════════════════════════════════════════════════════════════
418
+
419
def _analyze_breath_patterns(y_vocal: np.ndarray, sr: int) -> dict:
    """
    Locate breath-length quiet gaps in the vocal and describe their timing.

    Human singers leave irregular breath gaps between phrases, while AI
    vocals tend to omit them or space them unnaturally evenly.

    Returns:
        Dict with keys:
            breath_count: Number of quiet gaps lasting 0.1-1.0 s.
            breath_regularity: Coefficient of variation of the intervals
                between gap starts.
            breath_density: Gaps per second of audio.
        With fewer than two gaps, regularity and density are 0.0.
    """
    seconds_per_frame = _HOP_LENGTH / sr

    # Frames whose RMS falls below 15% of the mean RMS count as quiet.
    envelope = librosa.feature.rms(y=y_vocal, hop_length=_HOP_LENGTH)[0]
    quiet_cutoff = np.mean(envelope) * 0.15
    quiet_runs = _find_segments(envelope < quiet_cutoff)

    # Keep runs of breath-like length and note when each one begins.
    gap_starts = []
    for run in quiet_runs:
        if 0.1 <= run["duration"] * seconds_per_frame <= 1.0:
            gap_starts.append(run["start"] * seconds_per_frame)

    breath_count = len(gap_starts)
    if breath_count < 2:
        return {
            "breath_count": breath_count,
            "breath_regularity": 0.0,
            "breath_density": 0.0,
        }

    # Spread of the inter-breath intervals relative to their mean.
    intervals = np.diff(gap_starts)
    regularity = float(np.std(intervals) / (np.mean(intervals) + 1e-10))

    total_seconds = len(y_vocal) / sr

    return {
        "breath_count": breath_count,
        "breath_regularity": regularity,
        "breath_density": breath_count / total_seconds,
    }
465
+
466
+
467
+ def _find_segments(mask: np.ndarray) -> list[dict]:
468
+ """Find contiguous True segments in a boolean array."""
469
+ segments = []
470
+ in_segment = False
471
+ start = 0
472
+
473
+ for i, val in enumerate(mask):
474
+ if val and not in_segment:
475
+ start = i
476
+ in_segment = True
477
+ elif not val and in_segment:
478
+ segments.append({"start": start, "duration": i - start})
479
+ in_segment = False
480
+
481
+ if in_segment:
482
+ segments.append({"start": start, "duration": len(mask) - start})
483
+
484
+ return segments
485
+
486
+
487
+ # ═══════════════════════════════════════════════════════════════════════
488
+ # PRIVATE — Vocal texture
489
+ # ═══════════════════════════════════════════════════════════════════════
490
+
491
def _analyze_vocal_texture(y_vocal: np.ndarray, sr: int) -> dict:
    """
    Summarize harmonic content and spectral variability of the vocal.

    Returns:
        Dict with keys:
            vocal_harmonic_ratio: Energy of the HPSS harmonic part divided
                by the energy of ``y_vocal``.
            rolloff_std / rolloff_mean: Spread and mean of the 85% spectral
                roll-off over frames.
            mfcc_var: Per-coefficient variance over frames, averaged across
                13 MFCCs.
    """
    # Harmonic energy share; the percussive part is not needed.
    harmonic_part, _percussive_part = librosa.effects.hpss(y_vocal)
    harmonic_energy = float(np.sum(harmonic_part ** 2))
    vocal_energy = float(np.sum(y_vocal ** 2))

    # Per-frame frequency below which 85% of the spectral energy lies.
    rolloff_track = librosa.feature.spectral_rolloff(
        y=y_vocal, sr=sr, hop_length=_HOP_LENGTH, roll_percent=0.85
    )[0]

    mfcc_matrix = librosa.feature.mfcc(
        y=y_vocal, sr=sr, n_mfcc=13, hop_length=_HOP_LENGTH
    )

    return {
        "vocal_harmonic_ratio": harmonic_energy / (vocal_energy + 1e-10),
        "rolloff_std": float(np.std(rolloff_track)),
        "rolloff_mean": float(np.mean(rolloff_track)),
        "mfcc_var": float(np.mean(np.var(mfcc_matrix, axis=1))),
    }
515
+
516
+
517
+ # ══════════════════════════════════════���════════════════════════════════
518
+ # PRIVATE — Scoring functions
519
+ # ═══════════════════════════════════════════════════════════════════════
520
+
521
+ def _sigmoid(x: float, mid: float, steep: float) -> float:
522
+ z = steep * (x - mid)
523
+ z = max(-20.0, min(20.0, z))
524
+ return 1.0 / (1.0 + np.exp(-z))
525
+
526
+
527
def _score_pitch_stability(pitch_data: dict) -> float:
    """
    Score how unnaturally steady the pitch is (higher = more AI-like).

    Reference ranges assumed by the curve midpoints:
        Human singers: jitter ~10-25 cents, std ~50-150 cents.
        AI singers:    jitter ~2-8 cents,  std ~10-40 cents.

    Returns a neutral 0.5 when under 10% of frames are voiced.
    """
    if pitch_data["voiced_ratio"] < 0.1:
        return 0.5

    # Both curves are inverted: the smaller the variation, the higher the score.
    low_jitter = 1.0 - _sigmoid(pitch_data["pitch_jitter"], mid=12, steep=0.15)
    low_spread = 1.0 - _sigmoid(pitch_data["f0_std_cents"], mid=60, steep=0.03)

    # Jitter carries 60% of the weight, overall spread 40%.
    return low_jitter * 0.6 + low_spread * 0.4
540
+
541
+
542
def _score_vibrato_regularity(vibrato_data: dict) -> float:
    """
    Score how mechanically regular the vibrato is (higher = more AI-like).

    Reference ranges assumed by the curve midpoint:
        Human vibrato: regularity ~0.2-0.5
        AI vibrato:    regularity ~0.6-0.9

    Returns a neutral 0.5 when the detected rate is below 1 Hz, i.e. no
    clear vibrato was found.
    """
    has_vibrato = not vibrato_data["rate_hz"] < 1
    if has_vibrato:
        return float(_sigmoid(vibrato_data["regularity"], mid=0.45, steep=6))
    return 0.5
553
+
554
+
555
def _score_formant_consistency(formant_data: dict) -> float:
    """
    Score how unnaturally steady the formants are (higher = more AI-like).

    ``formant_stability`` is a coefficient of variation (CV):
        Human: CV ~0.08-0.20
        AI:    CV ~0.02-0.07

    A CV of exactly 0 yields a neutral 0.5.
    """
    cv = formant_data["formant_stability"]
    if cv == 0:
        return 0.5

    # Inverted curve: a low CV (stable formants) gives a high score.
    return float(1.0 - _sigmoid(cv, mid=0.10, steep=15))
565
+
566
+
567
def _score_breath_patterns(breath_data: dict) -> float:
    """
    Score how artificial the breathing looks (higher = more AI-like).

    AI vocals tend to contain either no breaths or very evenly spaced ones.
    Zero breaths scores 0.7 (suspicious), a single breath scores a neutral
    0.5, and otherwise the score rises as ``breath_regularity`` falls.
    """
    # Counts too small to measure spacing get a fixed score.
    fixed_scores = {0: 0.7, 1: 0.5}
    count = breath_data["breath_count"]
    if count in fixed_scores:
        return fixed_scores[count]

    # Inverted curve: a low CV of breath spacing gives a high score.
    spacing_cv = breath_data["breath_regularity"]
    return float(1.0 - _sigmoid(spacing_cv, mid=0.3, steep=5))
581
+
582
+
583
def _score_vocal_texture(texture_data: dict) -> float:
    """
    Score how "too clean" the vocal texture is (higher = more AI-like).

    A high harmonic ratio and a low MFCC variance both push the score up;
    the two components are weighted equally.
    """
    harmonic_component = _sigmoid(
        texture_data["vocal_harmonic_ratio"], mid=0.65, steep=6
    )
    # Inverted curve: low MFCC variance gives a high component score.
    flatness_component = 1.0 - _sigmoid(
        texture_data["mfcc_var"], mid=40, steep=0.04
    )
    return float(harmonic_component * 0.5 + flatness_component * 0.5)
590
+
591
+
592
+ # ═══════════════════════════════════════════════════════════════════════
593
+ # PRIVATE — Indicator text generation
594
+ # ═══════════════════════════════════════════════════════════════════════
595
+
596
+ def _build_vocal_indicators(
597
+ overall: float,
598
+ pitch: float,
599
+ vibrato: float,
600
+ formant: float,
601
+ breath: float,
602
+ pitch_data: dict,
603
+ ) -> list[str]:
604
+ """Generate human-readable vocal analysis indicators."""
605
+ indicators = []
606
+
607
+ if overall > 0.7:
608
+ indicators.append(
609
+ "Vocal patterns show strong synthetic characteristics."
610
+ )
611
+ elif overall > 0.5:
612
+ indicators.append(
613
+ "Vocal patterns show moderate synthetic indicators."
614
+ )
615
+ else:
616
+ indicators.append(
617
+ "Vocal patterns appear consistent with natural human singing."
618
+ )
619
+
620
+ if pitch > 0.7:
621
+ indicators.append(
622
+ f"Pitch stability is unusually high "
623
+ f"(jitter: {pitch_data['pitch_jitter']:.1f} cents). "
624
+ f"Human singers typically show more micro-variation."
625
+ )
626
+
627
+ if vibrato > 0.7:
628
+ indicators.append(
629
+ "Vibrato is mathematically regular, suggesting algorithmic generation."
630
+ )
631
+
632
+ if formant > 0.7:
633
+ indicators.append(
634
+ "Formant transitions are unnaturally consistent across frames."
635
+ )
636
+
637
+ if breath < 0.3:
638
+ indicators.append(
639
+ "Natural breath patterns detected between vocal phrases."
640
+ )
641
+ elif breath > 0.6:
642
+ indicators.append(
643
+ "Breath patterns are absent or overly regular."
644
+ )
645
+
646
+ return indicators