Spaces:
Sleeping
Sleeping
| """ | |
| Vocal analysis for AI music detection. | |
| Separates vocals from instruments and analyzes vocal characteristics | |
| that distinguish AI-generated singing from real human vocals. | |
| Key detection signals: | |
| - Formant consistency (AI has unnaturally smooth or irregular formants) | |
| - Pitch micro-variation (humans have 5-20 cent natural jitter) | |
| - Breath patterns (AI either omits or over-regularizes breath sounds) | |
| - Vibrato regularity (AI vibrato is mathematically perfect) | |
| """ | |
| from __future__ import annotations | |
| import io | |
| import subprocess | |
| import tempfile | |
| from dataclasses import dataclass, field | |
| from pathlib import Path | |
| from typing import Optional, Union | |
| import numpy as np | |
| import librosa | |
| from .logging_config import get_logger | |
| logger = get_logger(__name__) | |
| # ── Constants ──────────────────────────────────────────────────────────── | |
# Sample rate (Hz) used by analyze_vocals when the caller passes no ``sr``.
_TARGET_SR = 22050
# Hop size, in samples, shared by the STFT, pitch, RMS, roll-off and MFCC
# calls in this module, and used to convert frame indices to seconds.
_HOP_LENGTH = 512
# Passed as ``duration=`` to librosa.load, i.e. the number of seconds read.
_DURATION_LIMIT = 60.0
# NOTE(review): not referenced anywhere in the visible code; analyze_vocals
# decides "vocals present" with a 5% energy-ratio check instead — confirm
# whether this constant is still needed.
_MIN_VOCAL_ENERGY = 1e-5 # Threshold for "vocals present"
@dataclass
class VocalFeatures:
    """Vocal-specific analysis results.

    Plain data container returned by ``analyze_vocals``.  It is declared as
    a dataclass so that it can be built with keyword arguments and so that
    ``indicators`` gets a fresh empty list per instance when omitted.
    """

    has_vocals: bool
    vocal_confidence: float  # 0.0-1.0, how confident we are vocals exist
    vocal_ai_score: float  # 0.0-1.0, overall vocal AI likelihood

    # Sub-scores
    pitch_stability_score: float  # High = unnaturally stable = AI-like
    vibrato_regularity_score: float
    formant_consistency_score: float
    breath_pattern_score: float
    vocal_texture_score: float

    # Raw metrics
    pitch_mean_hz: float
    pitch_std_cents: float  # Standard deviation of pitch in cents
    vibrato_rate_hz: float
    vibrato_extent_cents: float
    vocal_harmonic_ratio: float
    vocal_energy_ratio: float  # vocal energy / total energy

    # Human-readable findings; defaults to a new empty list per instance.
    indicators: list[str] = field(default_factory=list)
def analyze_vocals(
    source: Union[Path, bytes, io.BytesIO],
    *,
    sr: Optional[int] = None,
) -> VocalFeatures:
    """
    Run the full vocal-analysis pipeline on one audio source.

    The audio is loaded, split into a vocal estimate and an accompaniment
    estimate, and — if the vocal estimate carries more than 5% of the total
    energy — analysed for pitch, vibrato, formants, breath gaps and texture.
    The five sub-scores are blended with fixed weights into one score.

    Args:
        source: Audio file path, bytes, or BytesIO.
        sr: Target sample rate; a falsy value selects ``_TARGET_SR``.

    Returns:
        VocalFeatures with scores and raw metrics.  When no significant
        vocals are found, every score and metric is 0.0 and
        ``vocal_confidence`` is the raw vocal energy ratio.
    """
    audio, actual_sr = _load_audio(source, sr or _TARGET_SR)
    duration_sec = len(audio) / actual_sr
    logger.info(f"Vocal analysis: {duration_sec:.1f}s audio @ {actual_sr}Hz")

    # The accompaniment estimate is returned by the separator but not used here.
    vocal_track, _accompaniment = _separate_vocals(audio, actual_sr)

    # Share of the signal energy that landed in the vocal estimate.
    vocal_power = float(np.sum(vocal_track ** 2))
    full_power = float(np.sum(audio ** 2))
    energy_share = vocal_power / (full_power + 1e-10)

    if not (energy_share > 0.05):  # At least 5% vocal energy required
        logger.info("No significant vocals detected")
        zeroed = dict.fromkeys(
            (
                "vocal_ai_score",
                "pitch_stability_score",
                "vibrato_regularity_score",
                "formant_consistency_score",
                "breath_pattern_score",
                "vocal_texture_score",
                "pitch_mean_hz",
                "pitch_std_cents",
                "vibrato_rate_hz",
                "vibrato_extent_cents",
                "vocal_harmonic_ratio",
            ),
            0.0,
        )
        return VocalFeatures(
            has_vocals=False,
            vocal_confidence=energy_share,
            vocal_energy_ratio=energy_share,
            indicators=["No significant vocal content detected in audio."],
            **zeroed,
        )

    # Raw measurements on the vocal estimate.
    pitch_data = _analyze_pitch(vocal_track, actual_sr)
    vibrato_data = _analyze_vibrato(pitch_data["f0_hz"], actual_sr)
    formant_data = _analyze_formants(vocal_track, actual_sr)
    breath_data = _analyze_breath_patterns(vocal_track, actual_sr)
    texture_data = _analyze_vocal_texture(vocal_track, actual_sr)

    # Sub-scores derived from the measurements.
    pitch_score = _score_pitch_stability(pitch_data)
    vibrato_score = _score_vibrato_regularity(vibrato_data)
    formant_score = _score_formant_consistency(formant_data)
    breath_score = _score_breath_patterns(breath_data)
    texture_score = _score_vocal_texture(texture_data)

    # Fixed-weight blend, clamped to [0.0, 0.99] and rounded to 3 decimals.
    weighted_parts = (
        (pitch_score, 0.25),
        (vibrato_score, 0.20),
        (formant_score, 0.25),
        (breath_score, 0.15),
        (texture_score, 0.15),
    )
    blended = sum(score * weight for score, weight in weighted_parts)
    vocal_ai_score = round(max(0.0, min(0.99, blended)), 3)

    indicators = _build_vocal_indicators(
        vocal_ai_score,
        pitch_score,
        vibrato_score,
        formant_score,
        breath_score,
        pitch_data,
    )

    return VocalFeatures(
        has_vocals=True,
        vocal_confidence=min(1.0, energy_share * 5),
        vocal_ai_score=vocal_ai_score,
        pitch_stability_score=round(pitch_score, 3),
        vibrato_regularity_score=round(vibrato_score, 3),
        formant_consistency_score=round(formant_score, 3),
        breath_pattern_score=round(breath_score, 3),
        vocal_texture_score=round(texture_score, 3),
        pitch_mean_hz=pitch_data["f0_mean"],
        pitch_std_cents=pitch_data["f0_std_cents"],
        vibrato_rate_hz=vibrato_data["rate_hz"],
        vibrato_extent_cents=vibrato_data["extent_cents"],
        vocal_harmonic_ratio=texture_data["vocal_harmonic_ratio"],
        vocal_energy_ratio=energy_share,
        indicators=indicators,
    )
| # ═══════════════════════════════════════════════════════════════════════ | |
| # PRIVATE — Audio loading | |
| # ═══════════════════════════════════════════════════════════════════════ | |
def _ffmpeg_decode(data: bytes) -> io.BytesIO:
    """
    Transcode arbitrary audio bytes to mono WAV by shelling out to ffmpeg.

    The input is piped to ffmpeg on stdin and the output is written to a
    temporary ``.wav`` file, which is read back into memory and always
    removed afterwards.

    Args:
        data: Encoded audio in any container/codec ffmpeg can read.

    Returns:
        BytesIO holding the WAV output (mono, ``_TARGET_SR`` Hz).

    Raises:
        RuntimeError: ffmpeg exited with a non-zero status; the message
            carries up to 200 characters of its stderr.
        subprocess.TimeoutExpired: ffmpeg ran longer than 30 seconds.
        FileNotFoundError: the ``ffmpeg`` executable is not on PATH.
    """
    # Reserve a path only; ffmpeg itself (re)creates the file via ``-y``.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp_path = tmp.name
    try:
        command = [
            "ffmpeg",
            # Keep stderr limited to real errors so the truncated message
            # below is not consumed by the version/configuration banner.
            "-hide_banner", "-loglevel", "error",
            "-y", "-i", "pipe:0",
            "-ar", str(_TARGET_SR), "-ac", "1",
            "-f", "wav", tmp_path,
        ]
        result = subprocess.run(
            command, input=data, capture_output=True, timeout=30,
        )
        if result.returncode != 0:
            # ffmpeg output is not guaranteed to be valid UTF-8; never let a
            # decoding error hide the actual failure.
            detail = result.stderr.decode("utf-8", errors="replace")[:200]
            raise RuntimeError(f"ffmpeg failed: {detail}")
        with open(tmp_path, "rb") as f:
            return io.BytesIO(f.read())
    finally:
        Path(tmp_path).unlink(missing_ok=True)
def _load_audio(
    source: Union[Path, bytes, io.BytesIO], target_sr: int
) -> tuple[np.ndarray, int]:
    """
    Decode ``source`` to a mono signal at ``target_sr``.

    In-memory input (bytes or BytesIO) is first handed to librosa directly;
    if that raises, the same bytes are transcoded with ``_ffmpeg_decode``
    and loaded again.  Any other kind of source gets a single librosa
    attempt, and its error propagates unchanged.  At most
    ``_DURATION_LIMIT`` seconds are read.

    Returns:
        ``(samples, sample_rate)`` as returned by ``librosa.load``.

    Raises:
        ValueError: fewer samples than one second's worth were decoded.
    """
    # Bytes kept aside for the ffmpeg retry; None means "no retry possible".
    payload = None
    if isinstance(source, bytes):
        payload = source
    elif isinstance(source, io.BytesIO):
        payload = source.read()
    if payload is not None:
        source = io.BytesIO(payload)

    load_options = dict(sr=target_sr, mono=True, duration=_DURATION_LIMIT)
    try:
        y, sr = librosa.load(source, **load_options)
    except Exception:
        if payload is None:
            raise
        y, sr = librosa.load(_ffmpeg_decode(payload), **load_options)

    if len(y) < sr:
        raise ValueError("Audio too short for vocal analysis (< 1s)")
    return y, sr
| # ═══════════════════════════════════════════════════════════════════════ | |
| # PRIVATE — Vocal separation | |
| # ═══════════════════════════════════════════════════════════════════════ | |
def _separate_vocals(y: np.ndarray, sr: int) -> tuple[np.ndarray, np.ndarray]:
    """
    Split ``y`` into a vocal estimate and an accompaniment estimate.

    Approach: harmonic-percussive separation, then a soft band-pass mask
    (80 Hz - 4 kHz, Gaussian-smoothed edges) applied to the harmonic STFT.
    The vocal estimate is the masked spectrum, the accompaniment estimate
    is the complementary part of the same harmonic spectrum; the percussive
    component is not included in either output.

    This is a lightweight stand-in for Demucs/Spleeter that needs no GPU
    or model download.

    Returns:
        ``(y_vocal, y_accompaniment)``, both with the same length as ``y``.
    """
    from scipy.ndimage import gaussian_filter1d

    n_fft = 2048

    # Harmonic part carries vocals plus melodic instruments.
    harmonic, _percussive = librosa.effects.hpss(y, margin=3.0)
    spectrum = librosa.stft(harmonic, n_fft=n_fft, hop_length=_HOP_LENGTH)
    bin_hz = librosa.fft_frequencies(sr=sr, n_fft=n_fft)

    # 1.0 inside the vocal band, 0.0 outside, then softened at the edges.
    in_band = (bin_hz >= 80) & (bin_hz <= 4000)
    band_mask = np.zeros_like(bin_hz)
    band_mask[in_band] = 1.0
    band_mask = gaussian_filter1d(band_mask, sigma=3)
    column_mask = band_mask[:, np.newaxis]

    y_vocal = librosa.istft(
        spectrum * column_mask, hop_length=_HOP_LENGTH, length=len(y)
    )
    y_accomp = librosa.istft(
        spectrum * (1.0 - column_mask), hop_length=_HOP_LENGTH, length=len(y)
    )
    return y_vocal, y_accomp
| # ═══════════════════════════════════════════════════════════════════════ | |
| # PRIVATE — Pitch analysis | |
| # ═══════════════════════════════════════════════════════════════════════ | |
def _analyze_pitch(y_vocal: np.ndarray, sr: int) -> dict:
    """
    Track f0 with ``librosa.pyin`` (C2-C7) and summarise the voiced frames.

    Returns a dict with:
        f0_hz: the full per-frame f0 array from pyin (unfiltered).
        f0_mean, f0_std_hz: mean / std of voiced f0 in Hz.
        f0_std_cents: std of voiced f0 expressed in cents around the mean.
        voiced_ratio: fraction of frames flagged voiced.
        pitch_jitter: mean absolute cent change between consecutive voiced
            values (unvoiced frames are dropped first, so values on either
            side of a gap count as consecutive).
        pitch_range_semitones: max/min voiced f0 in semitones.

    With fewer than 10 voiced frames every statistic is 0.0.
    """
    f0, voiced_flag, _voiced_probs = librosa.pyin(
        y_vocal,
        fmin=librosa.note_to_hz('C2'),  # ~65 Hz
        fmax=librosa.note_to_hz('C7'),  # ~2093 Hz
        sr=sr,
        hop_length=_HOP_LENGTH,
    )

    summary = {
        "f0_hz": f0,
        "f0_mean": 0.0,
        "f0_std_hz": 0.0,
        "f0_std_cents": 0.0,
        "voiced_ratio": 0.0,
        "pitch_jitter": 0.0,
        "pitch_range_semitones": 0.0,
    }

    voiced_f0 = f0[voiced_flag]
    if len(voiced_f0) < 10:
        return summary

    f0_mean = float(np.mean(voiced_f0))
    summary["f0_mean"] = f0_mean
    summary["f0_std_hz"] = float(np.std(voiced_f0))

    # Cents (1/100 semitone) relative to the mean, for a perceptual spread.
    cents_from_mean = 1200 * np.log2(voiced_f0 / f0_mean)
    summary["f0_std_cents"] = float(np.std(cents_from_mean))

    summary["voiced_ratio"] = float(np.sum(voiced_flag) / len(voiced_flag))

    # Frame-to-frame movement; at least 10 voiced values exist at this point.
    step_cents = 1200 * np.abs(np.log2(voiced_f0[1:] / voiced_f0[:-1]))
    summary["pitch_jitter"] = float(np.mean(step_cents))

    summary["pitch_range_semitones"] = float(
        12 * np.log2(np.max(voiced_f0) / np.min(voiced_f0))
    )
    return summary
| # ═══════════════════════════════════════════════════════════════════════ | |
| # PRIVATE — Vibrato analysis | |
| # ═══════════════════════════════════════════════════════════════════════ | |
def _analyze_vibrato(f0_hz: np.ndarray, sr: int) -> dict:
    """
    Estimate vibrato rate, extent and regularity from a pitch contour.

    NaN entries of ``f0_hz`` are dropped and the remaining values are
    treated as one continuous contour sampled at ``sr / _HOP_LENGTH``
    frames per second.  The contour is converted to cents around its mean,
    linearly detrended, and inspected in the 3-10 Hz band of its spectrum.

    Args:
        f0_hz: Per-frame f0 in Hz, NaN where no pitch was found.
        sr: Sample rate of the audio the contour was extracted from.

    Returns:
        dict with
            rate_hz: frequency of the strongest 3-10 Hz spectral bin.
            extent_cents: twice the std of the detrended contour.
            regularity: share of 3-10 Hz spectral power held by that bin.
        All three are 0.0 when fewer than 20 values remain, when the mean
        f0 is below 1 Hz, or when no spectral bin falls inside 3-10 Hz.
    """
    no_vibrato = {"rate_hz": 0.0, "extent_cents": 0.0, "regularity": 0.0}

    voiced = f0_hz[~np.isnan(f0_hz)]
    if len(voiced) < 20:
        return no_vibrato

    # Guard before taking the log of a ratio against the mean.
    mean_f0 = np.mean(voiced)
    if mean_f0 < 1:
        return no_vibrato

    from scipy.signal import detrend

    # Work in cents and remove slow drift so only the oscillation remains.
    cents_deviation = 1200 * np.log2(voiced / mean_f0)
    cents_detrended = detrend(cents_deviation)

    # Spectrum of the contour; bins are spaced for the frame rate.
    hop_rate = sr / _HOP_LENGTH  # frames per second
    fft = np.abs(np.fft.rfft(cents_detrended))
    freqs = np.fft.rfftfreq(len(cents_detrended), d=1.0 / hop_rate)

    # Vibrato typically sits at 4-8 Hz; search a slightly wider band.
    vibrato_range = (freqs >= 3) & (freqs <= 10)
    if not np.any(vibrato_range):
        return no_vibrato

    fft_vibrato = fft.copy()
    fft_vibrato[~vibrato_range] = 0
    peak_idx = np.argmax(fft_vibrato)

    # Extent: two standard deviations, used as a rough peak-to-peak figure.
    extent_cents = float(np.std(cents_detrended)) * 2

    # Regularity: how much of the in-band power the single peak accounts for.
    total_power = float(np.sum(fft[vibrato_range] ** 2))
    peak_power = float(fft[peak_idx] ** 2)
    regularity = peak_power / (total_power + 1e-10)

    return {
        "rate_hz": float(freqs[peak_idx]),
        "extent_cents": extent_cents,
        "regularity": float(regularity),
    }
| # ═══════════════════════════════════════════════════════════════════════ | |
| # PRIVATE — Formant analysis | |
| # ═══════════════════════════════════════════════════════════════════════ | |
def _analyze_formants(y_vocal: np.ndarray, sr: int) -> dict:
    """
    Measure how much the first two formant estimates move over time.

    Up to the first 200 frames (2048 samples, hop ``_HOP_LENGTH``) are
    Hamming-windowed and fitted with an order-16 LPC model.  The angles of
    the LPC polynomial roots are converted to Hz; the lowest three values
    between 200 and 5000 Hz are taken as formant candidates, and the first
    two are tracked.  Frames that are near-silent, that yield fewer than
    three candidates, or on which the LPC fit fails are skipped.

    Args:
        y_vocal: Vocal signal.
        sr: Sample rate of ``y_vocal``.

    Returns:
        dict with
            f1_std, f2_std: std (Hz) of the two tracks.
            formant_stability: mean coefficient of variation of the two
                tracks; lower means more stable.
        All three are 0.0 when fewer than 5 frames are available or fewer
        than 5 frames produced usable formants.
    """
    frame_length = 2048
    hop = _HOP_LENGTH
    lpc_order = 16  # order 12-16 works well for formants
    max_frames = 200
    unavailable = {"f1_std": 0.0, "f2_std": 0.0, "formant_stability": 0.0}

    n_frames = (len(y_vocal) - frame_length) // hop
    if n_frames < 5:
        return unavailable

    # Every analysed frame is exactly frame_length long, so one window serves all.
    window = np.hamming(frame_length)
    f1_track: list = []
    f2_track: list = []

    for i in range(min(n_frames, max_frames)):
        start = i * hop
        frame = y_vocal[start: start + frame_length]
        if np.max(np.abs(frame)) < 1e-6:
            continue  # effectively silent

        try:
            coeffs = librosa.lpc(frame * window, order=lpc_order)
            roots = np.roots(coeffs)
        except Exception:
            # Best effort per frame: a failed fit only drops this frame.
            continue

        # Keep one root of each conjugate pair, then map angle -> Hz.
        roots = roots[np.imag(roots) >= 0]
        freqs_hz = np.angle(roots) * (sr / (2 * np.pi))
        formants = np.sort(freqs_hz[(freqs_hz > 200) & (freqs_hz < 5000)])

        if len(formants) >= 3:
            f1_track.append(formants[0])
            f2_track.append(formants[1])

    if len(f1_track) < 5:
        return unavailable

    f1_std = np.std(f1_track)
    f2_std = np.std(f2_track)

    # Coefficient of variation per track, averaged.
    formant_stability = float(np.mean([
        f1_std / (np.mean(f1_track) + 1e-10),
        f2_std / (np.mean(f2_track) + 1e-10),
    ]))

    return {
        "f1_std": float(f1_std),
        "f2_std": float(f2_std),
        "formant_stability": formant_stability,
    }
| # ═══════════════════════════════════════════════════════════════════════ | |
| # PRIVATE — Breath pattern analysis | |
| # ═══════════════════════════════════════════════════════════════════════ | |
def _analyze_breath_patterns(y_vocal: np.ndarray, sr: int) -> dict:
    """
    Locate short quiet gaps in the vocal and describe their spacing.

    A frame is "quiet" when its RMS is below 15% of the mean RMS.  Runs of
    quiet frames lasting between 0.1 s and 1.0 s are counted as
    breath-like gaps.

    Returns:
        dict with
            breath_count: number of breath-like gaps.
            breath_regularity: coefficient of variation of the time between
                successive gap starts (lower = more evenly spaced).
            breath_density: gaps per second of audio.
        With fewer than two gaps, regularity and density are both 0.0.
    """
    envelope = librosa.feature.rms(y=y_vocal, hop_length=_HOP_LENGTH)[0]

    # Relative threshold, so the result does not depend on overall level.
    quiet_runs = _find_segments(envelope < np.mean(envelope) * 0.15)

    seconds_per_frame = _HOP_LENGTH / sr
    gaps = []
    for run in quiet_runs:
        if 0.1 <= run["duration"] * seconds_per_frame <= 1.0:
            gaps.append(run)

    gap_total = len(gaps)
    if gap_total < 2:
        return {
            "breath_count": gap_total,
            "breath_regularity": 0.0,
            "breath_density": 0.0,
        }

    # Time between the starts of consecutive gaps.
    onset_sec = [run["start"] * seconds_per_frame for run in gaps]
    spacing = np.diff(onset_sec)
    spacing_cv = float(np.std(spacing) / (np.mean(spacing) + 1e-10))

    clip_seconds = len(y_vocal) / sr
    return {
        "breath_count": gap_total,
        "breath_regularity": spacing_cv,
        "breath_density": gap_total / clip_seconds,
    }
def _find_segments(mask: np.ndarray) -> list[dict]:
    """
    Return every maximal run of True values in ``mask``.

    Each run is reported as ``{"start": first_index, "duration": length}``,
    in order of appearance.  A run that reaches the end of the array is
    included.
    """
    flags = np.asarray(mask, dtype=bool)

    # Pad with False on both sides so each run has a rising and a falling edge.
    padded = np.concatenate(([False], flags, [False])).astype(np.int8)
    edges = np.diff(padded)
    run_starts = np.flatnonzero(edges == 1)
    run_stops = np.flatnonzero(edges == -1)

    return [
        {"start": int(first), "duration": int(stop - first)}
        for first, stop in zip(run_starts, run_stops)
    ]
| # ═══════════════════════════════════════════════════════════════════════ | |
| # PRIVATE — Vocal texture | |
| # ═══════════════════════════════════════════════════════════════════════ | |
def _analyze_vocal_texture(y_vocal: np.ndarray, sr: int) -> dict:
    """
    Summarise the timbre of the vocal signal.

    Returns:
        dict with
            vocal_harmonic_ratio: energy of the HPSS harmonic part divided
                by the energy of ``y_vocal``.
            rolloff_std, rolloff_mean: std / mean of the per-frame 85%
                spectral roll-off frequency.
            mfcc_var: variance over time of each of 13 MFCCs, averaged.
    """
    # Only the harmonic half of the split is needed.
    harmonic_part, _percussive_part = librosa.effects.hpss(y_vocal)
    harmonic_energy = float(np.sum(harmonic_part ** 2))
    signal_energy = float(np.sum(y_vocal ** 2))

    # Frequency below which 85% of each frame's energy lies.
    rolloff_track = librosa.feature.spectral_rolloff(
        y=y_vocal, sr=sr, hop_length=_HOP_LENGTH, roll_percent=0.85
    )[0]

    coefficients = librosa.feature.mfcc(
        y=y_vocal, sr=sr, n_mfcc=13, hop_length=_HOP_LENGTH
    )
    per_coefficient_var = np.var(coefficients, axis=1)

    return {
        "vocal_harmonic_ratio": harmonic_energy / (signal_energy + 1e-10),
        "rolloff_std": float(np.std(rolloff_track)),
        "rolloff_mean": float(np.mean(rolloff_track)),
        "mfcc_var": float(np.mean(per_coefficient_var)),
    }
| # ═══════════════════════════════════════════════════════════════════════ | |
| # PRIVATE — Scoring functions | |
| # ═══════════════════════════════════════════════════════════════════════ | |
def _sigmoid(x: float, mid: float, steep: float) -> float:
    """
    Logistic curve that equals 0.5 at ``x == mid``.

    ``steep`` scales the distance from ``mid``; the scaled value is clamped
    to [-20, 20] before exponentiation.
    """
    exponent = steep * (x - mid)
    exponent = min(20.0, exponent)
    exponent = max(-20.0, exponent)
    return 1.0 / (1.0 + np.exp(-exponent))
def _score_pitch_stability(pitch_data: dict) -> float:
    """
    Score how unnaturally steady the pitch is (higher = more AI-like).

    Both low frame-to-frame jitter and low overall pitch spread push the
    score up; jitter carries 60% of the weight and spread 40%.  Returns a
    neutral 0.5 when less than 10% of frames are voiced.

    Reference ranges assumed by the curve midpoints:
        Human singers: jitter ~10-25 cents, std ~50-150 cents.
        AI singers: jitter ~2-8 cents, std ~10-40 cents.
    """
    if pitch_data["voiced_ratio"] < 0.1:
        return 0.5

    low_jitter = 1.0 - _sigmoid(pitch_data["pitch_jitter"], mid=12, steep=0.15)
    low_spread = 1.0 - _sigmoid(pitch_data["f0_std_cents"], mid=60, steep=0.03)
    return low_jitter * 0.6 + low_spread * 0.4
def _score_vibrato_regularity(vibrato_data: dict) -> float:
    """
    Score how mechanically periodic the vibrato is (higher = more AI-like).

    Reference ranges assumed by the curve midpoint:
        Human vibrato: regularity ~0.2-0.5
        AI vibrato: regularity ~0.6-0.9

    Returns a neutral 0.5 when the detected rate is below 1 Hz, i.e. no
    clear vibrato was found.
    """
    has_vibrato = not (vibrato_data["rate_hz"] < 1)
    if not has_vibrato:
        return 0.5
    return float(_sigmoid(vibrato_data["regularity"], mid=0.45, steep=6))
def _score_formant_consistency(formant_data: dict) -> float:
    """
    Score how frozen the formants are (higher = more AI-like).

    The input is the formant coefficient of variation (CV); a low CV maps
    to a high score.  Reference ranges assumed by the curve midpoint:
        Human: CV ~0.08-0.20
        AI: CV ~0.02-0.07

    A CV of exactly 0 is treated as "no measurement" and scores 0.5.
    """
    variation = formant_data["formant_stability"]
    if variation == 0:
        return 0.5
    return float(1.0 - _sigmoid(variation, mid=0.10, steep=15))
def _score_breath_patterns(breath_data: dict) -> float:
    """
    Score how un-humanlike the breathing gaps are (higher = more AI-like).

    No gaps at all is treated as suspicious (0.7); a single gap is
    inconclusive (0.5).  With two or more gaps, evenly spaced gaps (low
    ``breath_regularity`` CV) score high and irregular gaps score low.
    """
    fixed_scores = {0: 0.7, 1: 0.5}
    gap_count = breath_data["breath_count"]
    if gap_count in fixed_scores:
        return fixed_scores[gap_count]

    # Very regular breathing (low CV) = AI-like
    spacing_cv = breath_data["breath_regularity"]
    return float(1.0 - _sigmoid(spacing_cv, mid=0.3, steep=5))
def _score_vocal_texture(texture_data: dict) -> float:
    """
    Score how "clean" the vocal timbre is (higher = more AI-like).

    A high harmonic ratio and a low MFCC variance each contribute half of
    the score.
    """
    harmonic_part = _sigmoid(
        texture_data["vocal_harmonic_ratio"], mid=0.65, steep=6
    )
    flatness_part = 1.0 - _sigmoid(texture_data["mfcc_var"], mid=40, steep=0.04)
    return float(harmonic_part * 0.5 + flatness_part * 0.5)
| # ═══════════════════════════════════════════════════════════════════════ | |
| # PRIVATE — Indicator text generation | |
| # ═══════════════════════════════════════════════════════════════════════ | |
def _build_vocal_indicators(
    overall: float,
    pitch: float,
    vibrato: float,
    formant: float,
    breath: float,
    pitch_data: dict,
) -> list[str]:
    """
    Turn the scores into human-readable findings.

    The first entry is always a headline chosen from ``overall`` (> 0.7
    strong, > 0.5 moderate, otherwise natural).  One further sentence is
    added for each of ``pitch``, ``vibrato`` and ``formant`` above 0.7, and
    one breath sentence when ``breath`` is below 0.3 or above 0.6.
    ``pitch_data["pitch_jitter"]`` is read only for the pitch sentence.
    """
    if overall > 0.7:
        headline = "Vocal patterns show strong synthetic characteristics."
    elif overall > 0.5:
        headline = "Vocal patterns show moderate synthetic indicators."
    else:
        headline = "Vocal patterns appear consistent with natural human singing."
    findings = [headline]

    if pitch > 0.7:
        jitter_cents = pitch_data['pitch_jitter']
        findings.append(
            f"Pitch stability is unusually high "
            f"(jitter: {jitter_cents:.1f} cents). "
            f"Human singers typically show more micro-variation."
        )

    if vibrato > 0.7:
        findings.append(
            "Vibrato is mathematically regular, suggesting algorithmic generation."
        )

    if formant > 0.7:
        findings.append(
            "Formant transitions are unnaturally consistent across frames."
        )

    # At most one breath remark; mid-range scores produce none.
    if breath < 0.3:
        findings.append("Natural breath patterns detected between vocal phrases.")
    elif breath > 0.6:
        findings.append("Breath patterns are absent or overly regular.")

    return findings