Spaces:
Sleeping
Sleeping
| """ | |
| Meta-classifier service for AURIS score fusion. | |
| Replaces the hand-tuned weighted-average fusion with | |
| a trained stacking ensemble that combines signals from | |
| all analysis towers. | |
| Towers: | |
| 1. wav2vec2 fine-tuned (logit + hidden stats) | |
| 2. Librosa features (spectral, temporal, harmonic) | |
| 3. Vocal analysis (pitch, vibrato, formant, breath) | |
| 4. CLAP embeddings (when available) | |
| 5. FST external (when available) | |
| The meta-classifier was trained on the same dataset | |
| with cross-validated tower outputs. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import pickle | |
| from dataclasses import dataclass, field | |
| from pathlib import Path | |
| from typing import List, Optional | |
| import numpy as np | |
| from .feature_extractor import AudioFeatures | |
| from .vocal_analyzer import VocalFeatures | |
| from .wav2vec2_detector import Wav2Vec2Result | |
| from .clap_detector import CLAPResult | |
| from .fst_client import FSTResult | |
| from .logging_config import get_logger | |
# Module-level logger, obtained from the project's logging helper.
logger = get_logger(__name__)

# Directory holding the serialized model artifacts: the "models" folder
# under the third ancestor of this file (parents[0] is this file's own
# directory, so parents[2] is two levels above it).
_MODELS_DIR = Path(__file__).resolve().parents[2] / "models"
@dataclass
class MetaResult:
    """Final detection result from the meta-classifier.

    The class must be a dataclass: it is constructed with keyword
    arguments by the service, and its mutable defaults are declared with
    ``field(default_factory=...)`` so every instance gets its own
    container.

    Attributes:
        is_ai_generated: Final binary decision.
        confidence: Confidence in that decision (not the raw AI
            probability).
        model_version: Identifier of the model/fusion that produced the
            result.
        decision_source: Which code path made the decision.
        analysis_mode: Mode label reported to callers.
        tower_scores: Per-tower scores, exposed for transparency.
        indicators: Human-readable explanation strings.
        top_features: Feature-importance records for this prediction.
    """

    is_ai_generated: bool
    confidence: float
    model_version: str = "auris-v2-meta"
    decision_source: str = "auris_meta"
    analysis_mode: str = "production"

    # Per-tower scores for transparency
    tower_scores: dict = field(default_factory=dict)

    # Explainable indicators (SHAP-based when available)
    indicators: List[str] = field(default_factory=list)

    # Feature importances for this prediction
    top_features: List[dict] = field(default_factory=list)
class MetaClassifierService:
    """
    Trained stacking meta-classifier.

    Combines tower outputs into a single feature vector, runs the
    trained classifier, and generates explainable indicators.

    Falls back to weighted averaging of the available tower scores when
    the trained model cannot be loaded (e.g. during development before
    the first training run).
    """

    # Keys of the vocal-analysis features fed to the trained model. Each key
    # is also the name of the attribute read from the vocals object.
    _VOCAL_FEATURE_KEYS = (
        "has_vocals",
        "vocal_confidence",
        "vocal_ai_score",
        "pitch_stability_score",
        "vibrato_regularity_score",
        "formant_consistency_score",
        "breath_pattern_score",
        "vocal_texture_score",
        "pitch_mean_hz",
        "pitch_std_cents",
        "vibrato_rate_hz",
        "vibrato_extent_cents",
        "vocal_harmonic_ratio",
        "vocal_energy_ratio",
    )

    def __init__(self) -> None:
        """Initialize the service with empty, not-yet-loaded state."""
        self._model = None
        self._scaler = None
        self._feature_cols: list[str] = []
        # _initialized: a load attempt has been made (successful or not).
        # _trained: that attempt succeeded and the model is usable.
        self._initialized = False
        self._trained = False

    def _ensure_loaded(self) -> bool:
        """Load the trained meta-classifier artifacts once.

        Only the first call touches the disk; later calls return the
        cached outcome, so a failed load is not retried.

        Returns:
            True when model, scaler and column list were all loaded,
            False when the fallback fusion must be used.
        """
        if self._initialized:
            return self._trained
        self._initialized = True

        model_path = _MODELS_DIR / "auris_classifier_v1.pkl"
        scaler_path = _MODELS_DIR / "feature_scaler_v1.pkl"
        columns_path = _MODELS_DIR / "feature_columns_v1.json"

        if not model_path.exists():
            logger.info(
                "Meta-classifier not found. "
                "Using fallback fusion."
            )
            return False

        try:
            # SECURITY: pickle.load executes arbitrary code from the file.
            # These artifacts must only ever come from a trusted build.
            with open(model_path, "rb") as f:
                model = pickle.load(f)
            with open(scaler_path, "rb") as f:
                scaler = pickle.load(f)
            with open(columns_path, "r", encoding="utf-8") as f:
                feature_cols = json.load(f)
        except Exception as e:
            # Deliberate best-effort: any load problem means "use fallback".
            logger.error(f"Failed to load meta-classifier: {e}")
            return False

        # Commit state only after every artifact loaded, so a partial
        # failure never leaves a model without its scaler/columns.
        self._model = model
        self._scaler = scaler
        self._feature_cols = feature_cols
        self._trained = True
        logger.info(
            f"Meta-classifier loaded: "
            f"{type(self._model).__name__}, "
            f"{len(self._feature_cols)} features"
        )
        return True

    def predict(
        self,
        features: AudioFeatures,
        vocals: Optional[VocalFeatures] = None,
        wav2vec2: Optional[Wav2Vec2Result] = None,
        clap: Optional[CLAPResult] = None,
        fst: Optional[FSTResult] = None,
    ) -> MetaResult:
        """
        Run the meta-classifier on all tower outputs.

        Args:
            features: Librosa-extracted audio features.
            vocals: Vocal analysis results.
            wav2vec2: wav2vec2 tower result.
            clap: CLAP embedding result.
            fst: FST external API result.

        Returns:
            MetaResult with final prediction and explanations.
        """
        is_trained = self._ensure_loaded()

        # Collect per-tower scores for transparency
        tower_scores = {}
        if wav2vec2 and wav2vec2.available:
            tower_scores["wav2vec2"] = wav2vec2.p_ai
        if clap and clap.available:
            tower_scores["clap"] = clap.confidence
        if fst and fst.available:
            tower_scores["fst"] = fst.confidence

        # Local feature score (heuristic, used as fallback signal)
        local_score = (
            features.spectral_regularity * 0.35
            + features.temporal_patterns * 0.35
            + features.harmonic_structure * 0.30
        )
        tower_scores["local_features"] = round(local_score, 4)

        if vocals and vocals.has_vocals:
            tower_scores["vocals"] = vocals.vocal_ai_score

        if is_trained:
            return self._predict_trained(
                features, vocals, wav2vec2, clap, fst,
                tower_scores,
            )
        return self._predict_fallback(
            features, vocals, wav2vec2, clap, fst,
            tower_scores,
        )

    def _predict_trained(
        self,
        features: AudioFeatures,
        vocals: Optional[VocalFeatures],
        wav2vec2: Optional[Wav2Vec2Result],
        clap: Optional[CLAPResult],
        fst: Optional[FSTResult],
        tower_scores: dict,
    ) -> MetaResult:
        """Predict with the trained meta-classifier.

        Args:
            features: Librosa-extracted audio features.
            vocals: Vocal analysis results, if any.
            wav2vec2: wav2vec2 tower result (not used in the feature vector).
            clap: CLAP result (not used in the feature vector).
            fst: FST result; used only to temper confidence on disagreement.
            tower_scores: Per-tower scores; ``"fst"`` is (re)written here.

        Returns:
            MetaResult tagged ``auris-v2-trained``.
        """
        # Build feature vector matching training columns
        feat_dict = self._build_feature_dict(
            features, vocals,
        )

        # Assemble in training column order; columns missing from the
        # dict default to 0.0.
        # NOTE(review): wav2vec2/CLAP outputs are never added to feat_dict,
        # so any such column in the trained list is always 0.0 - confirm.
        x = np.array([
            feat_dict.get(col, 0.0)
            for col in self._feature_cols
        ], dtype=np.float32).reshape(1, -1)
        x = np.nan_to_num(x, nan=0.0, posinf=1.0, neginf=-1.0)
        x_scaled = self._scaler.transform(x)

        # Predict. NOTE(review): index 1 is taken as the AI class; this
        # assumes the model's class order is [human, ai] - confirm.
        proba = self._model.predict_proba(x_scaled)[0]
        p_ai = float(proba[1])

        # FST calibration (not in trained model)
        if fst and fst.available:
            tower_scores["fst"] = fst.confidence
            if (p_ai > 0.5) != fst.is_ai:
                # Disagreement - pull the probability 15% toward 0.5
                p_ai = p_ai * 0.85 + 0.5 * 0.15

        is_ai = p_ai > 0.5
        # Confidence is expressed for the chosen label, not as raw p(AI).
        confidence = round(p_ai if is_ai else 1.0 - p_ai, 4)

        # Generate indicators
        indicators = self._build_indicators(
            is_ai, confidence, features, vocals,
            tower_scores,
        )

        # Feature importances for this prediction
        top_features = self._get_top_features(x_scaled[0])

        return MetaResult(
            is_ai_generated=is_ai,
            confidence=confidence,
            model_version="auris-v2-trained",
            decision_source="auris_meta",
            analysis_mode="production",
            tower_scores=tower_scores,
            indicators=indicators,
            top_features=top_features,
        )

    @staticmethod
    def _fallback_confidence(p_ai: float, is_ai: bool) -> float:
        """Return the clamped confidence in the fallback decision.

        The confidence refers to the chosen label: ``p_ai`` for an AI
        verdict and ``1 - p_ai`` for a human verdict. It is clamped to
        [0.51, 0.97] and rounded to four decimals.
        """
        decided = p_ai if is_ai else 1.0 - p_ai
        return round(max(0.51, min(0.97, decided)), 4)

    def _predict_fallback(
        self,
        features: AudioFeatures,
        vocals: Optional[VocalFeatures],
        wav2vec2: Optional[Wav2Vec2Result],
        clap: Optional[CLAPResult],
        fst: Optional[FSTResult],
        tower_scores: dict,
    ) -> MetaResult:
        """
        Fallback when the trained model is not available.

        Uses a weighted average of the available tower scores, with the
        weights renormalized over whichever towers are present.

        Returns:
            MetaResult tagged ``auris-v1-heuristic`` / ``auris_fallback``.
        """
        has_w2v = bool(wav2vec2 and wav2vec2.available)
        scores = []
        weights = []

        # wav2vec2 gets highest weight if available
        if has_w2v:
            scores.append(wav2vec2.p_ai)
            weights.append(0.40)

        # Local features are always present, so total weight is never zero;
        # they carry more weight when wav2vec2 is missing.
        scores.append(tower_scores.get("local_features", 0.5))
        weights.append(0.25 if has_w2v else 0.45)

        # Vocals
        if vocals and vocals.has_vocals:
            scores.append(vocals.vocal_ai_score)
            weights.append(0.15)

        # CLAP
        if clap and clap.available:
            scores.append(clap.confidence)
            weights.append(0.10)

        # FST
        # NOTE(review): fst.confidence is averaged as if it were p(AI) -
        # confirm it is not "confidence in fst.is_ai".
        if fst and fst.available:
            scores.append(fst.confidence)
            weights.append(0.20)

        # Weighted average
        total_w = sum(weights)
        p_ai = sum(
            s * (w / total_w) for s, w in zip(scores, weights)
        )

        is_ai = p_ai > 0.5
        # Report confidence in the decision, so a clearly human track is
        # not reported at the 0.51 floor.
        confidence = self._fallback_confidence(p_ai, is_ai)

        indicators = self._build_indicators(
            is_ai, confidence, features, vocals,
            tower_scores,
        )

        return MetaResult(
            is_ai_generated=is_ai,
            confidence=confidence,
            model_version="auris-v1-heuristic",
            decision_source="auris_fallback",
            analysis_mode="production",
            tower_scores=tower_scores,
            indicators=indicators,
        )

    def _build_feature_dict(
        self,
        features: AudioFeatures,
        vocals: Optional[VocalFeatures],
    ) -> dict:
        """Build the flat name -> value feature dict for the meta-classifier.

        Args:
            features: Librosa-extracted audio features.
            vocals: Vocal analysis results; when falsy, every vocal
                feature is filled with 0.0.

        Returns:
            Dict with the 15 audio features and the 14 vocal features.
        """
        d = {
            "duration_sec": features.duration_sec,
            "sample_rate": features.sample_rate,
            "rms_energy": features.rms_energy,
            "tempo_bpm": features.tempo_bpm,
            "tempo_stability": features.tempo_stability,
            "spectral_centroid_mean": features.spectral_centroid_mean,
            "spectral_centroid_std": features.spectral_centroid_std,
            "spectral_flatness_mean": features.spectral_flatness_mean,
            "mfcc_variance": features.mfcc_variance,
            "chroma_entropy": features.chroma_entropy,
            "harmonic_ratio": features.harmonic_ratio,
            "zero_crossing_rate": features.zero_crossing_rate,
            "spectral_regularity": features.spectral_regularity,
            "temporal_patterns": features.temporal_patterns,
            "harmonic_structure": features.harmonic_structure,
        }

        if not vocals:
            d.update(dict.fromkeys(self._VOCAL_FEATURE_KEYS, 0.0))
            return d

        for key in self._VOCAL_FEATURE_KEYS:
            if key == "has_vocals":
                # The flag is encoded numerically for the model.
                d[key] = 1.0 if vocals.has_vocals else 0.0
            else:
                d[key] = getattr(vocals, key)
        return d

    def _get_top_features(
        self, x: np.ndarray, top_n: int = 5
    ) -> list[dict]:
        """
        Get the top contributing features for this prediction.

        Uses the model's global ``feature_importances_`` (tree models);
        the ranking is therefore the same for every sample and only the
        reported ``value`` is sample-specific.
        In future: SHAP values for per-sample explanation.

        Args:
            x: 1-D scaled feature vector in training column order.
            top_n: Maximum number of features to report.

        Returns:
            List of ``{"feature", "importance", "value"}`` dicts, most
            important first; empty when the model exposes no importances.
        """
        if not hasattr(self._model, "feature_importances_"):
            return []

        importances = self._model.feature_importances_
        indices = np.argsort(importances)[::-1][:top_n]

        result = []
        for idx in indices:
            col_name = (
                self._feature_cols[idx]
                if idx < len(self._feature_cols)
                else f"feature_{idx}"
            )
            result.append({
                "feature": col_name,
                "importance": round(float(importances[idx]), 4),
                "value": round(float(x[idx]), 4),
            })
        return result

    @staticmethod
    def _build_indicators(
        is_ai: bool,
        confidence: float,
        features: AudioFeatures,
        vocals: Optional[VocalFeatures],
        tower_scores: dict,
    ) -> list[str]:
        """Generate human-readable indicators.

        Declared static because it reads no instance state; callers
        invoke it as ``self._build_indicators(...)``.

        Args:
            is_ai: Final decision.
            confidence: Confidence in that decision.
            features: Librosa-extracted audio features.
            vocals: Vocal analysis results, if any.
            tower_scores: Per-tower scores; > 0.5 counts as an AI vote.

        Returns:
            Indicator sentences, overall verdict first.
        """
        indicators = []

        # Overall
        label = "AI-generated" if is_ai else "human-composed"
        if confidence > 0.85:
            indicators.append(
                f"High confidence: classified as {label}."
            )
        elif confidence > 0.70:
            indicators.append(
                f"Moderate confidence: likely {label}."
            )
        else:
            indicators.append(
                f"Low confidence: borderline {label}."
            )

        # Tower agreement (only meaningful with more than one signal)
        ai_towers = sum(
            1 for v in tower_scores.values() if v > 0.5
        )
        human_towers = sum(
            1 for v in tower_scores.values() if v <= 0.5
        )
        total = ai_towers + human_towers

        if total > 1:
            if ai_towers == total:
                indicators.append(
                    f"All {total} analysis signals agree: AI-generated."
                )
            elif human_towers == total:
                indicators.append(
                    f"All {total} analysis signals agree: human-composed."
                )
            else:
                indicators.append(
                    f"Mixed signals: {ai_towers}/{total} indicate AI, "
                    f"{human_towers}/{total} indicate human."
                )

        # Spectral
        if features.spectral_regularity > 0.7:
            indicators.append(
                "High spectral regularity — typical of AI synthesis."
            )
        elif features.spectral_regularity < 0.3:
            indicators.append(
                "Natural spectral variation — consistent with human recording."
            )

        # Temporal
        if features.temporal_patterns > 0.7:
            indicators.append(
                f"Metronomic timing precision "
                f"(tempo jitter: {features.tempo_stability:.3f}s)."
            )

        # Vocals
        if vocals and vocals.has_vocals:
            if vocals.vocal_ai_score > 0.7:
                indicators.append(
                    "Vocal analysis indicates synthetic voice characteristics."
                )
            elif vocals.vocal_ai_score < 0.3:
                indicators.append(
                    "Vocal patterns consistent with natural human singing."
                )

            if vocals.pitch_std_cents < 10:
                indicators.append(
                    f"Pitch jitter ({vocals.pitch_std_cents:.1f} cents) "
                    "is unusually low — suggests synthetic vocal."
                )

        return indicators