# Source: crowncode-backend/app/services/meta_classifier.py
# Last commit: b018523 (author: Rthur2003)
#   fix: add docstring to MetaClassifierService constructor for clarity
"""
Meta-classifier service for AURIS score fusion.
Replaces the hand-tuned weighted-average fusion with
a trained stacking ensemble that combines signals from
all analysis towers.
Towers:
1. wav2vec2 fine-tuned (logit + hidden stats)
2. Librosa features (spectral, temporal, harmonic)
3. Vocal analysis (pitch, vibrato, formant, breath)
4. CLAP embeddings (when available)
5. FST external (when available)
The meta-classifier was trained on the same dataset
with cross-validated tower outputs.
"""
from __future__ import annotations
import json
import pickle
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional
import numpy as np
from .feature_extractor import AudioFeatures
from .vocal_analyzer import VocalFeatures
from .wav2vec2_detector import Wav2Vec2Result
from .clap_detector import CLAPResult
from .fst_client import FSTResult
from .logging_config import get_logger
logger = get_logger(__name__)
_MODELS_DIR = Path(__file__).resolve().parents[2] / "models"
@dataclass
class MetaResult:
    """
    Outcome of a single meta-classifier run.

    Attributes:
        is_ai_generated: Final verdict of the fusion step.
        confidence: Confidence score attached to that verdict.
        model_version: Identifier of the model that produced the result.
        decision_source: Which fusion path made the decision.
        analysis_mode: Mode label reported alongside the result.
        tower_scores: Per-tower scores, exposed for transparency.
        indicators: Human-readable explanation strings
            (SHAP-based when available).
        top_features: Feature importances reported for this prediction.
    """

    is_ai_generated: bool
    confidence: float
    model_version: str = "auris-v2-meta"
    decision_source: str = "auris_meta"
    analysis_mode: str = "production"
    # Mutable containers use default_factory so that every instance
    # gets its own fresh object.
    tower_scores: dict = field(default_factory=dict)
    indicators: list[str] = field(default_factory=list)
    top_features: list[dict] = field(default_factory=list)
class MetaClassifierService:
    """
    Score-fusion service backed by a trained meta-classifier.

    When the trained artefacts are present on disk, a feature vector is
    built from the librosa features and the vocal-analysis features, is
    scaled, and is passed to the trained classifier. An available FST
    result is applied afterwards as a calibration step. wav2vec2 and
    CLAP results are reported in ``tower_scores`` but are not part of
    the feature vector built here; any trained column this service does
    not produce is filled with 0.0.

    Falls back to a weighted average of the available tower scores when
    the trained model is not available (during development before the
    first training).
    """

    # Attributes copied verbatim from ``AudioFeatures`` into the vector.
    _AUDIO_FEATURE_KEYS = (
        "duration_sec",
        "sample_rate",
        "rms_energy",
        "tempo_bpm",
        "tempo_stability",
        "spectral_centroid_mean",
        "spectral_centroid_std",
        "spectral_flatness_mean",
        "mfcc_variance",
        "chroma_entropy",
        "harmonic_ratio",
        "zero_crossing_rate",
        "spectral_regularity",
        "temporal_patterns",
        "harmonic_structure",
    )

    # Attributes copied verbatim from ``VocalFeatures``. ``has_vocals``
    # is handled separately because it is mapped to 1.0 / 0.0.
    _VOCAL_FEATURE_KEYS = (
        "vocal_confidence",
        "vocal_ai_score",
        "pitch_stability_score",
        "vibrato_regularity_score",
        "formant_consistency_score",
        "breath_pattern_score",
        "vocal_texture_score",
        "pitch_mean_hz",
        "pitch_std_cents",
        "vibrato_rate_hz",
        "vibrato_extent_cents",
        "vocal_harmonic_ratio",
        "vocal_energy_ratio",
    )

    def __init__(self) -> None:
        """Initialize meta-classifier with empty state."""
        self._model = None
        self._scaler = None
        self._feature_cols: list[str] = []
        # _initialized: a load attempt has been made (successful or not).
        # _trained: the attempt succeeded and the model can be used.
        self._initialized = False
        self._trained = False

    def _ensure_loaded(self) -> bool:
        """
        Load the trained meta-classifier artefacts on first use.

        The load is attempted only once per instance; later calls
        return the cached outcome.

        Returns:
            True when model, scaler and column list were all loaded,
            False otherwise (callers then use the fallback fusion).
        """
        if self._initialized:
            return self._trained
        self._initialized = True

        model_path = _MODELS_DIR / "auris_classifier_v1.pkl"
        scaler_path = _MODELS_DIR / "feature_scaler_v1.pkl"
        columns_path = _MODELS_DIR / "feature_columns_v1.json"

        if not model_path.exists():
            logger.info(
                "Meta-classifier not found. "
                "Using fallback fusion."
            )
            return False

        try:
            # SECURITY: pickle.load can execute arbitrary code. The
            # models directory must only contain trusted artefacts.
            with open(model_path, "rb") as f:
                model = pickle.load(f)
            with open(scaler_path, "rb") as f:
                scaler = pickle.load(f)
            with open(columns_path, "r", encoding="utf-8") as f:
                feature_cols = json.load(f)
        except Exception as e:
            # Best-effort: any load problem means "use the fallback".
            logger.error(f"Failed to load meta-classifier: {e}")
            return False

        # Commit only after every artefact loaded, so a partial failure
        # never leaves a model without its scaler or columns.
        self._model = model
        self._scaler = scaler
        self._feature_cols = feature_cols
        self._trained = True
        logger.info(
            f"Meta-classifier loaded: "
            f"{type(self._model).__name__}, "
            f"{len(self._feature_cols)} features"
        )
        return True

    def predict(
        self,
        features: AudioFeatures,
        vocals: Optional[VocalFeatures] = None,
        wav2vec2: Optional[Wav2Vec2Result] = None,
        clap: Optional[CLAPResult] = None,
        fst: Optional[FSTResult] = None,
    ) -> MetaResult:
        """
        Run meta-classifier on all tower outputs.

        Args:
            features: Librosa-extracted audio features.
            vocals: Vocal analysis results.
            wav2vec2: wav2vec2 tower result.
            clap: CLAP embedding result.
            fst: FST external API result.

        Returns:
            MetaResult with final prediction and explanations.
        """
        is_trained = self._ensure_loaded()

        # Collect per-tower scores for transparency.
        tower_scores = {}
        if wav2vec2 and wav2vec2.available:
            tower_scores["wav2vec2"] = wav2vec2.p_ai
        # NOTE(review): clap.confidence and fst.confidence are treated
        # as P(AI) downstream (compared against 0.5) - confirm that
        # they are not "confidence in the tower's own verdict".
        if clap and clap.available:
            tower_scores["clap"] = clap.confidence
        if fst and fst.available:
            tower_scores["fst"] = fst.confidence

        # Local feature score (heuristic, used as fallback signal).
        local_score = (
            features.spectral_regularity * 0.35
            + features.temporal_patterns * 0.35
            + features.harmonic_structure * 0.30
        )
        tower_scores["local_features"] = round(local_score, 4)

        if vocals and vocals.has_vocals:
            tower_scores["vocals"] = vocals.vocal_ai_score

        run = (
            self._predict_trained if is_trained
            else self._predict_fallback
        )
        return run(
            features, vocals, wav2vec2, clap, fst,
            tower_scores,
        )

    def _predict_trained(
        self,
        features: AudioFeatures,
        vocals: Optional[VocalFeatures],
        wav2vec2: Optional[Wav2Vec2Result],
        clap: Optional[CLAPResult],
        fst: Optional[FSTResult],
        tower_scores: dict,
    ) -> MetaResult:
        """
        Prediction using trained meta-classifier.

        ``wav2vec2`` and ``clap`` are accepted for signature parity
        with ``_predict_fallback`` but are not read here.
        """
        # Build feature vector matching training columns.
        feat_dict = self._build_feature_dict(features, vocals)

        # Assemble in training column order; a column this service
        # does not produce falls back to 0.0.
        x = np.array(
            [feat_dict.get(col, 0.0) for col in self._feature_cols],
            dtype=np.float32,
        ).reshape(1, -1)
        x = np.nan_to_num(x, nan=0.0, posinf=1.0, neginf=-1.0)
        x_scaled = self._scaler.transform(x)

        # Index 1 of predict_proba is used as the AI probability.
        proba = self._model.predict_proba(x_scaled)[0]
        p_ai = float(proba[1])

        # FST calibration (not in trained model).
        if fst and fst.available:
            tower_scores["fst"] = fst.confidence
            if (p_ai > 0.5) != fst.is_ai:
                # Disagreement: pull the probability 15% towards 0.5.
                p_ai = p_ai * 0.85 + 0.5 * 0.15

        is_ai = p_ai > 0.5
        confidence = round(p_ai if is_ai else 1.0 - p_ai, 4)

        indicators = self._build_indicators(
            is_ai, confidence, features, vocals,
            tower_scores,
        )
        top_features = self._get_top_features(x_scaled[0])

        return MetaResult(
            is_ai_generated=is_ai,
            confidence=confidence,
            model_version="auris-v2-trained",
            decision_source="auris_meta",
            analysis_mode="production",
            tower_scores=tower_scores,
            indicators=indicators,
            top_features=top_features,
        )

    @staticmethod
    def _fallback_confidence(p_ai: float) -> tuple[bool, float]:
        """
        Turn a fused AI probability into (verdict, confidence).

        The confidence is the probability of the chosen class
        (``p_ai`` for an AI verdict, ``1 - p_ai`` for a human verdict),
        clamped to [0.51, 0.97] and rounded to 4 decimals.
        """
        is_ai = p_ai > 0.5
        decision_prob = p_ai if is_ai else 1.0 - p_ai
        confidence = round(max(0.51, min(0.97, decision_prob)), 4)
        return is_ai, confidence

    def _predict_fallback(
        self,
        features: AudioFeatures,
        vocals: Optional[VocalFeatures],
        wav2vec2: Optional[Wav2Vec2Result],
        clap: Optional[CLAPResult],
        fst: Optional[FSTResult],
        tower_scores: dict,
    ) -> MetaResult:
        """
        Fallback when trained model is not available.

        Uses weighted averaging of available tower scores; the weights
        are renormalised over the towers that are actually present.
        Better than heuristic-only but not data-driven.
        """
        has_wav2vec2 = bool(wav2vec2 and wav2vec2.available)
        scores = []
        weights = []

        # wav2vec2 gets highest weight if available.
        if has_wav2vec2:
            scores.append(wav2vec2.p_ai)
            weights.append(0.40)

        # Local features are always present; they weigh more when
        # wav2vec2 is missing.
        scores.append(tower_scores.get("local_features", 0.5))
        weights.append(0.25 if has_wav2vec2 else 0.45)

        if vocals and vocals.has_vocals:
            scores.append(vocals.vocal_ai_score)
            weights.append(0.15)

        if clap and clap.available:
            scores.append(clap.confidence)
            weights.append(0.10)

        if fst and fst.available:
            scores.append(fst.confidence)
            weights.append(0.20)

        # Weighted average (total_w > 0: local features always count).
        total_w = sum(weights)
        p_ai = sum(
            s * (w / total_w) for s, w in zip(scores, weights)
        )

        # Confidence must describe the chosen class, not raw p_ai,
        # otherwise every "human" verdict would be pinned at 0.51.
        is_ai, confidence = self._fallback_confidence(p_ai)

        indicators = self._build_indicators(
            is_ai, confidence, features, vocals,
            tower_scores,
        )

        return MetaResult(
            is_ai_generated=is_ai,
            confidence=confidence,
            model_version="auris-v1-heuristic",
            decision_source="auris_fallback",
            analysis_mode="production",
            tower_scores=tower_scores,
            indicators=indicators,
        )

    def _build_feature_dict(
        self,
        features: AudioFeatures,
        vocals: Optional[VocalFeatures],
    ) -> dict:
        """
        Build flat feature dict for meta-classifier.

        Audio features are copied by attribute name. Vocal features
        are copied the same way when ``vocals`` is given; otherwise
        every vocal key is set to 0.0.
        """
        d = {
            key: getattr(features, key)
            for key in self._AUDIO_FEATURE_KEYS
        }

        if vocals:
            d["has_vocals"] = 1.0 if vocals.has_vocals else 0.0
            for key in self._VOCAL_FEATURE_KEYS:
                d[key] = getattr(vocals, key)
        else:
            d["has_vocals"] = 0.0
            for key in self._VOCAL_FEATURE_KEYS:
                d[key] = 0.0

        return d

    def _get_top_features(
        self, x: np.ndarray, top_n: int = 5
    ) -> list[dict]:
        """
        Get top contributing features for this prediction.

        Ranks features by the model's ``feature_importances_`` (one
        ranking for the whole model, not per sample) and reports the
        value each of them has in ``x``. Returns an empty list when
        the model does not expose ``feature_importances_``.
        In future: SHAP values for per-sample explanation.

        Args:
            x: 1-D feature vector, in training column order.
            top_n: Maximum number of features to report.
        """
        if not hasattr(self._model, "feature_importances_"):
            return []

        importances = self._model.feature_importances_
        # Indices of the largest importances, in descending order.
        indices = np.argsort(importances)[::-1][:top_n]

        result = []
        for idx in indices:
            col_name = (
                self._feature_cols[idx]
                if idx < len(self._feature_cols)
                else f"feature_{idx}"
            )
            result.append({
                "feature": col_name,
                "importance": round(float(importances[idx]), 4),
                "value": round(float(x[idx]), 4),
            })
        return result

    @staticmethod
    def _build_indicators(
        is_ai: bool,
        confidence: float,
        features: AudioFeatures,
        vocals: Optional[VocalFeatures],
        tower_scores: dict,
    ) -> list[str]:
        """
        Generate human-readable indicators.

        Produces, in order: an overall confidence statement, a
        tower-agreement statement (only when more than one tower score
        is present), then spectral, temporal and vocal observations
        when their thresholds are crossed.
        """
        indicators = []

        # Overall verdict, bucketed by confidence.
        label = "AI-generated" if is_ai else "human-composed"
        if confidence > 0.85:
            indicators.append(
                f"High confidence: classified as {label}."
            )
        elif confidence > 0.70:
            indicators.append(
                f"Moderate confidence: likely {label}."
            )
        else:
            indicators.append(
                f"Low confidence: borderline {label}."
            )

        # Tower agreement: a score above 0.5 counts as a vote for AI.
        # Both sides are counted explicitly so that a NaN score is
        # excluded from the total instead of counting as "human".
        ai_towers = sum(
            1 for v in tower_scores.values() if v > 0.5
        )
        human_towers = sum(
            1 for v in tower_scores.values() if v <= 0.5
        )
        total = ai_towers + human_towers

        if total > 1:
            if ai_towers == total:
                indicators.append(
                    f"All {total} analysis signals agree: AI-generated."
                )
            elif human_towers == total:
                indicators.append(
                    f"All {total} analysis signals agree: human-composed."
                )
            else:
                indicators.append(
                    f"Mixed signals: {ai_towers}/{total} indicate AI, "
                    f"{human_towers}/{total} indicate human."
                )

        # Spectral
        if features.spectral_regularity > 0.7:
            indicators.append(
                "High spectral regularity — typical of AI synthesis."
            )
        elif features.spectral_regularity < 0.3:
            indicators.append(
                "Natural spectral variation — consistent with human recording."
            )

        # Temporal
        if features.temporal_patterns > 0.7:
            indicators.append(
                f"Metronomic timing precision "
                f"(tempo jitter: {features.tempo_stability:.3f}s)."
            )

        # Vocals (only when a vocal track was detected)
        if vocals and vocals.has_vocals:
            if vocals.vocal_ai_score > 0.7:
                indicators.append(
                    "Vocal analysis indicates synthetic voice characteristics."
                )
            elif vocals.vocal_ai_score < 0.3:
                indicators.append(
                    "Vocal patterns consistent with natural human singing."
                )

            if vocals.pitch_std_cents < 10:
                indicators.append(
                    f"Pitch jitter ({vocals.pitch_std_cents:.1f} cents) "
                    "is unusually low — suggests synthetic vocal."
                )

        return indicators