Spaces:

Rthur2003
/

crowncode-backend

Sleeping

App Files Files Community

crowncode-backend / app /services /meta_classifier.py

Rthur2003

fix: add docstring to MetaClassifierService constructor for clarity

b018523 about 2 months ago

raw

history blame contribute delete

15.4 kB

	"""
	Meta-classifier service for AURIS score fusion.

	Replaces the hand-tuned weighted-average fusion with
	a trained stacking ensemble that combines signals from
	all analysis towers.

	Towers:
	1. wav2vec2 fine-tuned (logit + hidden stats)
	2. Librosa features (spectral, temporal, harmonic)
	3. Vocal analysis (pitch, vibrato, formant, breath)
	4. CLAP embeddings (when available)
	5. FST external (when available)

	The meta-classifier was trained on the same dataset
	with cross-validated tower outputs.
	"""

	from __future__ import annotations

	import json
	import pickle
	from dataclasses import dataclass, field
	from pathlib import Path
	from typing import List, Optional

	import numpy as np

	from .feature_extractor import AudioFeatures
	from .vocal_analyzer import VocalFeatures
	from .wav2vec2_detector import Wav2Vec2Result
	from .clap_detector import CLAPResult
	from .fst_client import FSTResult
	from .logging_config import get_logger

	logger = get_logger(__name__)

	_MODELS_DIR = Path(__file__).resolve().parents[2] / "models"


	@dataclass
	class MetaResult:
	    """
	    Final detection verdict produced by the meta-classifier.

	    Only ``is_ai_generated`` and ``confidence`` are required;
	    every other field has a default so callers may fill in just
	    the parts they have.
	    """

	    # --- Verdict -------------------------------------------------
	    is_ai_generated: bool
	    confidence: float

	    # --- Provenance of the decision ------------------------------
	    model_version: str = "auris-v2-meta"
	    decision_source: str = "auris_meta"
	    analysis_mode: str = "production"

	    # --- Explanation payload -------------------------------------
	    # Score reported by each analysis tower, keyed by tower name.
	    tower_scores: dict = field(default_factory=dict)

	    # Human-readable sentences explaining the verdict.
	    indicators: List[str] = field(default_factory=list)

	    # Most influential features for this prediction, one dict each.
	    top_features: List[dict] = field(default_factory=list)


	class MetaClassifierService:
	    """
	    Trained stacking meta-classifier.

	    Flattens the Librosa and vocal tower outputs into a single
	    feature vector, runs the trained classifier on it, and
	    generates human-readable indicators.

	    Falls back to a weighted average of the available tower
	    scores when the trained model artifacts cannot be loaded
	    (e.g. during development before first training).
	    """

	    # AudioFeatures attributes copied under the same name into the
	    # flat feature dict.
	    _AUDIO_FEATURE_KEYS = (
	        "duration_sec",
	        "sample_rate",
	        "rms_energy",
	        "tempo_bpm",
	        "tempo_stability",
	        "spectral_centroid_mean",
	        "spectral_centroid_std",
	        "spectral_flatness_mean",
	        "mfcc_variance",
	        "chroma_entropy",
	        "harmonic_ratio",
	        "zero_crossing_rate",
	        "spectral_regularity",
	        "temporal_patterns",
	        "harmonic_structure",
	    )

	    # VocalFeatures attributes copied under the same name.
	    # "has_vocals" is handled separately: it is encoded as 0.0/1.0.
	    _VOCAL_FEATURE_KEYS = (
	        "vocal_confidence",
	        "vocal_ai_score",
	        "pitch_stability_score",
	        "vibrato_regularity_score",
	        "formant_consistency_score",
	        "breath_pattern_score",
	        "vocal_texture_score",
	        "pitch_mean_hz",
	        "pitch_std_cents",
	        "vibrato_rate_hz",
	        "vibrato_extent_cents",
	        "vocal_harmonic_ratio",
	        "vocal_energy_ratio",
	    )

	    # Bounds for the confidence reported by the fallback fusion.
	    _FALLBACK_CONFIDENCE_MIN = 0.51
	    _FALLBACK_CONFIDENCE_MAX = 0.97

	    def __init__(self) -> None:
	        """Initialize meta-classifier with empty state."""
	        self._model = None
	        self._scaler = None
	        self._feature_cols: list[str] = []
	        self._initialized = False
	        self._trained = False

	    def _ensure_loaded(self) -> bool:
	        """
	        Load the trained meta-classifier artifacts once.

	        The load is attempted only on the first call; later calls
	        return the cached outcome.

	        Returns:
	            True when model, scaler and column list were all
	            loaded, False otherwise.
	        """
	        if self._initialized:
	            return self._trained

	        # Set before loading so a failed load is not retried.
	        self._initialized = True

	        model_path = _MODELS_DIR / "auris_classifier_v1.pkl"
	        scaler_path = _MODELS_DIR / "feature_scaler_v1.pkl"
	        columns_path = _MODELS_DIR / "feature_columns_v1.json"

	        if not model_path.exists():
	            logger.info(
	                "Meta-classifier not found. "
	                "Using fallback fusion."
	            )
	            return False

	        try:
	            # NOTE(security): pickle.load can execute arbitrary
	            # code. These files must come from a trusted source.
	            with open(model_path, "rb") as f:
	                model = pickle.load(f)
	            with open(scaler_path, "rb") as f:
	                scaler = pickle.load(f)
	            with open(columns_path, "r", encoding="utf-8") as f:
	                feature_cols = json.load(f)

	            # Built inside the try so a malformed column file is
	            # reported as a load failure too.
	            summary = (
	                f"Meta-classifier loaded: "
	                f"{type(model).__name__}, "
	                f"{len(feature_cols)} features"
	            )
	        except Exception as e:
	            logger.error(f"Failed to load meta-classifier: {e}")
	            return False

	        # Commit only after every artifact loaded, so the service
	        # never holds a model without its scaler or columns.
	        self._model = model
	        self._scaler = scaler
	        self._feature_cols = feature_cols
	        self._trained = True
	        logger.info(summary)
	        return True

	    def predict(
	        self,
	        features: AudioFeatures,
	        vocals: Optional[VocalFeatures] = None,
	        wav2vec2: Optional[Wav2Vec2Result] = None,
	        clap: Optional[CLAPResult] = None,
	        fst: Optional[FSTResult] = None,
	    ) -> MetaResult:
	        """
	        Run meta-classifier on all tower outputs.

	        Args:
	            features: Librosa-extracted audio features.
	            vocals: Vocal analysis results.
	            wav2vec2: wav2vec2 tower result.
	            clap: CLAP embedding result.
	            fst: FST external API result.

	        Returns:
	            MetaResult with final prediction and explanations.
	        """
	        is_trained = self._ensure_loaded()

	        # Collect per-tower scores for transparency. Insertion
	        # order is kept as it is part of the returned payload.
	        tower_scores = {}
	        if wav2vec2 and wav2vec2.available:
	            tower_scores["wav2vec2"] = wav2vec2.p_ai
	        if clap and clap.available:
	            tower_scores["clap"] = clap.confidence
	        if fst and fst.available:
	            tower_scores["fst"] = fst.confidence

	        # Local feature score (heuristic, used as fallback signal)
	        local_score = (
	            features.spectral_regularity * 0.35
	            + features.temporal_patterns * 0.35
	            + features.harmonic_structure * 0.30
	        )
	        tower_scores["local_features"] = round(local_score, 4)

	        if vocals and vocals.has_vocals:
	            tower_scores["vocals"] = vocals.vocal_ai_score

	        run = (
	            self._predict_trained
	            if is_trained
	            else self._predict_fallback
	        )
	        return run(
	            features, vocals, wav2vec2, clap, fst,
	            tower_scores,
	        )

	    def _predict_trained(
	        self,
	        features: AudioFeatures,
	        vocals: Optional[VocalFeatures],
	        wav2vec2: Optional[Wav2Vec2Result],
	        clap: Optional[CLAPResult],
	        fst: Optional[FSTResult],
	        tower_scores: dict,
	    ) -> MetaResult:
	        """
	        Prediction using trained meta-classifier.

	        Only ``features`` and ``vocals`` feed the model input.
	        ``fst`` is applied afterwards as a calibration step.
	        ``wav2vec2`` and ``clap`` are accepted for signature
	        parity with the fallback path but are not read here.

	        Args:
	            features: Librosa-extracted audio features.
	            vocals: Vocal analysis results, or None.
	            wav2vec2: wav2vec2 tower result (unused here).
	            clap: CLAP embedding result (unused here).
	            fst: FST external API result, or None.
	            tower_scores: Per-tower scores; the "fst" entry is
	                (re)written in place when FST is available.

	        Returns:
	            MetaResult tagged as "auris-v2-trained".
	        """
	        # Build feature vector matching training columns
	        feat_dict = self._build_feature_dict(features, vocals)

	        # Assemble in training column order; unknown columns
	        # default to 0.0.
	        x = np.array(
	            [feat_dict.get(col, 0.0) for col in self._feature_cols],
	            dtype=np.float32,
	        ).reshape(1, -1)

	        x = np.nan_to_num(x, nan=0.0, posinf=1.0, neginf=-1.0)
	        x_scaled = self._scaler.transform(x)

	        # NOTE(review): assumes column 1 of predict_proba is the
	        # AI class; confirm against the model's class ordering.
	        proba = self._model.predict_proba(x_scaled)[0]
	        p_ai = float(proba[1])

	        # FST calibration (not in trained model)
	        if fst and fst.available:
	            tower_scores["fst"] = fst.confidence
	            if (p_ai > 0.5) != fst.is_ai:
	                # Disagreement: pull the probability 15% of the
	                # way toward 0.5 to moderate confidence.
	                p_ai = p_ai * 0.85 + 0.5 * 0.15

	        is_ai = p_ai > 0.5
	        # Confidence is the probability of the predicted class.
	        confidence = round(p_ai if is_ai else 1.0 - p_ai, 4)

	        indicators = self._build_indicators(
	            is_ai, confidence, features, vocals,
	            tower_scores,
	        )

	        # "value" in each entry is the scaled feature value.
	        top_features = self._get_top_features(x_scaled[0])

	        return MetaResult(
	            is_ai_generated=is_ai,
	            confidence=confidence,
	            model_version="auris-v2-trained",
	            decision_source="auris_meta",
	            analysis_mode="production",
	            tower_scores=tower_scores,
	            indicators=indicators,
	            top_features=top_features,
	        )

	    @staticmethod
	    def _fallback_probability(
	        tower_scores: dict,
	        vocals,
	        wav2vec2,
	        clap,
	        fst,
	    ) -> float:
	        """Weighted average of the available tower scores."""
	        has_w2v = bool(wav2vec2 and wav2vec2.available)

	        # (score, weight) pairs; weights are renormalized below.
	        weighted = []

	        # wav2vec2 gets highest weight if available
	        if has_w2v:
	            weighted.append((wav2vec2.p_ai, 0.40))

	        # Local features are always present and weigh more when
	        # wav2vec2 is missing.
	        weighted.append((
	            tower_scores.get("local_features", 0.5),
	            0.25 if has_w2v else 0.45,
	        ))

	        if vocals and vocals.has_vocals:
	            weighted.append((vocals.vocal_ai_score, 0.15))

	        # NOTE(review): clap.confidence and fst.confidence are
	        # averaged as if they were P(AI); confirm they are not
	        # confidence in whichever class those towers predicted.
	        if clap and clap.available:
	            weighted.append((clap.confidence, 0.10))

	        if fst and fst.available:
	            weighted.append((fst.confidence, 0.20))

	        # Never zero: the local-features entry is always added.
	        total_w = sum(w for _, w in weighted)
	        return sum(s * (w / total_w) for s, w in weighted)

	    @staticmethod
	    def _fallback_confidence(p_ai: float) -> float:
	        """
	        Clamped confidence in the class predicted from ``p_ai``.

	        Uses the probability of the predicted class (``p_ai`` for
	        AI, ``1 - p_ai`` for human), matching the trained path, so
	        a strong "human" verdict is not reported as 0.51.
	        """
	        cls = MetaClassifierService
	        raw = p_ai if p_ai > 0.5 else 1.0 - p_ai
	        clamped = max(
	            cls._FALLBACK_CONFIDENCE_MIN,
	            min(cls._FALLBACK_CONFIDENCE_MAX, raw),
	        )
	        return round(clamped, 4)

	    def _predict_fallback(
	        self,
	        features: AudioFeatures,
	        vocals: Optional[VocalFeatures],
	        wav2vec2: Optional[Wav2Vec2Result],
	        clap: Optional[CLAPResult],
	        fst: Optional[FSTResult],
	        tower_scores: dict,
	    ) -> MetaResult:
	        """
	        Fallback when trained model is not available.

	        Uses weighted averaging of available tower scores.
	        Better than heuristic-only but not data-driven.

	        Args:
	            features: Librosa-extracted audio features.
	            vocals: Vocal analysis results, or None.
	            wav2vec2: wav2vec2 tower result, or None.
	            clap: CLAP embedding result, or None.
	            fst: FST external API result, or None.
	            tower_scores: Per-tower scores; "local_features" is
	                read from here (0.5 when absent).

	        Returns:
	            MetaResult tagged as "auris-v1-heuristic".
	        """
	        p_ai = self._fallback_probability(
	            tower_scores, vocals, wav2vec2, clap, fst,
	        )

	        is_ai = p_ai > 0.5
	        confidence = self._fallback_confidence(p_ai)

	        indicators = self._build_indicators(
	            is_ai, confidence, features, vocals,
	            tower_scores,
	        )

	        return MetaResult(
	            is_ai_generated=is_ai,
	            confidence=confidence,
	            model_version="auris-v1-heuristic",
	            decision_source="auris_fallback",
	            analysis_mode="production",
	            tower_scores=tower_scores,
	            indicators=indicators,
	        )

	    def _build_feature_dict(
	        self,
	        features: AudioFeatures,
	        vocals: Optional[VocalFeatures],
	    ) -> dict:
	        """
	        Build flat feature dict for meta-classifier.

	        Args:
	            features: Source of the audio-level entries.
	            vocals: Source of the vocal entries; when falsy every
	                vocal entry (including "has_vocals") is 0.0.

	        Returns:
	            Dict mapping feature name to value.
	        """
	        d = {
	            key: getattr(features, key)
	            for key in self._AUDIO_FEATURE_KEYS
	        }

	        if vocals:
	            d["has_vocals"] = 1.0 if vocals.has_vocals else 0.0
	            d.update(
	                (key, getattr(vocals, key))
	                for key in self._VOCAL_FEATURE_KEYS
	            )
	        else:
	            # Same keys as above so the vector layout is stable.
	            d["has_vocals"] = 0.0
	            d.update(dict.fromkeys(self._VOCAL_FEATURE_KEYS, 0.0))

	        return d

	    def _get_top_features(
	        self, x: np.ndarray, top_n: int = 5
	    ) -> list[dict]:
	        """
	        Get top contributing features for this prediction.

	        Ranks by the model's ``feature_importances_``, which is
	        one ranking for the whole model; only the reported
	        "value" is specific to this sample.
	        In future: SHAP values for per-sample explanation.

	        Args:
	            x: 1-D feature vector for the sample being explained.
	            top_n: Maximum number of features to return.

	        Returns:
	            Dicts with "feature", "importance" and "value" keys,
	            most important first; empty when the model exposes no
	            importances.
	        """
	        importances = getattr(
	            self._model, "feature_importances_", None
	        )
	        if importances is None:
	            return []

	        ranked = np.argsort(importances)[::-1][:top_n]
	        n_cols = len(self._feature_cols)

	        return [
	            {
	                # Synthetic name if the column list is shorter
	                # than the model's feature count.
	                "feature": (
	                    self._feature_cols[idx]
	                    if idx < n_cols
	                    else f"feature_{idx}"
	                ),
	                "importance": round(float(importances[idx]), 4),
	                "value": round(float(x[idx]), 4),
	            }
	            for idx in ranked
	        ]

	    @staticmethod
	    def _build_indicators(
	        is_ai: bool,
	        confidence: float,
	        features: AudioFeatures,
	        vocals: Optional[VocalFeatures],
	        tower_scores: dict,
	    ) -> list[str]:
	        """
	        Generate human-readable indicators.

	        Args:
	            is_ai: Final verdict.
	            confidence: Confidence in the verdict.
	            features: Audio features (spectral/temporal scores).
	            vocals: Vocal analysis results, or None.
	            tower_scores: Per-tower scores for the agreement line.

	        Returns:
	            Indicator sentences, overall verdict first.
	        """
	        indicators = []

	        # Overall
	        label = "AI-generated" if is_ai else "human-composed"
	        if confidence > 0.85:
	            indicators.append(
	                f"High confidence: classified as {label}."
	            )
	        elif confidence > 0.70:
	            indicators.append(
	                f"Moderate confidence: likely {label}."
	            )
	        else:
	            indicators.append(
	                f"Low confidence: borderline {label}."
	            )

	        # Tower agreement
	        # NOTE(review): every score > 0.5 counts as an "AI" vote;
	        # confirm the clap/fst entries really are P(AI).
	        votes = list(tower_scores.values())
	        ai_towers = sum(1 for v in votes if v > 0.5)
	        human_towers = sum(1 for v in votes if v <= 0.5)
	        # Sum of both counts (not len) so NaN scores are excluded.
	        total = ai_towers + human_towers

	        if total > 1:
	            if ai_towers == total:
	                indicators.append(
	                    f"All {total} analysis signals agree: AI-generated."
	                )
	            elif human_towers == total:
	                indicators.append(
	                    f"All {total} analysis signals agree: human-composed."
	                )
	            else:
	                indicators.append(
	                    f"Mixed signals: {ai_towers}/{total} indicate AI, "
	                    f"{human_towers}/{total} indicate human."
	                )

	        # Spectral
	        if features.spectral_regularity > 0.7:
	            indicators.append(
	                "High spectral regularity — typical of AI synthesis."
	            )
	        elif features.spectral_regularity < 0.3:
	            indicators.append(
	                "Natural spectral variation — consistent with human recording."
	            )

	        # Temporal
	        if features.temporal_patterns > 0.7:
	            indicators.append(
	                f"Metronomic timing precision "
	                f"(tempo jitter: {features.tempo_stability:.3f}s)."
	            )

	        # Vocals
	        if vocals and vocals.has_vocals:
	            if vocals.vocal_ai_score > 0.7:
	                indicators.append(
	                    "Vocal analysis indicates synthetic voice characteristics."
	                )
	            elif vocals.vocal_ai_score < 0.3:
	                indicators.append(
	                    "Vocal patterns consistent with natural human singing."
	                )

	            if vocals.pitch_std_cents < 10:
	                indicators.append(
	                    f"Pitch jitter ({vocals.pitch_std_cents:.1f} cents) "
	                    "is unusually low — suggests synthetic vocal."
	                )

	        return indicators