Spaces:
Sleeping
Sleeping
| """ | |
| CLAP-based AI music detection service (Layer 2). | |
| Uses CLAP (Contrastive Language-Audio Pretraining) embeddings | |
| with a trained classifier to detect AI-generated music. | |
| Approach based on academic research: | |
| 1. Extract 512-dim CLAP audio embeddings | |
| 2. Normalize with StandardScaler | |
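| 3. Classify with Random Forest / SVM ensemble (the current code substitutes heuristic thresholds on embedding statistics for trained classifiers) | |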
| Gracefully degrades if CLAP model is unavailable. | |
| """ | |
| from __future__ import annotations | |
| import io | |
| import tempfile | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
| from typing import Optional, Union | |
| import numpy as np | |
| from .logging_config import get_logger | |
| logger = get_logger(__name__) | |
# Lazy imports — CLAP + torch are heavy
# Module-level cache for the loaded CLAP model; filled in by _load_clap().
_clap_module = None
# Flipped to True only when the scikit-learn imports below succeed.
_sklearn_available = False
# Set by _load_clap() once a model has been loaded successfully.
_CLAP_READY = False
try:
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    _sklearn_available = True
except ImportError:
    # scikit-learn is optional: log and continue with the flag left False.
    logger.warning("scikit-learn not available — CLAP classifier disabled")
@dataclass
class CLAPResult:
    """Result from CLAP-based detection.

    The class must be a dataclass: the rest of the module constructs it
    with keyword arguments (``CLAPResult(available=..., ...)``), which
    requires the generated ``__init__``.
    """

    # Whether a detection result could be produced at all.
    available: bool
    # Verdict; meaningful only when ``available`` is True.
    is_ai: bool = False
    # Score attached to the verdict; 0.5 is the neutral default.
    confidence: float = 0.5
    # L2 norm of the CLAP embedding (left at 0.0 when no embedding was made).
    embedding_norm: float = 0.0
    # Label of the code path that produced the result.
    classifier_used: str = "none"
    # Error description when detection failed, otherwise None.
    error: Optional[str] = None
def _load_clap():
    """Return the shared CLAP model, loading it on the first successful call.

    A successfully loaded model is cached in module globals and reused.
    Returns ``None`` (after logging) when ``laion_clap`` cannot be imported
    or when constructing / loading the model raises.
    """
    global _clap_module, _CLAP_READY

    if not _CLAP_READY:
        try:
            # Imported here so the heavy dependency is only paid for on use.
            from laion_clap import CLAP_Module

            clap = CLAP_Module(enable_fusion=False, amodel="HTSAT-base")
            clap.load_ckpt()  # Downloads default checkpoint if needed
            _clap_module = clap
            _CLAP_READY = True
            logger.info("CLAP model loaded successfully")
        except ImportError:
            logger.warning(
                "laion-clap not installed — CLAP layer unavailable"
            )
            return None
        except Exception as e:
            logger.error(f"Failed to load CLAP model: {e}")
            return None

    return _clap_module
class CLAPDetectorService:
    """
    AI music detection via CLAP embeddings.

    When the CLAP model can be loaded, an audio embedding is extracted
    and scored with heuristic thresholds on its statistics.
    When CLAP is unavailable, a lightweight spectral-statistics
    heuristic built on librosa features is used instead.
    """

    def __init__(self) -> None:
        self._model = None
        # Placeholders for trained artifacts; nothing in this class
        # assigns them yet (see _init_classifiers).
        self._scaler: Optional[object] = None
        self._classifier_rf: Optional[object] = None
        self._classifier_svm: Optional[object] = None
        self._initialized = False

    def _ensure_initialized(self) -> bool:
        """Load the CLAP model on first call; return whether it is usable.

        The load is attempted once per instance: the outcome (including a
        failure) is remembered, so later calls do not retry.
        """
        if not self._initialized:
            self._model = _load_clap()
            self._initialized = True
            if self._model is not None and _sklearn_available:
                self._init_classifiers()
        return self._model is not None

    def _init_classifiers(self) -> None:
        """
        Initialize classifiers.

        In production, these would be loaded from pre-trained
        pkl files. For now, heuristic thresholds on embedding
        statistics act as a bootstrap classifier, so this only logs.
        """
        logger.info("CLAP classifiers initialized (heuristic mode)")

    def predict(
        self,
        source: Union[Path, bytes, io.BytesIO],
    ) -> CLAPResult:
        """
        Run CLAP-based AI detection on audio.

        Args:
            source: Audio file path, raw bytes, or BytesIO.

        Returns:
            CLAPResult with detection outcome. Failures are reported via
            ``available=False`` and ``error`` rather than raised.
        """
        if self._ensure_initialized():
            return self._predict_with_clap(source)
        return self._predict_heuristic(source)

    def _predict_with_clap(
        self,
        source: Union[Path, bytes, io.BytesIO],
    ) -> CLAPResult:
        """Full CLAP embedding + heuristic classification.

        Any exception is logged and converted into an unavailable result.
        A temp file created for in-memory input is removed before returning.
        """
        temp_path: Optional[Path] = None
        try:
            # CLAP needs a file path, so in-memory audio is spilled to disk.
            audio_path = self._to_file_path(source)
            if not isinstance(source, Path):
                # We created this file, so we are responsible for deleting it.
                temp_path = audio_path

            embedding = self._model.get_audio_embedding_from_filelist(
                [str(audio_path)], use_tensor=False
            )
            embedding = embedding.flatten()
            emb_norm = float(np.linalg.norm(embedding))

            verdict = self._classify_embedding(embedding)
            return CLAPResult(
                available=True,
                is_ai=verdict["is_ai"],
                confidence=verdict["confidence"],
                embedding_norm=emb_norm,
                classifier_used="clap_embedding",
            )
        except Exception as e:
            logger.warning(f"CLAP prediction failed: {e}")
            return CLAPResult(
                available=False,
                error=str(e),
            )
        finally:
            if temp_path is not None:
                try:
                    temp_path.unlink()
                except OSError as cleanup_err:
                    # Best effort: a leftover temp file must not mask
                    # the detection result.
                    logger.debug(
                        f"Could not remove temp file {temp_path}: "
                        f"{cleanup_err}"
                    )

    def _classify_embedding(
        self, embedding: np.ndarray
    ) -> dict:
        """
        Classify CLAP embedding as AI or human.

        Starts from a neutral score of 0.5 and nudges it using statistics
        of the embedding vector, following these working assumptions:
        - Kurtosis: AI audio → higher kurtosis (peakier dist)
        - Sparsity: AI audio → more near-zero dimensions
        - Entropy: AI audio → lower entropy (less diverse)
        - Std-dev: AI audio → lower spread

        Args:
            embedding: Embedding values; flattened to 1-D before scoring.

        Returns:
            Dict with ``is_ai`` (score > 0.5) and ``confidence`` (the score
            clamped to [0.1, 0.95] and rounded to 4 decimals).

        Raises:
            ValueError: If the embedding is empty or has non-finite values.
        """
        from scipy import stats as sp_stats

        embedding = np.asarray(embedding).ravel()
        if embedding.size == 0:
            raise ValueError("empty embedding")
        if not np.all(np.isfinite(embedding)):
            # NaN/inf would make every comparison below silently False.
            raise ValueError("embedding contains non-finite values")

        magnitudes = np.abs(embedding)
        emb_std = float(np.std(embedding))
        emb_kurtosis = float(sp_stats.kurtosis(embedding))

        # Sparsity: fraction of values below 1% of the peak magnitude.
        threshold = 0.01 * np.max(magnitudes)
        sparsity = float(np.mean(magnitudes < threshold))

        # Shannon entropy of the normalized magnitudes; the epsilon keeps
        # log2 finite for exact zeros.
        abs_emb = magnitudes + 1e-10
        prob = abs_emb / abs_emb.sum()
        entropy = float(-np.sum(prob * np.log2(prob)))
        max_entropy = np.log2(embedding.size)
        # A single-element embedding has max_entropy == 0; use NaN so that
        # no entropy branch fires instead of dividing by zero.
        if max_entropy > 0:
            norm_entropy = entropy / max_entropy
        else:
            norm_entropy = float("nan")

        score = 0.5

        # Kurtosis signal (AI embeddings are peakier)
        if emb_kurtosis > 3.0:
            score += 0.10
        elif emb_kurtosis > 1.5:
            score += 0.05
        elif emb_kurtosis < 0.5:
            score -= 0.08

        # Sparsity signal
        if sparsity > 0.15:
            score += 0.08
        elif sparsity > 0.08:
            score += 0.03
        elif sparsity < 0.03:
            score -= 0.06

        # Entropy signal (lower = more AI-like)
        if norm_entropy < 0.75:
            score += 0.10
        elif norm_entropy < 0.85:
            score += 0.04
        elif norm_entropy > 0.92:
            score -= 0.07

        # Standard deviation signal
        if emb_std < 0.15:
            score += 0.06
        elif emb_std > 0.35:
            score -= 0.05

        score = max(0.1, min(0.95, score))
        return {
            "is_ai": score > 0.5,
            "confidence": round(score, 4),
        }

    def _predict_heuristic(
        self,
        source: Union[Path, bytes, io.BytesIO],
    ) -> CLAPResult:
        """
        Lightweight heuristic when CLAP model is unavailable.

        Scores MFCC variance, MFCC kurtosis, spectral-contrast spread and
        spectral flatness against fixed thresholds. Requires librosa and
        scipy; any failure (including a missing dependency) is logged and
        returned as an unavailable result. Audio shorter than one second
        yields ``error="audio_too_short"``.
        """
        try:
            import librosa
            # Import the submodule explicitly: attribute access on the
            # bare ``scipy`` package is not guaranteed to expose ``stats``.
            from scipy import stats as sp_stats

            if isinstance(source, bytes):
                source = io.BytesIO(source)
            if isinstance(source, io.BytesIO):
                audio_input = source
            else:
                audio_input = str(source)
            y, sr = librosa.load(audio_input, sr=22050, mono=True)

            if len(y) < sr:  # less than 1 second
                return CLAPResult(
                    available=False,
                    error="audio_too_short",
                )

            # MFCC statistics; kurtosis is taken per coefficient (row)
            # and then averaged.
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
            mfcc_var = float(np.var(mfcc))
            mfcc_kurtosis = float(
                np.mean(sp_stats.kurtosis(mfcc, axis=1))
            )

            # Spectral contrast (AI tends to be more uniform)
            contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
            contrast_std = float(np.std(contrast))

            flatness = float(
                np.mean(librosa.feature.spectral_flatness(y=y))
            )

            score = 0.5

            # MFCC variance (AI → lower variance)
            if mfcc_var < 50:
                score += 0.08
            elif mfcc_var > 200:
                score -= 0.06

            # MFCC kurtosis (AI → higher)
            if mfcc_kurtosis > 2.0:
                score += 0.07
            elif mfcc_kurtosis < 0.5:
                score -= 0.05

            # Spectral contrast std (AI → lower)
            if contrast_std < 5.0:
                score += 0.06
            elif contrast_std > 12.0:
                score -= 0.05

            # Spectral flatness (AI → more tonal, lower flatness)
            if flatness < 0.05:
                score += 0.05
            elif flatness > 0.2:
                score -= 0.04

            score = max(0.1, min(0.95, score))
            return CLAPResult(
                available=True,
                is_ai=score > 0.5,
                confidence=round(score, 4),
                classifier_used="heuristic_spectral",
            )
        except Exception as e:
            logger.warning(f"CLAP heuristic failed: {e}")
            return CLAPResult(
                available=False,
                error=str(e),
            )

    @staticmethod
    def _to_file_path(
        source: Union[Path, bytes, io.BytesIO],
    ) -> Path:
        """Convert source to a file path for CLAP.

        A ``Path`` is returned unchanged. Bytes / BytesIO content is
        written to a new ``.wav``-suffixed temp file that is NOT deleted
        automatically; the caller must remove it when done.
        """
        if isinstance(source, Path):
            return source
        if isinstance(source, bytes):
            source = io.BytesIO(source)
        # delete=False so the file outlives the handle and can be reopened
        # by path.
        with tempfile.NamedTemporaryFile(
            suffix=".wav", delete=False,
        ) as tmp:
            tmp.write(source.read())
        return Path(tmp.name)