Spaces:
Sleeping
Sleeping
| """ | |
| Evaluation framework for AURIS models. | |
| Measures accuracy, precision, recall, F1, ROC-AUC | |
| against labeled data. Used for: | |
| 1. Baseline measurement of heuristic system | |
| 2. Validation of trained models | |
| 3. A/B comparison between model versions | |
| """ | |
| from __future__ import annotations | |
| import csv | |
| import sys | |
| from pathlib import Path | |
| from typing import Optional | |
| import numpy as np | |
| try: | |
| from sklearn.metrics import ( | |
| accuracy_score, | |
| precision_score, | |
| recall_score, | |
| f1_score, | |
| roc_auc_score, | |
| confusion_matrix, | |
| classification_report, | |
| ) | |
| except ImportError: | |
| print("ERROR: scikit-learn required. pip install scikit-learn") | |
| sys.exit(1) | |
def load_features_csv(path: str | Path) -> tuple[np.ndarray, np.ndarray]:
    """
    Load a features CSV into X (features) and y (labels).

    Every header column except the metadata columns ``file_path``,
    ``label_int``, ``duration_sec`` and ``sample_rate`` is treated as a
    feature, in header order. Cells that are missing (short rows) or that
    cannot be parsed as a float are replaced with 0.0, and the number of
    replacements is printed so the substitution is not silent.

    Args:
        path: Location of the CSV file. It needs a header row and a
            ``label_int`` column (1 is counted as AI, 0 as Human).

    Returns:
        X: (n_samples, n_features) float32 array. Always 2-D, even when
            the file has no data rows.
        y: (n_samples,) int32 array of labels read from ``label_int``.

    Raises:
        KeyError: If a data row is read but the header has no
            ``label_int`` column.
        ValueError, TypeError: If a ``label_int`` cell is not an integer
            literal or is missing from a short row.
    """
    excluded = {"file_path", "label_int", "duration_sec", "sample_rate"}
    rows: list[list[float]] = []
    labels: list[int] = []
    n_replaced = 0

    # newline="" is what the csv module expects so that quoted fields with
    # embedded newlines are parsed correctly.
    with open(path, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        # fieldnames is None for a completely empty file.
        feature_cols = [
            c for c in (reader.fieldnames or [])
            if c not in excluded
        ]
        for row in reader:
            feat_values = []
            for col in feature_cols:
                try:
                    feat_values.append(float(row[col]))
                except (TypeError, ValueError, KeyError):
                    # TypeError: DictReader pads short rows with None.
                    # ValueError: the cell text is not a number.
                    feat_values.append(0.0)
                    n_replaced += 1
            rows.append(feat_values)
            labels.append(int(row["label_int"]))

    # Reshape explicitly so X keeps two dimensions when there are no rows;
    # np.array([]) alone would be 1-D and X.shape[1] would fail.
    X = np.array(rows, dtype=np.float32).reshape(len(rows), len(feature_cols))
    y = np.array(labels, dtype=np.int32)

    print(f"Loaded {len(y)} samples, {X.shape[1]} features")
    print(f" AI: {np.sum(y == 1)}, Human: {np.sum(y == 0)}")
    if n_replaced:
        print(f" Warning: {n_replaced} missing or non-numeric value(s) replaced with 0.0")
    return X, y
def evaluate_predictions(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    y_prob: Optional[np.ndarray] = None,
    title: str = "Model",
) -> dict:
    """
    Compute and print all evaluation metrics.

    Accuracy, precision, recall and F1 are always computed; ROC-AUC is
    added when ``y_prob`` is supplied. A text report with the confusion
    matrix and the per-class classification report is printed to stdout.

    Args:
        y_true: Ground truth labels (0 = Human, 1 = AI).
        y_pred: Predicted labels (0/1).
        y_prob: Predicted scores/probabilities for the positive class.
            When omitted, no ``roc_auc`` entry is produced.
        title: Title for the report.

    Returns:
        Dict of metric name -> value rounded to 4 decimals, with keys
        ``accuracy``, ``precision``, ``recall``, ``f1_score`` and, when
        ``y_prob`` was given, ``roc_auc`` (``None`` if scikit-learn could
        not compute it and raised ``ValueError``).
    """
    # Pin the label set so the confusion matrix is always 2x2 and the
    # classification report always has both rows, even when the data or
    # the predictions contain only one of the two classes.
    class_labels = [0, 1]
    class_names = ["Human", "AI"]

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    metrics = {
        "accuracy": round(acc, 4),
        "precision": round(prec, 4),
        "recall": round(rec, 4),
        "f1_score": round(f1, 4),
    }

    if y_prob is not None:
        try:
            auc = roc_auc_score(y_true, y_prob)
            metrics["roc_auc"] = round(auc, 4)
        except ValueError:
            # Raised by scikit-learn when AUC is undefined for the input;
            # record that explicitly instead of aborting the report.
            metrics["roc_auc"] = None

    cm = confusion_matrix(y_true, y_pred, labels=class_labels)

    # Print report
    print(f"\n{'=' * 50}")
    print(f" {title} - Evaluation Report")
    print(f"{'=' * 50}")
    print(f" Accuracy: {acc:.4f} ({acc:.1%})")
    print(f" Precision: {prec:.4f}")
    print(f" Recall: {rec:.4f}")
    print(f" F1 Score: {f1:.4f}")
    if metrics.get("roc_auc") is not None:
        print(f" ROC-AUC: {metrics['roc_auc']:.4f}")

    print(f"\n Confusion Matrix:")
    print(f" Predicted")
    print(f" Actual Human AI")
    print(f" Human {cm[0][0]:>6} {cm[0][1]:>6}")
    print(f" AI {cm[1][0]:>6} {cm[1][1]:>6}")

    report = classification_report(
        y_true,
        y_pred,
        labels=class_labels,
        target_names=class_names,
        zero_division=0,
    )
    print(f"\n{report}")
    return metrics
def evaluate_heuristic_baseline(features_csv: str | Path) -> dict:
    """
    Evaluate the current heuristic scoring system as baseline.

    The heuristic prediction is a weighted average of the
    'spectral_regularity' (0.35), 'temporal_patterns' (0.35) and
    'harmonic_structure' (0.30) columns (described by the original author
    as sigmoid-transformed heuristic scores), thresholded at 0.5.

    When the optional 'vocal_ai_score' and 'has_vocals' columns are both
    present, a second variant is evaluated: for rows with
    ``has_vocals > 0.5`` the score becomes
    ``0.65 * heuristic + 0.35 * vocal_ai_score``; other rows keep the
    plain heuristic score.

    Args:
        features_csv: Path to the features CSV, in the format read by
            ``load_features_csv``.

    Returns:
        ``{"heuristic_only": metrics, "heuristic_vocals": metrics_or_None}``
        where each metrics value is the dict returned by
        ``evaluate_predictions``. ``heuristic_vocals`` is ``None`` when the
        vocal columns are not in the CSV.

    Raises:
        ValueError: If any of the three required heuristic score columns
            is missing from the CSV header.
    """
    X, y = load_features_csv(features_csv)

    # Column positions are only valid if the header is filtered with the
    # same exclusion set load_features_csv uses to build X; excluding fewer
    # columns here would shift every index after the extra columns.
    non_feature_cols = {"file_path", "label_int", "duration_sec", "sample_rate"}
    with open(features_csv, "r", encoding="utf-8", newline="") as f:
        header = csv.DictReader(f).fieldnames or []
    feature_cols = [c for c in header if c not in non_feature_cols]

    required = ("spectral_regularity", "temporal_patterns", "harmonic_structure")
    missing = [c for c in required if c not in feature_cols]
    if missing:
        raise ValueError(
            f"Features CSV is missing heuristic column(s): {', '.join(missing)}"
        )

    # Find indices of heuristic score columns
    sr_idx = feature_cols.index("spectral_regularity")
    tp_idx = feature_cols.index("temporal_patterns")
    hs_idx = feature_cols.index("harmonic_structure")

    # Current heuristic: weighted average
    heuristic_scores = (
        X[:, sr_idx] * 0.35
        + X[:, tp_idx] * 0.35
        + X[:, hs_idx] * 0.30
    )
    y_pred_heuristic = (heuristic_scores > 0.5).astype(int)

    # Also try with vocal score, but only if both vocal columns exist.
    combined_scores = None
    if "vocal_ai_score" in feature_cols and "has_vocals" in feature_cols:
        vai_idx = feature_cols.index("vocal_ai_score")
        has_v_idx = feature_cols.index("has_vocals")
        combined_scores = np.where(
            X[:, has_v_idx] > 0.5,
            heuristic_scores * 0.65 + X[:, vai_idx] * 0.35,
            heuristic_scores,
        )

    print("\n" + "=" * 60)
    print(" BASELINE EVALUATION - Current Heuristic System")
    print("=" * 60)

    print("\n--- Heuristic Only (spectral + temporal + harmonic) ---")
    m1 = evaluate_predictions(
        y, y_pred_heuristic, heuristic_scores,
        title="Heuristic (no vocals)",
    )

    print("\n--- Heuristic + Vocal Score ---")
    if combined_scores is not None:
        y_pred_combined = (combined_scores > 0.5).astype(int)
        m2 = evaluate_predictions(
            y, y_pred_combined, combined_scores,
            title="Heuristic + Vocals",
        )
    else:
        print(" Skipped: 'vocal_ai_score' and/or 'has_vocals' column not found")
        m2 = None

    return {"heuristic_only": m1, "heuristic_vocals": m2}
if __name__ == "__main__":
    # Script entry point: the first command-line argument, when given, is
    # the features CSV to evaluate; otherwise the default dataset is used.
    cli_args = sys.argv[1:]
    if cli_args:
        csv_path = cli_args[0]
    else:
        csv_path = "data/sonics/features.csv"
    evaluate_heuristic_baseline(csv_path)