Spaces:
Sleeping
Sleeping
File size: 5,735 Bytes
bc1975b 4b87da0 bc1975b 4b87da0 bc1975b 20fe6c3 bc1975b 20fe6c3 bc1975b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 | """
Evaluation framework for AURIS models.
Measures accuracy, precision, recall, F1, ROC-AUC
against labeled data. Used for:
1. Baseline measurement of heuristic system
2. Validation of trained models
3. A/B comparison between model versions
"""
from __future__ import annotations
import csv
import sys
from pathlib import Path
from typing import Optional
import numpy as np
try:
from sklearn.metrics import (
accuracy_score,
precision_score,
recall_score,
f1_score,
roc_auc_score,
confusion_matrix,
classification_report,
)
except ImportError:
print("ERROR: scikit-learn required. pip install scikit-learn")
sys.exit(1)
def load_features_csv(path: str | Path) -> tuple[np.ndarray, np.ndarray]:
    """
    Load features CSV into X (features) and y (labels).

    Every column except file_path, label_int, duration_sec and sample_rate
    is treated as a feature, in header order. Cells that are missing or
    cannot be parsed as a float are replaced with 0.0 (best-effort loading).

    Args:
        path: Path of the CSV file; it must start with a header row and
            contain a 'label_int' column.

    Returns:
        X: (n_samples, n_features) float32 array
        y: (n_samples,) int32 array of labels read from 'label_int'
           (1 is reported as AI, 0 as Human)

    Raises:
        ValueError: If the file has no header row, or a label is not an int.
        KeyError: If a data row has no 'label_int' column.
    """
    excluded = {"file_path", "label_int", "duration_sec", "sample_rate"}
    rows: list[list[float]] = []
    labels: list[int] = []

    with open(path, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        if reader.fieldnames is None:
            # An empty file has no header, so there is nothing to name columns by.
            raise ValueError(f"Features CSV has no header row: {path}")
        feature_cols = [c for c in reader.fieldnames if c not in excluded]

        for row in reader:
            feat_values = []
            for col in feature_cols:
                try:
                    feat_values.append(float(row[col]))
                except (TypeError, ValueError, KeyError):
                    # TypeError: DictReader pads short rows with None.
                    # ValueError: the cell is empty or not numeric.
                    feat_values.append(0.0)
            rows.append(feat_values)
            labels.append(int(row["label_int"]))

    # Explicit reshape keeps X two-dimensional even when the file has a
    # header but no data rows (np.array([]) alone would be 1-D).
    X = np.array(rows, dtype=np.float32).reshape(len(rows), len(feature_cols))
    y = np.array(labels, dtype=np.int32)
    print(f"Loaded {len(y)} samples, {X.shape[1]} features")
    print(f" AI: {np.sum(y == 1)}, Human: {np.sum(y == 0)}")
    return X, y
def evaluate_predictions(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    y_prob: Optional[np.ndarray] = None,
    title: str = "Model",
) -> dict:
    """
    Compute and print all evaluation metrics.

    Args:
        y_true: Ground truth labels (0/1).
        y_pred: Predicted labels (0/1).
        y_prob: Predicted probabilities for positive class.
        title: Title for the report.

    Returns:
        Dict of metric name -> value, rounded to 4 decimals. Keys are
        'accuracy', 'precision', 'recall' and 'f1_score'; 'roc_auc' is added
        only when y_prob is given and is None if scikit-learn raises
        ValueError while computing it.
    """
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    metrics = {
        "accuracy": round(acc, 4),
        "precision": round(prec, 4),
        "recall": round(rec, 4),
        "f1_score": round(f1, 4),
    }

    if y_prob is not None:
        try:
            auc = roc_auc_score(y_true, y_prob)
            metrics["roc_auc"] = round(auc, 4)
        except ValueError:
            metrics["roc_auc"] = None

    # Pin the label set to [Human=0, AI=1]. Without it scikit-learn infers the
    # labels from the data, so a split containing a single class yields a 1x1
    # matrix (IndexError below) and a target_names mismatch in the report.
    class_labels = [0, 1]
    cm = confusion_matrix(y_true, y_pred, labels=class_labels)

    # Print report
    print(f"\n{'=' * 50}")
    print(f" {title} - Evaluation Report")
    print(f"{'=' * 50}")
    print(f" Accuracy: {acc:.4f} ({acc:.1%})")
    print(f" Precision: {prec:.4f}")
    print(f" Recall: {rec:.4f}")
    print(f" F1 Score: {f1:.4f}")
    if "roc_auc" in metrics and metrics["roc_auc"] is not None:
        print(f" ROC-AUC: {metrics['roc_auc']:.4f}")
    print(f"\n Confusion Matrix:")
    print(f" Predicted")
    print(f" Actual Human AI")
    print(f" Human {cm[0][0]:>6} {cm[0][1]:>6}")
    print(f" AI {cm[1][0]:>6} {cm[1][1]:>6}")

    # zero_division=0 matches the scalar metrics above and silences the
    # undefined-metric warning for a class that never occurs.
    report = classification_report(
        y_true,
        y_pred,
        labels=class_labels,
        target_names=['Human', 'AI'],
        zero_division=0,
    )
    print(f"\n{report}")
    return metrics
def evaluate_heuristic_baseline(features_csv: str | Path) -> dict:
    """
    Evaluate the current heuristic scoring system as baseline.

    The heuristic system uses the 'spectral_regularity',
    'temporal_patterns', 'harmonic_structure' scores
    (which are sigmoid-transformed heuristics) to make
    a weighted average prediction (0.35 / 0.35 / 0.30), thresholded at 0.5.
    When the 'vocal_ai_score' and 'has_vocals' columns are both present, a
    second variant blends in the vocal score (65% heuristic, 35% vocal) for
    rows where has_vocals > 0.5.

    Args:
        features_csv: Path of the features CSV to evaluate.

    Returns:
        Dict with keys 'heuristic_only' and 'heuristic_vocals', each holding
        the metrics dict returned by evaluate_predictions. Without the vocal
        columns, 'heuristic_vocals' is computed from the plain heuristic score.

    Raises:
        ValueError: If any of the three heuristic score columns is missing.
    """
    X, y = load_features_csv(features_csv)

    # Column positions must be computed over exactly the columns that
    # load_features_csv keeps; excluding fewer columns here would shift every
    # index after 'duration_sec' / 'sample_rate' and read the wrong feature.
    non_feature_cols = {"file_path", "label_int", "duration_sec", "sample_rate"}
    with open(features_csv, "r", encoding="utf-8") as f:
        header = csv.DictReader(f).fieldnames or []
    feature_cols = [c for c in header if c not in non_feature_cols]

    required = ("spectral_regularity", "temporal_patterns", "harmonic_structure")
    missing = [c for c in required if c not in feature_cols]
    if missing:
        raise ValueError(
            f"Features CSV is missing heuristic column(s): {', '.join(missing)}"
        )

    # Find indices of heuristic score columns
    sr_idx = feature_cols.index("spectral_regularity")
    tp_idx = feature_cols.index("temporal_patterns")
    hs_idx = feature_cols.index("harmonic_structure")

    # Current heuristic: weighted average
    heuristic_scores = (
        X[:, sr_idx] * 0.35
        + X[:, tp_idx] * 0.35
        + X[:, hs_idx] * 0.30
    )

    # Also try with vocal score if available
    vocals_available = (
        "vocal_ai_score" in feature_cols and "has_vocals" in feature_cols
    )
    if vocals_available:
        vai_idx = feature_cols.index("vocal_ai_score")
        has_v_idx = feature_cols.index("has_vocals")
        combined_scores = np.where(
            X[:, has_v_idx] > 0.5,
            heuristic_scores * 0.65 + X[:, vai_idx] * 0.35,
            heuristic_scores,
        )
    else:
        # Same result as if no row had vocals: nothing to blend in.
        combined_scores = heuristic_scores

    y_pred_heuristic = (heuristic_scores > 0.5).astype(int)
    y_pred_combined = (combined_scores > 0.5).astype(int)

    print("\n" + "=" * 60)
    print(" BASELINE EVALUATION - Current Heuristic System")
    print("=" * 60)
    print("\n--- Heuristic Only (spectral + temporal + harmonic) ---")
    m1 = evaluate_predictions(
        y, y_pred_heuristic, heuristic_scores,
        title="Heuristic (no vocals)",
    )
    print("\n--- Heuristic + Vocal Score ---")
    if not vocals_available:
        print(" (vocal columns not found - using heuristic score only)")
    m2 = evaluate_predictions(
        y, y_pred_combined, combined_scores,
        title="Heuristic + Vocals",
    )
    return {"heuristic_only": m1, "heuristic_vocals": m2}
if __name__ == "__main__":
    # Script entry point: evaluate the heuristic baseline on the CSV named by
    # the first command-line argument, or on the default features file.
    cli_args = sys.argv[1:]
    if cli_args:
        csv_path = cli_args[0]
    else:
        csv_path = "data/sonics/features.csv"
    evaluate_heuristic_baseline(csv_path)
|