# crowncode-backend / app /training /evaluate.py
# Rthur2003's picture
# Add Gradio dependency to requirements.txt with version constraints
# 20fe6c3
"""
Evaluation framework for AURIS models.
Measures accuracy, precision, recall, F1, ROC-AUC
against labeled data. Used for:
1. Baseline measurement of heuristic system
2. Validation of trained models
3. A/B comparison between model versions
"""
from __future__ import annotations
import csv
import sys
from pathlib import Path
from typing import Optional
import numpy as np
try:
from sklearn.metrics import (
accuracy_score,
precision_score,
recall_score,
f1_score,
roc_auc_score,
confusion_matrix,
classification_report,
)
except ImportError:
print("ERROR: scikit-learn required. pip install scikit-learn")
sys.exit(1)
def load_features_csv(path: str | Path) -> tuple[np.ndarray, np.ndarray]:
"""
Load features CSV into X (features) and y (labels).
Returns:
X: (n_samples, n_features) array
y: (n_samples,) array of 0/1 labels
"""
rows = []
labels = []
with open(path, "r", encoding="utf-8") as f:
reader = csv.DictReader(f)
_EXCLUDE = {"file_path", "label_int", "duration_sec", "sample_rate"}
feature_cols = [
c for c in reader.fieldnames
if c not in _EXCLUDE
]
for row in reader:
feat_values = []
for col in feature_cols:
try:
feat_values.append(float(row[col]))
except (ValueError, KeyError):
feat_values.append(0.0)
rows.append(feat_values)
labels.append(int(row["label_int"]))
X = np.array(rows, dtype=np.float32)
y = np.array(labels, dtype=np.int32)
print(f"Loaded {len(y)} samples, {X.shape[1]} features")
print(f" AI: {np.sum(y == 1)}, Human: {np.sum(y == 0)}")
return X, y
def evaluate_predictions(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    y_prob: Optional[np.ndarray] = None,
    title: str = "Model",
) -> dict:
    """
    Compute and print all evaluation metrics.

    Labels are treated as binary with 0 = Human and 1 = AI. The confusion
    matrix and the classification report are always laid out for both
    classes, even when one of them is absent from the data.

    Args:
        y_true: Ground truth labels (0/1).
        y_pred: Predicted labels (0/1).
        y_prob: Predicted probabilities for positive class.
        title: Title for the report.

    Returns:
        Dict of metric name -> value, rounded to 4 decimals. Always holds
        "accuracy", "precision", "recall" and "f1_score". "roc_auc" is
        present only when y_prob is given, and is None when
        roc_auc_score raises ValueError.
    """
    class_labels = [0, 1]

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    metrics = {
        "accuracy": round(acc, 4),
        "precision": round(prec, 4),
        "recall": round(rec, 4),
        "f1_score": round(f1, 4),
    }

    if y_prob is not None:
        try:
            auc = roc_auc_score(y_true, y_prob)
            metrics["roc_auc"] = round(auc, 4)
        except ValueError:
            metrics["roc_auc"] = None

    # Pin the label set so the matrix is always 2x2. Without it, data that
    # contains a single class yields a 1x1 matrix and the indexing below
    # raises IndexError.
    cm = confusion_matrix(y_true, y_pred, labels=class_labels)

    # Same reason for labels= here: two target names require two classes.
    # zero_division=0 matches the metric calls above.
    report = classification_report(
        y_true,
        y_pred,
        labels=class_labels,
        target_names=['Human', 'AI'],
        zero_division=0,
    )

    # Print report
    print(f"\n{'=' * 50}")
    print(f" {title} - Evaluation Report")
    print(f"{'=' * 50}")
    print(f" Accuracy: {acc:.4f} ({acc:.1%})")
    print(f" Precision: {prec:.4f}")
    print(f" Recall: {rec:.4f}")
    print(f" F1 Score: {f1:.4f}")
    if "roc_auc" in metrics and metrics["roc_auc"] is not None:
        print(f" ROC-AUC: {metrics['roc_auc']:.4f}")
    print("\n Confusion Matrix:")
    print(" Predicted")
    print(" Actual Human AI")
    print(f" Human {cm[0][0]:>6} {cm[0][1]:>6}")
    print(f" AI {cm[1][0]:>6} {cm[1][1]:>6}")
    print(f"\n{report}")
    return metrics
def evaluate_heuristic_baseline(features_csv: str | Path) -> dict:
    """
    Evaluate the current heuristic scoring system as baseline.

    The heuristic system uses the 'spectral_regularity',
    'temporal_patterns', 'harmonic_structure' scores
    (which are sigmoid-transformed heuristics) to make
    a weighted average prediction (weights 0.35 / 0.35 / 0.30),
    thresholded at 0.5.

    When the CSV also has 'vocal_ai_score' and 'has_vocals' columns, a
    second variant is evaluated: for rows with has_vocals > 0.5 the score
    is 0.65 * heuristic + 0.35 * vocal_ai_score, otherwise the heuristic
    score is used unchanged.

    Args:
        features_csv: Path of the features CSV, as accepted by
            load_features_csv.

    Returns:
        {"heuristic_only": metrics, "heuristic_vocals": metrics or None}.
        "heuristic_vocals" is None when the vocal columns are absent.

    Raises:
        ValueError: If a required heuristic score column is missing, or
            the header does not line up with the loaded feature matrix.
    """
    X, y = load_features_csv(features_csv)

    # Recover the column names of X. load_features_csv drops these four
    # columns, so the very same set has to be dropped here; otherwise
    # every index after a dropped column points at the wrong column of X.
    non_feature_cols = {"file_path", "label_int", "duration_sec", "sample_rate"}
    with open(features_csv, "r", encoding="utf-8", newline="") as f:
        header = csv.DictReader(f).fieldnames or []
    feature_cols = [c for c in header if c not in non_feature_cols]

    if len(feature_cols) != X.shape[1]:
        raise ValueError(
            f"Header of {features_csv} yields {len(feature_cols)} feature "
            f"columns but the loaded matrix has {X.shape[1]}"
        )

    # Find indices of heuristic score columns
    required = ("spectral_regularity", "temporal_patterns", "harmonic_structure")
    missing = [c for c in required if c not in feature_cols]
    if missing:
        raise ValueError(
            f"Features CSV is missing heuristic score column(s): {', '.join(missing)}"
        )
    sr_idx, tp_idx, hs_idx = (feature_cols.index(c) for c in required)

    # Current heuristic: weighted average
    heuristic_scores = (
        X[:, sr_idx] * 0.35
        + X[:, tp_idx] * 0.35
        + X[:, hs_idx] * 0.30
    )
    y_pred_heuristic = (heuristic_scores > 0.5).astype(int)

    print("\n" + "=" * 60)
    print(" BASELINE EVALUATION - Current Heuristic System")
    print("=" * 60)

    print("\n--- Heuristic Only (spectral + temporal + harmonic) ---")
    m1 = evaluate_predictions(
        y, y_pred_heuristic, heuristic_scores,
        title="Heuristic (no vocals)",
    )

    # Also try with vocal score if available
    print("\n--- Heuristic + Vocal Score ---")
    if "vocal_ai_score" in feature_cols and "has_vocals" in feature_cols:
        vai_idx = feature_cols.index("vocal_ai_score")
        has_v_idx = feature_cols.index("has_vocals")
        combined_scores = np.where(
            X[:, has_v_idx] > 0.5,
            heuristic_scores * 0.65 + X[:, vai_idx] * 0.35,
            heuristic_scores,
        )
        y_pred_combined = (combined_scores > 0.5).astype(int)
        m2 = evaluate_predictions(
            y, y_pred_combined, combined_scores,
            title="Heuristic + Vocals",
        )
    else:
        print(" Skipped: 'vocal_ai_score' / 'has_vocals' columns not found")
        m2 = None

    return {"heuristic_only": m1, "heuristic_vocals": m2}
if __name__ == "__main__":
    # Script entry point: the first command-line argument, when given, is
    # the features CSV to evaluate; otherwise the default path is used.
    cli_args = sys.argv[1:]
    if cli_args:
        csv_path = cli_args[0]
    else:
        csv_path = "data/sonics/features.csv"
    evaluate_heuristic_baseline(csv_path)