Spaces:

Rthur2003
/

crowncode-backend

Sleeping

App Files Files Community

Rthur2003 committed on Mar 28

Commit

bc1975b

1 Parent(s): bfbcec4

feat: add evaluation framework for AURIS models with metrics computation

Browse files

Files changed (1) hide show

app/training/evaluate.py +198 -0

app/training/evaluate.py ADDED Viewed

	@@ -0,0 +1,198 @@

+"""
+Evaluation framework for AURIS models.
+Measures accuracy, precision, recall, F1, ROC-AUC
+against labeled data. Used for:
+  1. Baseline measurement of heuristic system
+  2. Validation of trained models
+  3. A/B comparison between model versions
+"""
+from __future__ import annotations
+import csv
+import sys
+from pathlib import Path
+from typing import Optional
+import numpy as np
+try:
+    from sklearn.metrics import (
+        accuracy_score,
+        precision_score,
+        recall_score,
+        f1_score,
+        roc_auc_score,
+        confusion_matrix,
+        classification_report,
+    )
+except ImportError:
+    print("ERROR: scikit-learn required. pip install scikit-learn")
+    sys.exit(1)
+def load_features_csv(path: str | Path) -> tuple[np.ndarray, np.ndarray]:
+    """
+    Load features CSV into X (features) and y (labels).
+    Returns:
+        X: (n_samples, n_features) array
+        y: (n_samples,) array of 0/1 labels
+    """
+    rows = []
+    labels = []
+    with open(path, "r", encoding="utf-8") as f:
+        reader = csv.DictReader(f)
+        feature_cols = [
+            c for c in reader.fieldnames
+            if c not in ("file_path", "label_int")
+        ]
+        for row in reader:
+            feat_values = []
+            for col in feature_cols:
+                try:
+                    feat_values.append(float(row[col]))
+                except (ValueError, KeyError):
+                    feat_values.append(0.0)
+            rows.append(feat_values)
+            labels.append(int(row["label_int"]))
+    X = np.array(rows, dtype=np.float32)
+    y = np.array(labels, dtype=np.int32)
+    print(f"Loaded {len(y)} samples, {X.shape[1]} features")
+    print(f"  AI: {np.sum(y == 1)}, Human: {np.sum(y == 0)}")
+    return X, y
+def evaluate_predictions(
+    y_true: np.ndarray,
+    y_pred: np.ndarray,
+    y_prob: Optional[np.ndarray] = None,
+    title: str = "Model",
+) -> dict:
+    """
+    Compute and print all evaluation metrics.
+    Args:
+        y_true: Ground truth labels (0/1).
+        y_pred: Predicted labels (0/1).
+        y_prob: Predicted probabilities for positive class.
+        title: Title for the report.
+    Returns:
+        Dict of metric name -> value.
+    """
+    acc = accuracy_score(y_true, y_pred)
+    prec = precision_score(y_true, y_pred, zero_division=0)
+    rec = recall_score(y_true, y_pred, zero_division=0)
+    f1 = f1_score(y_true, y_pred, zero_division=0)
+    metrics = {
+        "accuracy": round(acc, 4),
+        "precision": round(prec, 4),
+        "recall": round(rec, 4),
+        "f1_score": round(f1, 4),
+    }
+    if y_prob is not None:
+        try:
+            auc = roc_auc_score(y_true, y_prob)
+            metrics["roc_auc"] = round(auc, 4)
+        except ValueError:
+            metrics["roc_auc"] = None
+    cm = confusion_matrix(y_true, y_pred)
+    # Print report
+    print(f"\n{'=' * 50}")
+    print(f"  {title} — Evaluation Report")
+    print(f"{'=' * 50}")
+    print(f"  Accuracy:  {acc:.4f} ({acc:.1%})")
+    print(f"  Precision: {prec:.4f}")
+    print(f"  Recall:    {rec:.4f}")
+    print(f"  F1 Score:  {f1:.4f}")
+    if "roc_auc" in metrics and metrics["roc_auc"] is not None:
+        print(f"  ROC-AUC:   {metrics['roc_auc']:.4f}")
+    print(f"\n  Confusion Matrix:")
+    print(f"                 Predicted")
+    print(f"  Actual    Human    AI")
+    print(f"  Human   {cm[0][0]:>6}  {cm[0][1]:>6}")
+    print(f"  AI      {cm[1][0]:>6}  {cm[1][1]:>6}")
+    print(f"\n{classification_report(y_true, y_pred, target_names=['Human', 'AI'])}")
+    return metrics
+def evaluate_heuristic_baseline(features_csv: str | Path) -> dict:
+    """
+    Evaluate the current heuristic scoring system as baseline.
+    The heuristic system uses the 'spectral_regularity',
+    'temporal_patterns', 'harmonic_structure' scores
+    (which are sigmoid-transformed heuristics) to make
+    a weighted average prediction.
+    """
+    X, y = load_features_csv(features_csv)
+    # Read feature column names
+    with open(features_csv, "r", encoding="utf-8") as f:
+        reader = csv.DictReader(f)
+        feature_cols = [
+            c for c in reader.fieldnames
+            if c not in ("file_path", "label_int")
+        ]
+    # Find indices of heuristic score columns
+    sr_idx = feature_cols.index("spectral_regularity")
+    tp_idx = feature_cols.index("temporal_patterns")
+    hs_idx = feature_cols.index("harmonic_structure")
+    # Current heuristic: weighted average
+    heuristic_scores = (
+        X[:, sr_idx] * 0.35
+        + X[:, tp_idx] * 0.35
+        + X[:, hs_idx] * 0.30
+    )
+    # Also try with vocal score if available
+    vai_idx = feature_cols.index("vocal_ai_score")
+    has_v_idx = feature_cols.index("has_vocals")
+    combined_scores = np.where(
+        X[:, has_v_idx] > 0.5,
+        heuristic_scores * 0.65 + X[:, vai_idx] * 0.35,
+        heuristic_scores,
+    )
+    y_pred_heuristic = (heuristic_scores > 0.5).astype(int)
+    y_pred_combined = (combined_scores > 0.5).astype(int)
+    print("\n" + "=" * 60)
+    print("  BASELINE EVALUATION — Current Heuristic System")
+    print("=" * 60)
+    print("\n--- Heuristic Only (spectral + temporal + harmonic) ---")
+    m1 = evaluate_predictions(
+        y, y_pred_heuristic, heuristic_scores,
+        title="Heuristic (no vocals)",
+    )
+    print("\n--- Heuristic + Vocal Score ---")
+    m2 = evaluate_predictions(
+        y, y_pred_combined, combined_scores,
+        title="Heuristic + Vocals",
+    )
+    return {"heuristic_only": m1, "heuristic_vocals": m2}
+if __name__ == "__main__":
+    csv_path = sys.argv[1] if len(sys.argv) > 1 else "data/sonics/features.csv"
+    evaluate_heuristic_baseline(csv_path)