# Source: Hugging Face Space "Rthur2003/crowncode-backend" (status: Sleeping).
# File size: 5,735 bytes.

"""
Evaluation framework for AURIS models.

Measures accuracy, precision, recall, F1, ROC-AUC
against labeled data. Used for:
  1. Baseline measurement of heuristic system
  2. Validation of trained models
  3. A/B comparison between model versions
"""

from __future__ import annotations

import csv
import sys
from pathlib import Path
from typing import Optional

import numpy as np

try:
    from sklearn.metrics import (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        roc_auc_score,
        confusion_matrix,
        classification_report,
    )
except ImportError:
    print("ERROR: scikit-learn required. pip install scikit-learn")
    sys.exit(1)


def load_features_csv(path: str | Path) -> tuple[np.ndarray, np.ndarray]:
    """
    Load features CSV into X (features) and y (labels).

    Every header column except ``file_path``, ``label_int``,
    ``duration_sec`` and ``sample_rate`` is treated as a feature, in
    header order. A feature cell that is missing (short row) or that
    cannot be parsed as a float is replaced by 0.0.

    Args:
        path: Path of the CSV file. It must start with a header row and
            provide the label of each sample in a ``label_int`` column.

    Returns:
        X: (n_samples, n_features) float32 array
        y: (n_samples,) int32 array of 0/1 labels

    Raises:
        ValueError: If the file has no header row.
        KeyError: If a data row has no ``label_int`` column.
    """
    excluded = {"file_path", "label_int", "duration_sec", "sample_rate"}
    rows: list[list[float]] = []
    labels: list[int] = []

    # newline="" lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(path, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        if reader.fieldnames is None:
            raise ValueError(f"{path}: CSV file is empty (no header row)")
        feature_cols = [c for c in reader.fieldnames if c not in excluded]

        for row in reader:
            feat_values = []
            for col in feature_cols:
                try:
                    feat_values.append(float(row[col]))
                except (TypeError, ValueError, KeyError):
                    # TypeError: DictReader pads short rows with None.
                    # ValueError: the cell is empty or not numeric.
                    feat_values.append(0.0)
            rows.append(feat_values)
            labels.append(int(row["label_int"]))

    # The explicit reshape keeps X two-dimensional even when the file has
    # no data rows, so X.shape[1] is always the number of features.
    X = np.array(rows, dtype=np.float32).reshape(len(rows), len(feature_cols))
    y = np.array(labels, dtype=np.int32)

    print(f"Loaded {len(y)} samples, {X.shape[1]} features")
    print(f"  AI: {np.sum(y == 1)}, Human: {np.sum(y == 0)}")

    return X, y


def evaluate_predictions(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    y_prob: Optional[np.ndarray] = None,
    title: str = "Model",
) -> dict:
    """
    Compute and print all evaluation metrics.

    Class 0 is reported as "Human" and class 1 as "AI". The confusion
    matrix and the classification report are always built for both
    classes, so the report also works when the data or the predictions
    contain only one of them.

    Args:
        y_true: Ground truth labels (0/1).
        y_pred: Predicted labels (0/1).
        y_prob: Predicted probabilities for positive class.
        title: Title for the report.

    Returns:
        Dict of metric name -> value with the keys ``accuracy``,
        ``precision``, ``recall`` and ``f1_score`` (rounded to 4
        decimals). When ``y_prob`` is given, ``roc_auc`` is added; it is
        None if the score cannot be computed.
    """
    class_labels = [0, 1]

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    # Plain Python floats keep the result serializable (e.g. with json).
    metrics = {
        "accuracy": round(float(acc), 4),
        "precision": round(float(prec), 4),
        "recall": round(float(rec), 4),
        "f1_score": round(float(f1), 4),
    }

    if y_prob is not None:
        try:
            auc = float(roc_auc_score(y_true, y_prob))
        except ValueError:
            # Raised e.g. when y_true contains a single class only.
            auc = float("nan")
        # A NaN score is reported as "not available" instead of being
        # printed as "nan".
        metrics["roc_auc"] = None if np.isnan(auc) else round(auc, 4)

    # Fixing the labels guarantees a 2x2 matrix even if one class is
    # absent from both y_true and y_pred.
    cm = confusion_matrix(y_true, y_pred, labels=class_labels)

    # Print report
    print(f"\n{'=' * 50}")
    print(f"  {title} - Evaluation Report")
    print(f"{'=' * 50}")
    print(f"  Accuracy:  {acc:.4f} ({acc:.1%})")
    print(f"  Precision: {prec:.4f}")
    print(f"  Recall:    {rec:.4f}")
    print(f"  F1 Score:  {f1:.4f}")
    if metrics.get("roc_auc") is not None:
        print(f"  ROC-AUC:   {metrics['roc_auc']:.4f}")

    print(f"\n  Confusion Matrix:")
    print(f"                 Predicted")
    print(f"  Actual    Human    AI")
    print(f"  Human   {cm[0][0]:>6}  {cm[0][1]:>6}")
    print(f"  AI      {cm[1][0]:>6}  {cm[1][1]:>6}")

    # labels must be passed together with target_names; otherwise the
    # report fails when only one class is present.
    report = classification_report(
        y_true,
        y_pred,
        labels=class_labels,
        target_names=["Human", "AI"],
        zero_division=0,
    )
    print(f"\n{report}")

    return metrics


def evaluate_heuristic_baseline(features_csv: str | Path) -> dict:
    """
    Evaluate the current heuristic scoring system as baseline.

    The heuristic system uses the 'spectral_regularity',
    'temporal_patterns', 'harmonic_structure' scores
    (which are sigmoid-transformed heuristics) to make
    a weighted average prediction (weights 0.35 / 0.35 / 0.30,
    decision threshold 0.5).

    If the CSV also has the 'vocal_ai_score' and 'has_vocals' columns, a
    second variant is evaluated: for samples with has_vocals > 0.5 the
    score is 0.65 * heuristic + 0.35 * vocal_ai_score.

    Args:
        features_csv: Path of the features CSV file.

    Returns:
        Dict with the metrics of both variants under the keys
        ``heuristic_only`` and ``heuristic_vocals``. ``heuristic_vocals``
        is None when the vocal columns are not in the CSV.

    Raises:
        ValueError: If one of the three heuristic score columns is
            missing from the CSV.
    """
    X, y = load_features_csv(features_csv)

    # The column positions must match the columns of X, so the header is
    # filtered with the same exclusion set that load_features_csv applies.
    # Filtering with a smaller set shifts every index after
    # 'duration_sec' / 'sample_rate' and silently scores wrong columns.
    non_feature_cols = {"file_path", "label_int", "duration_sec", "sample_rate"}
    with open(features_csv, "r", encoding="utf-8", newline="") as f:
        header = csv.DictReader(f).fieldnames or []
    feature_cols = [c for c in header if c not in non_feature_cols]

    required = ("spectral_regularity", "temporal_patterns", "harmonic_structure")
    missing = [c for c in required if c not in feature_cols]
    if missing:
        raise ValueError(
            f"{features_csv}: missing heuristic score column(s): "
            + ", ".join(missing)
        )

    # Find indices of heuristic score columns
    sr_idx = feature_cols.index("spectral_regularity")
    tp_idx = feature_cols.index("temporal_patterns")
    hs_idx = feature_cols.index("harmonic_structure")

    # Current heuristic: weighted average
    heuristic_scores = (
        X[:, sr_idx] * 0.35
        + X[:, tp_idx] * 0.35
        + X[:, hs_idx] * 0.30
    )
    y_pred_heuristic = (heuristic_scores > 0.5).astype(int)

    # Also try with vocal score if available
    has_vocal_cols = (
        "vocal_ai_score" in feature_cols and "has_vocals" in feature_cols
    )
    if has_vocal_cols:
        vai_idx = feature_cols.index("vocal_ai_score")
        has_v_idx = feature_cols.index("has_vocals")
        # Blend in the vocal score only for samples that have vocals.
        combined_scores = np.where(
            X[:, has_v_idx] > 0.5,
            heuristic_scores * 0.65 + X[:, vai_idx] * 0.35,
            heuristic_scores,
        )
        y_pred_combined = (combined_scores > 0.5).astype(int)

    print("\n" + "=" * 60)
    print("  BASELINE EVALUATION - Current Heuristic System")
    print("=" * 60)

    print("\n--- Heuristic Only (spectral + temporal + harmonic) ---")
    results = {
        "heuristic_only": evaluate_predictions(
            y, y_pred_heuristic, heuristic_scores,
            title="Heuristic (no vocals)",
        ),
        "heuristic_vocals": None,
    }

    if has_vocal_cols:
        print("\n--- Heuristic + Vocal Score ---")
        results["heuristic_vocals"] = evaluate_predictions(
            y, y_pred_combined, combined_scores,
            title="Heuristic + Vocals",
        )
    else:
        print("\n--- Heuristic + Vocal Score: skipped (vocal columns not in CSV) ---")

    return results


if __name__ == "__main__":
    # The first command-line argument selects the features CSV; without
    # one, the default dataset location is used.
    cli_args = sys.argv[1:]
    if cli_args:
        csv_path = cli_args[0]
    else:
        csv_path = "data/sonics/features.csv"
    evaluate_heuristic_baseline(csv_path)