Rthur2003 committed on
Commit
bc1975b
·
1 Parent(s): bfbcec4

feat: add evaluation framework for AURIS models with metrics computation

Browse files
Files changed (1) hide show
  1. app/training/evaluate.py +198 -0
app/training/evaluate.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Evaluation framework for AURIS models.
3
+
4
+ Measures accuracy, precision, recall, F1, ROC-AUC
5
+ against labeled data. Used for:
6
+ 1. Baseline measurement of heuristic system
7
+ 2. Validation of trained models
8
+ 3. A/B comparison between model versions
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import csv
14
+ import sys
15
+ from pathlib import Path
16
+ from typing import Optional
17
+
18
+ import numpy as np
19
+
20
+ try:
21
+ from sklearn.metrics import (
22
+ accuracy_score,
23
+ precision_score,
24
+ recall_score,
25
+ f1_score,
26
+ roc_auc_score,
27
+ confusion_matrix,
28
+ classification_report,
29
+ )
30
+ except ImportError:
31
+ print("ERROR: scikit-learn required. pip install scikit-learn")
32
+ sys.exit(1)
33
+
34
+
35
def load_features_csv(path: str | Path) -> tuple[np.ndarray, np.ndarray]:
    """
    Load a features CSV into X (features) and y (labels).

    Every header column except ``file_path`` and ``label_int`` is treated
    as a feature, in header order. Loading is best-effort: a feature cell
    that is missing or cannot be parsed as a float is stored as 0.0.

    Args:
        path: Location of the CSV file. The first row is the header.

    Returns:
        X: (n_samples, n_features) float32 array. Always 2-D, also when
            the file contains no data rows.
        y: (n_samples,) int32 array built from the ``label_int`` column.

    Raises:
        KeyError: If a data row is read but there is no ``label_int`` column.
        ValueError: If a ``label_int`` cell is not an integer literal.
    """
    rows = []
    labels = []

    # newline="" is what the csv module expects, so that it can handle
    # line endings (including newlines inside quoted fields) itself.
    with open(path, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        # fieldnames is None when the file is completely empty.
        feature_cols = [
            c for c in (reader.fieldnames or [])
            if c not in ("file_path", "label_int")
        ]

        for row in reader:
            feat_values = []
            for col in feature_cols:
                try:
                    feat_values.append(float(row[col]))
                except (TypeError, ValueError, KeyError):
                    # TypeError: DictReader pads short rows with None.
                    # ValueError: the cell is not a number.
                    feat_values.append(0.0)
            rows.append(feat_values)
            labels.append(int(row["label_int"]))

    # Force the 2-D shape explicitly: with zero rows np.array() would give
    # a 1-D array and X.shape[1] below would fail.
    X = np.array(rows, dtype=np.float32).reshape(len(rows), len(feature_cols))
    y = np.array(labels, dtype=np.int32)

    print(f"Loaded {len(y)} samples, {X.shape[1]} features")
    print(f" AI: {np.sum(y == 1)}, Human: {np.sum(y == 0)}")

    return X, y
70
+
71
+
72
def evaluate_predictions(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    y_prob: Optional[np.ndarray] = None,
    title: str = "Model",
) -> dict:
    """
    Compute and print all evaluation metrics.

    Args:
        y_true: Ground truth labels (0/1).
        y_pred: Predicted labels (0/1).
        y_prob: Predicted probabilities for positive class. When omitted,
            no ROC-AUC is computed and the result has no "roc_auc" key.
        title: Title for the report.

    Returns:
        Dict of metric name -> value, rounded to 4 decimals. Keys are
        "accuracy", "precision", "recall", "f1_score" and, when y_prob is
        given, "roc_auc" (None if scikit-learn raises ValueError for it).
    """
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    metrics = {
        "accuracy": round(acc, 4),
        "precision": round(prec, 4),
        "recall": round(rec, 4),
        "f1_score": round(f1, 4),
    }

    if y_prob is not None:
        try:
            auc = roc_auc_score(y_true, y_prob)
            metrics["roc_auc"] = round(auc, 4)
        except ValueError:
            metrics["roc_auc"] = None

    # Pin the label set so the matrix is always 2x2 (Human=0, AI=1). Without
    # it, data containing a single class yields a 1x1 matrix and the
    # indexing below fails.
    class_labels = [0, 1]
    cm = confusion_matrix(y_true, y_pred, labels=class_labels)

    # Print report
    print(f"\n{'=' * 50}")
    print(f" {title} — Evaluation Report")
    print(f"{'=' * 50}")
    print(f" Accuracy: {acc:.4f} ({acc:.1%})")
    print(f" Precision: {prec:.4f}")
    print(f" Recall: {rec:.4f}")
    print(f" F1 Score: {f1:.4f}")
    if metrics.get("roc_auc") is not None:
        print(f" ROC-AUC: {metrics['roc_auc']:.4f}")

    print("\n Confusion Matrix:")
    print(" Predicted")
    print(" Actual Human AI")
    print(f" Human {cm[0][0]:>6} {cm[0][1]:>6}")
    print(f" AI {cm[1][0]:>6} {cm[1][1]:>6}")

    # Same label pinning here: two target names require two labels, even
    # when one class never occurs in y_true / y_pred.
    report = classification_report(
        y_true,
        y_pred,
        labels=class_labels,
        target_names=['Human', 'AI'],
        zero_division=0,
    )
    print(f"\n{report}")

    return metrics
131
+
132
+
133
def evaluate_heuristic_baseline(features_csv: str | Path) -> dict:
    """
    Evaluate the current heuristic scoring system as baseline.

    The heuristic system uses the 'spectral_regularity',
    'temporal_patterns', 'harmonic_structure' scores
    (which are sigmoid-transformed heuristics) to make
    a weighted average prediction (0.35 / 0.35 / 0.30), thresholded
    at 0.5.

    When the CSV also has 'vocal_ai_score' and 'has_vocals' columns, a
    second variant is evaluated: for rows with has_vocals > 0.5 the score
    is 0.65 * heuristic + 0.35 * vocal_ai_score, otherwise the plain
    heuristic score is used.

    Args:
        features_csv: Path to the features CSV.

    Returns:
        Dict with key "heuristic_only" and, only when the vocal columns
        exist, "heuristic_vocals"; each value is a metrics dict.

    Raises:
        ValueError: If one of the three heuristic score columns is missing.
    """
    X, y = load_features_csv(features_csv)

    # Read feature column names (same filtering as the loader, so the
    # positions line up with the columns of X).
    with open(features_csv, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        feature_cols = [
            c for c in (reader.fieldnames or [])
            if c not in ("file_path", "label_int")
        ]

    required = ("spectral_regularity", "temporal_patterns", "harmonic_structure")
    missing = [c for c in required if c not in feature_cols]
    if missing:
        raise ValueError(
            "features CSV is missing heuristic column(s): " + ", ".join(missing)
        )

    # Find indices of heuristic score columns
    sr_idx, tp_idx, hs_idx = (feature_cols.index(c) for c in required)

    # Current heuristic: weighted average
    heuristic_scores = (
        X[:, sr_idx] * 0.35
        + X[:, tp_idx] * 0.35
        + X[:, hs_idx] * 0.30
    )
    y_pred_heuristic = (heuristic_scores > 0.5).astype(int)

    print("\n" + "=" * 60)
    print(" BASELINE EVALUATION — Current Heuristic System")
    print("=" * 60)

    print("\n--- Heuristic Only (spectral + temporal + harmonic) ---")
    results = {
        "heuristic_only": evaluate_predictions(
            y, y_pred_heuristic, heuristic_scores,
            title="Heuristic (no vocals)",
        )
    }

    # Also try with vocal score, but only if the CSV actually provides it.
    if "vocal_ai_score" in feature_cols and "has_vocals" in feature_cols:
        vai_idx = feature_cols.index("vocal_ai_score")
        has_v_idx = feature_cols.index("has_vocals")

        combined_scores = np.where(
            X[:, has_v_idx] > 0.5,
            heuristic_scores * 0.65 + X[:, vai_idx] * 0.35,
            heuristic_scores,
        )
        y_pred_combined = (combined_scores > 0.5).astype(int)

        print("\n--- Heuristic + Vocal Score ---")
        results["heuristic_vocals"] = evaluate_predictions(
            y, y_pred_combined, combined_scores,
            title="Heuristic + Vocals",
        )
    else:
        print("\n--- Heuristic + Vocal Score skipped (vocal columns not found) ---")

    return results
194
+
195
+
196
if __name__ == "__main__":
    # An optional first command-line argument overrides the default
    # location of the features CSV.
    cli_args = sys.argv[1:]
    if cli_args:
        csv_path = cli_args[0]
    else:
        csv_path = "data/sonics/features.csv"
    evaluate_heuristic_baseline(csv_path)