Spaces:

Rthur2003
/

crowncode-backend

Sleeping

App Files Files Community

crowncode-backend / app /training /evaluate.py

Rthur2003

Add Gradio dependency to requirements.txt with version constraints

20fe6c3 about 1 month ago

raw

history blame contribute delete

5.74 kB

	"""
	Evaluation framework for AURIS models.

	Measures accuracy, precision, recall, F1, ROC-AUC
	against labeled data. Used for:
	1. Baseline measurement of heuristic system
	2. Validation of trained models
	3. A/B comparison between model versions
	"""

	from __future__ import annotations

	import csv
	import sys
	from pathlib import Path
	from typing import Optional

	import numpy as np

	try:
	from sklearn.metrics import (
	accuracy_score,
	precision_score,
	recall_score,
	f1_score,
	roc_auc_score,
	confusion_matrix,
	classification_report,
	)
	except ImportError:
	print("ERROR: scikit-learn required. pip install scikit-learn")
	sys.exit(1)


	def load_features_csv(path: str \| Path) -> tuple[np.ndarray, np.ndarray]:
	"""
	Load features CSV into X (features) and y (labels).

	Returns:
	X: (n_samples, n_features) array
	y: (n_samples,) array of 0/1 labels
	"""
	rows = []
	labels = []

	with open(path, "r", encoding="utf-8") as f:
	reader = csv.DictReader(f)
	_EXCLUDE = {"file_path", "label_int", "duration_sec", "sample_rate"}
	feature_cols = [
	c for c in reader.fieldnames
	if c not in _EXCLUDE
	]

	for row in reader:
	feat_values = []
	for col in feature_cols:
	try:
	feat_values.append(float(row[col]))
	except (ValueError, KeyError):
	feat_values.append(0.0)
	rows.append(feat_values)
	labels.append(int(row["label_int"]))

	X = np.array(rows, dtype=np.float32)
	y = np.array(labels, dtype=np.int32)

	print(f"Loaded {len(y)} samples, {X.shape[1]} features")
	print(f" AI: {np.sum(y == 1)}, Human: {np.sum(y == 0)}")

	return X, y


	def evaluate_predictions(
	    y_true: np.ndarray,
	    y_pred: np.ndarray,
	    y_prob: Optional[np.ndarray] = None,
	    title: str = "Model",
	) -> dict:
	    """
	    Compute and print all evaluation metrics.

	    Args:
	        y_true: Ground truth labels (0/1).
	        y_pred: Predicted labels (0/1).
	        y_prob: Predicted probabilities for positive class. When omitted,
	            no ROC-AUC is computed and the result has no "roc_auc" key.
	        title: Title for the report.

	    Returns:
	        Dict of metric name -> value, rounded to 4 decimals: "accuracy",
	        "precision", "recall", "f1_score" and, when ``y_prob`` is given,
	        "roc_auc" (None if scikit-learn rejects the input with ValueError).
	    """
	    acc = accuracy_score(y_true, y_pred)
	    prec = precision_score(y_true, y_pred, zero_division=0)
	    rec = recall_score(y_true, y_pred, zero_division=0)
	    f1 = f1_score(y_true, y_pred, zero_division=0)

	    metrics = {
	        "accuracy": round(acc, 4),
	        "precision": round(prec, 4),
	        "recall": round(rec, 4),
	        "f1_score": round(f1, 4),
	    }

	    if y_prob is not None:
	        try:
	            metrics["roc_auc"] = round(roc_auc_score(y_true, y_prob), 4)
	        except ValueError:
	            metrics["roc_auc"] = None

	    # Pin both classes: without labels=[0, 1] a data set in which only one
	    # class occurs yields a 1x1 matrix and the cm[0][1] / cm[1][...]
	    # lookups below raise IndexError.
	    class_labels = [0, 1]
	    cm = confusion_matrix(y_true, y_pred, labels=class_labels)

	    # Print report
	    print(f"\n{'=' * 50}")
	    print(f" {title} - Evaluation Report")
	    print(f"{'=' * 50}")
	    print(f" Accuracy: {acc:.4f} ({acc:.1%})")
	    print(f" Precision: {prec:.4f}")
	    print(f" Recall: {rec:.4f}")
	    print(f" F1 Score: {f1:.4f}")
	    if metrics.get("roc_auc") is not None:
	        print(f" ROC-AUC: {metrics['roc_auc']:.4f}")

	    print("\n Confusion Matrix:")
	    print(" Predicted")
	    print(" Actual Human AI")
	    print(f" Human {cm[0][0]:>6} {cm[0][1]:>6}")
	    print(f" AI {cm[1][0]:>6} {cm[1][1]:>6}")

	    # labels= keeps the two target_names aligned with classes 0 and 1 even
	    # if one class is absent; zero_division=0 matches the scores above.
	    report = classification_report(
	        y_true,
	        y_pred,
	        labels=class_labels,
	        target_names=["Human", "AI"],
	        zero_division=0,
	    )
	    print(f"\n{report}")

	    return metrics


	def evaluate_heuristic_baseline(features_csv: str | Path) -> dict:
	    """
	    Evaluate the current heuristic scoring system as baseline.

	    The heuristic system uses the 'spectral_regularity',
	    'temporal_patterns', 'harmonic_structure' scores
	    (described by the original author as sigmoid-transformed heuristics)
	    to make a weighted average prediction (weights 0.35 / 0.35 / 0.30),
	    thresholded at 0.5. When the 'vocal_ai_score' and 'has_vocals' columns
	    exist, a second variant blends in the vocal score (65% heuristic, 35%
	    vocal) for rows where 'has_vocals' is above 0.5.

	    Args:
	        features_csv: Path of the features CSV, in the format read by
	            load_features_csv.

	    Returns:
	        {"heuristic_only": metrics, "heuristic_vocals": metrics}, where
	        "heuristic_vocals" is None if the vocal columns are not present.

	    Raises:
	        ValueError: If a required heuristic column is missing, or the
	            header does not line up with the loaded feature matrix.
	    """
	    X, y = load_features_csv(features_csv)

	    # Column names must be derived with the SAME exclusions that
	    # load_features_csv applies when it builds X. Excluding fewer columns
	    # here (e.g. keeping duration_sec / sample_rate) shifts every index
	    # and silently scores the wrong features.
	    excluded = {"file_path", "label_int", "duration_sec", "sample_rate"}
	    with open(features_csv, "r", encoding="utf-8", newline="") as f:
	        header = csv.DictReader(f).fieldnames or []
	    feature_cols = [c for c in header if c not in excluded]

	    # Guard against the two column lists drifting apart again.
	    if X.ndim != 2 or X.shape[1] != len(feature_cols):
	        raise ValueError(
	            f"feature matrix shape {X.shape} does not match the "
	            f"{len(feature_cols)} feature columns in the CSV header"
	        )

	    # Find indices of heuristic score columns
	    required = ("spectral_regularity", "temporal_patterns", "harmonic_structure")
	    missing = [name for name in required if name not in feature_cols]
	    if missing:
	        raise ValueError(
	            "features CSV is missing heuristic column(s): " + ", ".join(missing)
	        )
	    sr_idx, tp_idx, hs_idx = (feature_cols.index(name) for name in required)

	    # Current heuristic: weighted average
	    heuristic_scores = (
	        X[:, sr_idx] * 0.35
	        + X[:, tp_idx] * 0.35
	        + X[:, hs_idx] * 0.30
	    )
	    y_pred_heuristic = (heuristic_scores > 0.5).astype(int)

	    print("\n" + "=" * 60)
	    print(" BASELINE EVALUATION - Current Heuristic System")
	    print("=" * 60)

	    print("\n--- Heuristic Only (spectral + temporal + harmonic) ---")
	    m1 = evaluate_predictions(
	        y, y_pred_heuristic, heuristic_scores,
	        title="Heuristic (no vocals)",
	    )

	    # Also try with vocal score, but only if both vocal columns exist.
	    m2 = None
	    print("\n--- Heuristic + Vocal Score ---")
	    if "vocal_ai_score" in feature_cols and "has_vocals" in feature_cols:
	        vai_idx = feature_cols.index("vocal_ai_score")
	        has_v_idx = feature_cols.index("has_vocals")

	        # Rows without vocals keep the plain heuristic score.
	        combined_scores = np.where(
	            X[:, has_v_idx] > 0.5,
	            heuristic_scores * 0.65 + X[:, vai_idx] * 0.35,
	            heuristic_scores,
	        )
	        y_pred_combined = (combined_scores > 0.5).astype(int)

	        m2 = evaluate_predictions(
	            y, y_pred_combined, combined_scores,
	            title="Heuristic + Vocals",
	        )
	    else:
	        print(" Skipped: 'vocal_ai_score' / 'has_vocals' columns not found")

	    return {"heuristic_only": m1, "heuristic_vocals": m2}


	if __name__ == "__main__":
	    # Script entry point: the first command-line argument, when given,
	    # overrides the default location of the features CSV.
	    if len(sys.argv) > 1:
	        csv_path = sys.argv[1]
	    else:
	        csv_path = "data/sonics/features.csv"
	    evaluate_heuristic_baseline(csv_path)