Spaces:

Rthur2003
/

crowncode-backend

Sleeping

App Files Files Community

Rthur2003 committed on Mar 28

Commit

337d9ae

1 Parent(s): b74a8cb

feat: add AURIS classifier training module with model evaluation and feature importance

Browse files

Files changed (1) hide show

app/training/train_classifier.py +237 -0

app/training/train_classifier.py ADDED Viewed

	@@ -0,0 +1,237 @@

+"""
+Train AURIS classifier on extracted audio features.
+Increment 1: RandomForest / GradientBoosting on librosa + vocal features.
+This replaces the heuristic scoring with a data-driven classifier.
+Usage:
+    python -m app.training.train_classifier data/sonics/features.csv
+Outputs:
+    models/auris_classifier_v1.pkl   — trained model
+    models/feature_scaler_v1.pkl     — fitted StandardScaler
+    models/feature_columns_v1.json   — ordered feature column names
+"""
+from __future__ import annotations
+import csv
+import json
+import pickle
+import sys
+from pathlib import Path
+import numpy as np
+from sklearn.ensemble import (
+    GradientBoostingClassifier,
+    RandomForestClassifier,
+)
+from sklearn.model_selection import (
+    StratifiedKFold,
+    cross_val_predict,
+)
+from sklearn.preprocessing import StandardScaler
+from sklearn.metrics import (
+    accuracy_score,
+    f1_score,
+    roc_auc_score,
+)
+# Optional: LightGBM for better performance
+try:
+    import lightgbm as lgb
+    HAS_LGBM = True
+except ImportError:
+    HAS_LGBM = False
+sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+from app.training.evaluate import (
+    load_features_csv,
+    evaluate_predictions,
+)
def train(
    features_csv: str | Path,
    models_dir: str | Path = "models",
    n_folds: int = 5,
) -> dict:
    """
    Train and evaluate classifier candidates on extracted features.

    Every candidate returned by ``_build_candidates()`` is scored with
    stratified ``n_folds``-fold cross-validation. The candidate with the
    highest ROC-AUC is then refit on all data and pickled, together with
    the fitted scaler and the ordered feature column names.

    Args:
        features_csv: Path to the features CSV. Header columns other than
            ``file_path`` and ``label_int`` are recorded as feature names.
        models_dir: Directory for the saved artifacts (created if missing).
        n_folds: Number of stratified cross-validation folds.

    Returns:
        Dict with keys ``best_model``, ``best_auc``, ``results`` and
        ``model_path``.

    Raises:
        RuntimeError: If no candidate yields a comparable ROC-AUC
            (e.g. the candidate list is empty or every score is NaN).
    """
    models_dir = Path(models_dir)
    models_dir.mkdir(parents=True, exist_ok=True)

    # ── Load data ──────────────────────────────────
    X, y = load_features_csv(features_csv)

    # Feature names come from the CSV header; an empty file has no header,
    # in which case ``fieldnames`` is None.
    with open(features_csv, "r", encoding="utf-8", newline="") as f:
        header = csv.DictReader(f).fieldnames or []
    feature_cols = [c for c in header if c not in ("file_path", "label_int")]

    # ── Handle NaN/Inf ─────────────────────────────
    X = np.nan_to_num(X, nan=0.0, posinf=1.0, neginf=-1.0)

    # The names above are derived independently of load_features_csv, so
    # flag a width mismatch instead of silently saving misaligned names.
    if X.ndim == 2 and X.shape[1] != len(feature_cols):
        print(
            f"WARNING: CSV header lists {len(feature_cols)} feature columns "
            f"but the feature matrix has {X.shape[1]}; saved column names "
            f"may not match the model inputs."
        )

    # ── Scale features ─────────────────────────────
    # NOTE(review): the scaler is fitted on all rows before cross-validation.
    # That is harmless for the tree-based candidates built by
    # _build_candidates(), but would leak fold statistics if a
    # scale-sensitive model were ever added.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # ── Train multiple models, pick best ───────────
    candidates = _build_candidates()
    best_model = None
    best_name = ""
    best_auc = float("-inf")
    best_prob = None
    results = {}
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

    for name, model in candidates:
        print(f"\n{'─' * 40}")
        print(f"Training: {name}")
        print(f"{'─' * 40}")

        # Out-of-fold probabilities for the positive class.
        y_prob = cross_val_predict(
            model, X_scaled, y,
            cv=cv, method="predict_proba",
        )[:, 1]
        y_pred = (y_prob > 0.5).astype(int)

        acc = accuracy_score(y, y_pred)
        f1 = f1_score(y, y_pred)
        auc = roc_auc_score(y, y_prob)
        print(f"  CV Accuracy: {acc:.4f}")
        print(f"  CV F1:       {f1:.4f}")
        print(f"  CV ROC-AUC:  {auc:.4f}")

        results[name] = {
            "accuracy": round(acc, 4),
            "f1": round(f1, 4),
            "roc_auc": round(auc, 4),
        }

        if auc > best_auc:
            best_auc = auc
            best_name = name
            best_model = model
            # Keep the winner's out-of-fold predictions so the detailed
            # report below does not need a second full cross-validation.
            best_prob = y_prob

    if best_model is None:
        raise RuntimeError(
            "No classifier candidate produced a usable ROC-AUC score."
        )

    # ── Final evaluation of best model ─────────────
    print(f"\n{'=' * 50}")
    print(f"  Best model: {best_name} (AUC={best_auc:.4f})")
    print(f"{'=' * 50}")

    y_pred_best = (best_prob > 0.5).astype(int)
    evaluate_predictions(
        y, y_pred_best, best_prob,
        title=f"Best: {best_name}",
    )

    # ── Train final model on ALL data ──────────────
    # cross_val_predict only fits clones, so best_model is still unfitted.
    print(f"\nTraining final {best_name} on all data...")
    best_model.fit(X_scaled, y)

    # ── Feature importance ─────────────────────────
    if hasattr(best_model, "feature_importances_"):
        importances = np.asarray(best_model.feature_importances_, dtype=float)
        # Some estimators (LightGBM with its default importance type) report
        # raw counts rather than fractions; normalise so the printed value
        # and the bar length are on the same 0..1 scale for every model.
        total = importances.sum()
        if total > 0:
            importances = importances / total
        top_features = sorted(
            zip(feature_cols, importances),
            key=lambda pair: pair[1],
            reverse=True,
        )
        print("\nTop 10 features:")
        for fname, imp in top_features[:10]:
            bar = "█" * int(imp * 100)
            print(f"  {fname:<30} {imp:.4f} {bar}")

    # ── Save artifacts ─────────────────────────────
    model_path = models_dir / "auris_classifier_v1.pkl"
    scaler_path = models_dir / "feature_scaler_v1.pkl"
    columns_path = models_dir / "feature_columns_v1.json"
    with open(model_path, "wb") as f:
        pickle.dump(best_model, f)
    with open(scaler_path, "wb") as f:
        pickle.dump(scaler, f)
    with open(columns_path, "w", encoding="utf-8") as f:
        json.dump(feature_cols, f, indent=2)

    print("\nSaved:")
    print(f"  Model:   {model_path}")
    print(f"  Scaler:  {scaler_path}")
    print(f"  Columns: {columns_path}")

    return {
        "best_model": best_name,
        "best_auc": best_auc,
        "results": results,
        "model_path": str(model_path),
    }
def _build_candidates() -> list[tuple[str, object]]:
    """
    Build the list of classifier candidates to evaluate.

    Returns:
        ``(name, unfitted estimator)`` pairs: RandomForest and
        GradientBoosting always, plus LightGBM when ``HAS_LGBM`` is true.
    """
    candidates = [
        (
            "RandomForest",
            RandomForestClassifier(
                n_estimators=300,
                max_depth=20,
                min_samples_leaf=5,
                class_weight="balanced",
                random_state=42,
                n_jobs=-1,
            ),
        ),
        (
            "GradientBoosting",
            GradientBoostingClassifier(
                n_estimators=200,
                max_depth=6,
                learning_rate=0.1,
                subsample=0.8,
                random_state=42,
            ),
        ),
    ]
    if HAS_LGBM:
        candidates.append((
            "LightGBM",
            lgb.LGBMClassifier(
                n_estimators=300,
                max_depth=8,
                learning_rate=0.05,
                num_leaves=31,
                subsample=0.8,
                # LightGBM ignores ``subsample`` unless bagging is switched
                # on with a non-zero frequency (the default is 0 = off).
                subsample_freq=1,
                colsample_bytree=0.8,
                class_weight="balanced",
                random_state=42,
                verbose=-1,
            ),
        ))
    return candidates
if __name__ == "__main__":
    # Positional CLI arguments: [features_csv] [models_dir]; both optional.
    cli_args = sys.argv[1:]
    csv_path = cli_args[0] if cli_args else "data/sonics/features.csv"
    model_dir = cli_args[1] if len(cli_args) >= 2 else "models"
    train(csv_path, model_dir)