Spaces:

Rthur2003
/

crowncode-backend

Sleeping

App Files Files Community

Rthur2003 committed 29 days ago

Commit

57f19bf

1 Parent(s): 058eadc

feat: enhance candidate selection with class ratio and calibrated SVC for improved model performance

Browse files

Files changed (1) hide show

app/training/train_classifier.py +25 -36

app/training/train_classifier.py CHANGED Viewed

@@ -323,7 +323,7 @@ def _select_best_candidates(
     selected: list[tuple[str, Any]] = []
     tuning_results: dict[str, dict[str, Any]] = {}
-    for name, variants in _build_candidate_families().items():
         print("\n" + "." * 56)
         print(f"Selecting hyperparameters for: {name}")
         print("." * 56)
@@ -362,8 +362,14 @@ def _select_best_candidates(
     return selected, tuning_results
-def _build_candidate_families() -> dict[str, list[Any]]:
-    families: dict[str, list[Any]] = {
         "Logistic Regression": [
             LogisticRegression(
                 C=value,
@@ -435,38 +441,11 @@ def _build_candidate_families() -> dict[str, list[Any]]:
             ),
         ],
         "SVM (RBF)": [
-            SVC(
-                kernel="rbf",
-                C=1.0,
-                gamma="scale",
-                class_weight="balanced",
-                probability=True,
-                random_state=42,
-            ),
-            SVC(
-                kernel="rbf",
-                C=3.0,
-                gamma="scale",
-                class_weight="balanced",
-                probability=True,
-                random_state=42,
-            ),
-            SVC(
-                kernel="rbf",
-                C=6.0,
-                gamma=0.02,
-                class_weight="balanced",
-                probability=True,
-                random_state=42,
-            ),
-            SVC(
-                kernel="rbf",
-                C=10.0,
-                gamma=0.05,
-                class_weight="balanced",
-                probability=True,
-                random_state=42,
-            ),
         ],
         "MLP Neural Network": [
             MLPClassifier(
@@ -506,6 +485,7 @@ def _build_candidate_families() -> dict[str, list[Any]]:
     }
     if HAS_XGB:
         families["XGBoost"] = [
             xgb.XGBClassifier(
                 n_estimators=300,
@@ -517,6 +497,7 @@ def _build_candidate_families() -> dict[str, list[Any]]:
                 reg_alpha=0.2,
                 reg_lambda=1.2,
                 gamma=0.1,
                 eval_metric="logloss",
                 tree_method="hist",
                 random_state=42,
@@ -533,6 +514,7 @@ def _build_candidate_families() -> dict[str, list[Any]]:
                 reg_alpha=0.1,
                 reg_lambda=1.0,
                 gamma=0.0,
                 eval_metric="logloss",
                 tree_method="hist",
                 random_state=42,
@@ -549,6 +531,7 @@ def _build_candidate_families() -> dict[str, list[Any]]:
                 reg_alpha=0.4,
                 reg_lambda=1.5,
                 gamma=0.2,
                 eval_metric="logloss",
                 tree_method="hist",
                 random_state=42,
@@ -635,7 +618,13 @@ def _safe_model_name(name: str) -> str:
 def _summarize_selected_params(name: str, model: Any) -> dict[str, Any]:
     tuned_keys = _TUNED_PARAM_KEYS.get(name, ())
     params = model.get_params()
-    return {key: params[key] for key in tuned_keys if key in params}
 def _extract_importance(

     selected: list[tuple[str, Any]] = []
     tuning_results: dict[str, dict[str, Any]] = {}
+    for name, variants in _build_candidate_families(y_train).items():
         print("\n" + "." * 56)
         print(f"Selecting hyperparameters for: {name}")
         print("." * 56)
     return selected, tuning_results
+def _class_ratio(y: np.ndarray) -> float:
+    """Returns n_negative / n_positive for scale_pos_weight in XGBoost."""
+    n_pos = int(np.sum(y == 1))
+    n_neg = int(np.sum(y == 0))
+    return n_neg / n_pos if n_pos > 0 else 1.0
+def _build_candidate_families(y: np.ndarray) -> dict[str, list[Any]]:
         "Logistic Regression": [
             LogisticRegression(
                 C=value,
             ),
         ],
         "SVM (RBF)": [
+            CalibratedClassifierCV(
+                SVC(kernel="rbf", C=c, gamma=g, class_weight="balanced", random_state=42),
+                method="isotonic", cv=3,
+            )
+            for c, g in ((1.0, "scale"), (3.0, "scale"), (6.0, 0.02), (10.0, 0.05))
         ],
         "MLP Neural Network": [
             MLPClassifier(
     }
     if HAS_XGB:
+        _spw = _class_ratio(y)
         families["XGBoost"] = [
             xgb.XGBClassifier(
                 n_estimators=300,
                 reg_alpha=0.2,
                 reg_lambda=1.2,
                 gamma=0.1,
+                scale_pos_weight=_spw,
                 eval_metric="logloss",
                 tree_method="hist",
                 random_state=42,
                 reg_alpha=0.1,
                 reg_lambda=1.0,
                 gamma=0.0,
+                scale_pos_weight=_spw,
                 eval_metric="logloss",
                 tree_method="hist",
                 random_state=42,
                 reg_alpha=0.4,
                 reg_lambda=1.5,
                 gamma=0.2,
+                scale_pos_weight=_spw,
                 eval_metric="logloss",
                 tree_method="hist",
                 random_state=42,
 def _summarize_selected_params(name: str, model: Any) -> dict[str, Any]:
     """Return the tuned hyperparameters of ``model`` for reporting.

     Args:
         name: Candidate family name, used to look up the parameter keys of
             interest in ``_TUNED_PARAM_KEYS``. Unknown names yield ``{}``.
         model: Estimator exposing ``get_params()``.

     Returns:
         A dict mapping each tuned key that the model exposes to its value,
         in the order the keys are listed in ``_TUNED_PARAM_KEYS``.
     """
     tuned_keys = _TUNED_PARAM_KEYS.get(name, ())
     params = model.get_params()
     # Wrapper estimators such as CalibratedClassifierCV nest the inner
     # model's params as "estimator__<key>". Index those by their final
     # segment; the first occurrence of a given suffix wins.
     nested: dict[str, Any] = {}
     for key, value in params.items():
         if "__" in key:
             nested.setdefault(key.rsplit("__", 1)[-1], value)
     summary: dict[str, Any] = {}
     for key in tuned_keys:
         # An exact top-level parameter always takes precedence, so a nested
         # parameter sharing the same suffix can never shadow it regardless
         # of the order in which get_params() lists the keys.
         if key in params:
             summary[key] = params[key]
         elif key in nested:
             summary[key] = nested[key]
     return summary
 def _extract_importance(