Spaces:

Rthur2003
/

crowncode-backend

Sleeping

App Files Files Community

Rthur2003 committed on Apr 16

Commit

bb6655d

1 Parent(s): b8d143b

feat: update model parameters and fix data leakage by removing duration_sec and sample_rate from features

Browse files

Files changed (1) hide show

app/training/train_classifier.py +17 -8

app/training/train_classifier.py CHANGED Viewed

@@ -214,6 +214,7 @@ def train(
     json_results["_n_samples"] = len(y)
     json_results["_n_features"] = X.shape[1]
     json_results["_n_folds"] = n_folds
     if importance_data:
         json_results["_feature_importance"] = {
             name: round(imp, 6) for name, imp in importance_data
@@ -252,9 +253,10 @@ def _build_candidates() -> list[tuple[str, Any]]:
         (
             "Random Forest",
             RandomForestClassifier(
-                n_estimators=300,
-                max_depth=20,
-                min_samples_leaf=5,
                 class_weight="balanced",
                 random_state=42,
                 n_jobs=-1,
@@ -301,11 +303,15 @@ def _build_candidates() -> list[tuple[str, Any]]:
         candidates.append((
             "XGBoost",
             xgb.XGBClassifier(
-                n_estimators=300,
-                max_depth=8,
                 learning_rate=0.05,
                 subsample=0.8,
                 colsample_bytree=0.8,
                 scale_pos_weight=1.0,
                 eval_metric="logloss",
                 random_state=42,
@@ -317,12 +323,15 @@ def _build_candidates() -> list[tuple[str, Any]]:
         candidates.append((
             "LightGBM",
             lgb.LGBMClassifier(
-                n_estimators=300,
-                max_depth=8,
                 learning_rate=0.05,
-                num_leaves=31,
                 subsample=0.8,
                 colsample_bytree=0.8,
                 class_weight="balanced",
                 random_state=42,
                 verbose=-1,

     json_results["_n_samples"] = len(y)
     json_results["_n_features"] = X.shape[1]
     json_results["_n_folds"] = n_folds
+    json_results["_data_leakage_fix"] = "duration_sec and sample_rate removed from features (v2)"
     if importance_data:
         json_results["_feature_importance"] = {
             name: round(imp, 6) for name, imp in importance_data
         (
             "Random Forest",
             RandomForestClassifier(
+                n_estimators=200,
+                max_depth=12,
+                min_samples_leaf=8,
+                min_samples_split=10,
                 class_weight="balanced",
                 random_state=42,
                 n_jobs=-1,
         candidates.append((
             "XGBoost",
             xgb.XGBClassifier(
+                n_estimators=200,
+                max_depth=5,
                 learning_rate=0.05,
                 subsample=0.8,
                 colsample_bytree=0.8,
+                min_child_weight=5,
+                reg_alpha=0.1,
+                reg_lambda=1.0,
+                gamma=0.1,
                 scale_pos_weight=1.0,
                 eval_metric="logloss",
                 random_state=42,
         candidates.append((
             "LightGBM",
             lgb.LGBMClassifier(
+                n_estimators=200,
+                max_depth=5,
                 learning_rate=0.05,
+                num_leaves=24,
                 subsample=0.8,
                 colsample_bytree=0.8,
+                min_child_weight=5,
+                reg_alpha=0.1,
+                reg_lambda=1.0,
                 class_weight="balanced",
                 random_state=42,
                 verbose=-1,