Spaces:
Sleeping
Sleeping
Update models/train_model.py
Browse files- models/train_model.py +66 -3
models/train_model.py
CHANGED
|
@@ -7,6 +7,7 @@ Compares multiple algorithms and saves the best one
|
|
| 7 |
import pandas as pd
|
| 8 |
import numpy as np
|
| 9 |
from sklearn.model_selection import train_test_split
|
|
|
|
| 10 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 11 |
from sklearn.linear_model import LogisticRegression
|
| 12 |
from sklearn.ensemble import RandomForestClassifier
|
|
@@ -169,6 +170,58 @@ def analyze_distribution(df: pd.DataFrame, name: str):
|
|
| 169 |
label_name = label_names.get(label, f'Unknown({label})')
|
| 170 |
print(f" {label} - {label_name:20s}: {count:6,} ({percentage:5.1f}%)")
|
| 171 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
def train_single_model(X_train, X_test, y_train, y_test, model_type: str, language: str) -> Dict:
|
| 173 |
"""Train a single model and return results"""
|
| 174 |
print(f"\n 🔧 Training {model_type.upper()}...")
|
|
@@ -182,7 +235,10 @@ def train_single_model(X_train, X_test, y_train, y_test, model_type: str, langua
|
|
| 182 |
n_jobs=-1
|
| 183 |
)
|
| 184 |
elif model_type == 'svm':
|
| 185 |
-
|
|
|
|
|
|
|
|
|
|
| 186 |
random_state=RANDOM_STATE,
|
| 187 |
class_weight='balanced',
|
| 188 |
max_iter=2000
|
|
@@ -197,7 +253,7 @@ def train_single_model(X_train, X_test, y_train, y_test, model_type: str, langua
|
|
| 197 |
else:
|
| 198 |
raise ValueError(f"Unknown model type: {model_type}")
|
| 199 |
|
| 200 |
-
# Train
|
| 201 |
start_time = time.time()
|
| 202 |
|
| 203 |
model.fit(X_train, y_train)
|
|
@@ -213,6 +269,14 @@ def train_single_model(X_train, X_test, y_train, y_test, model_type: str, langua
|
|
| 213 |
print(f" ✓ F1-Score: {f1:.4f}")
|
| 214 |
print(f" ✓ Time: {training_time:.2f}s")
|
| 215 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
return {
|
| 217 |
'model': model,
|
| 218 |
'accuracy': accuracy,
|
|
@@ -220,7 +284,6 @@ def train_single_model(X_train, X_test, y_train, y_test, model_type: str, langua
|
|
| 220 |
'training_time': training_time,
|
| 221 |
'predictions': y_pred
|
| 222 |
}
|
| 223 |
-
|
| 224 |
def train_and_compare_models(X_train, X_test, y_train, y_test, language: str) -> Tuple:
|
| 225 |
"""Train multiple models and return the best one"""
|
| 226 |
print(f"\n🤖 Training Multiple Models for {language.upper()}...")
|
|
|
|
| 7 |
import pandas as pd
|
| 8 |
import numpy as np
|
| 9 |
from sklearn.model_selection import train_test_split
|
| 10 |
+
from sklearn.svm import SVC
|
| 11 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 12 |
from sklearn.linear_model import LogisticRegression
|
| 13 |
from sklearn.ensemble import RandomForestClassifier
|
|
|
|
| 170 |
label_name = label_names.get(label, f'Unknown({label})')
|
| 171 |
print(f" {label} - {label_name:20s}: {count:6,} ({percentage:5.1f}%)")
|
| 172 |
|
| 173 |
+
# def train_single_model(X_train, X_test, y_train, y_test, model_type: str, language: str) -> Dict:
|
| 174 |
+
# """Train a single model and return results"""
|
| 175 |
+
# print(f"\n 🔧 Training {model_type.upper()}...")
|
| 176 |
+
|
| 177 |
+
# # Choose model
|
| 178 |
+
# if model_type == 'logistic':
|
| 179 |
+
# model = LogisticRegression(
|
| 180 |
+
# max_iter=1000,
|
| 181 |
+
# random_state=RANDOM_STATE,
|
| 182 |
+
# class_weight='balanced',
|
| 183 |
+
# n_jobs=-1
|
| 184 |
+
# )
|
| 185 |
+
# elif model_type == 'svm':
|
| 186 |
+
# model = LinearSVC(
|
| 187 |
+
# random_state=RANDOM_STATE,
|
| 188 |
+
# class_weight='balanced',
|
| 189 |
+
# max_iter=2000
|
| 190 |
+
# )
|
| 191 |
+
# elif model_type == 'random_forest':
|
| 192 |
+
# model = RandomForestClassifier(
|
| 193 |
+
# n_estimators=100,
|
| 194 |
+
# random_state=RANDOM_STATE,
|
| 195 |
+
# class_weight='balanced',
|
| 196 |
+
# n_jobs=-1
|
| 197 |
+
# )
|
| 198 |
+
# else:
|
| 199 |
+
# raise ValueError(f"Unknown model type: {model_type}")
|
| 200 |
+
|
| 201 |
+
# # Train
|
| 202 |
+
# start_time = time.time()
|
| 203 |
+
|
| 204 |
+
# model.fit(X_train, y_train)
|
| 205 |
+
# y_pred = model.predict(X_test)
|
| 206 |
+
|
| 207 |
+
# training_time = time.time() - start_time
|
| 208 |
+
|
| 209 |
+
# # Evaluate
|
| 210 |
+
# accuracy = accuracy_score(y_test, y_pred)
|
| 211 |
+
# f1 = f1_score(y_test, y_pred, average='weighted')
|
| 212 |
+
|
| 213 |
+
# print(f" ✓ Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
|
| 214 |
+
# print(f" ✓ F1-Score: {f1:.4f}")
|
| 215 |
+
# print(f" ✓ Time: {training_time:.2f}s")
|
| 216 |
+
|
| 217 |
+
# return {
|
| 218 |
+
# 'model': model,
|
| 219 |
+
# 'accuracy': accuracy,
|
| 220 |
+
# 'f1_score': f1,
|
| 221 |
+
# 'training_time': training_time,
|
| 222 |
+
# 'predictions': y_pred
|
| 223 |
+
# }
|
| 224 |
+
|
| 225 |
def train_single_model(X_train, X_test, y_train, y_test, model_type: str, language: str) -> Dict:
|
| 226 |
"""Train a single model and return results"""
|
| 227 |
print(f"\n 🔧 Training {model_type.upper()}...")
|
|
|
|
| 235 |
n_jobs=-1
|
| 236 |
)
|
| 237 |
elif model_type == 'svm':
|
| 238 |
+
# ✅ Use SVC instead of LinearSVC
|
| 239 |
+
model = SVC(
|
| 240 |
+
kernel='linear',
|
| 241 |
+
probability=True, # ✅ CRITICAL: Enable probability estimates
|
| 242 |
random_state=RANDOM_STATE,
|
| 243 |
class_weight='balanced',
|
| 244 |
max_iter=2000
|
|
|
|
| 253 |
else:
|
| 254 |
raise ValueError(f"Unknown model type: {model_type}")
|
| 255 |
|
| 256 |
+
# ✅ ADD THIS: Train and evaluate
|
| 257 |
start_time = time.time()
|
| 258 |
|
| 259 |
model.fit(X_train, y_train)
|
|
|
|
| 269 |
print(f" ✓ F1-Score: {f1:.4f}")
|
| 270 |
print(f" ✓ Time: {training_time:.2f}s")
|
| 271 |
|
| 272 |
+
# ✅ Verify predict_proba works
|
| 273 |
+
if hasattr(model, 'predict_proba'):
|
| 274 |
+
proba = model.predict_proba(X_test[:1])
|
| 275 |
+
print(f" ✅ predict_proba: Available (shape: {proba.shape})")
|
| 276 |
+
else:
|
| 277 |
+
print(f" ⚠️ predict_proba: NOT Available")
|
| 278 |
+
|
| 279 |
+
# ✅ ADD THIS: Return results
|
| 280 |
return {
|
| 281 |
'model': model,
|
| 282 |
'accuracy': accuracy,
|
|
|
|
| 284 |
'training_time': training_time,
|
| 285 |
'predictions': y_pred
|
| 286 |
}
|
|
|
|
| 287 |
def train_and_compare_models(X_train, X_test, y_train, y_test, language: str) -> Tuple:
|
| 288 |
"""Train multiple models and return the best one"""
|
| 289 |
print(f"\nπ€ Training Multiple Models for {language.upper()}...")
|