sgAtdbd committed on
Commit
249be5e
·
verified ·
1 Parent(s): 387ca50

Update models/train_model.py

Browse files
Files changed (1) hide show
  1. models/train_model.py +66 -3
models/train_model.py CHANGED
@@ -7,6 +7,7 @@ Compares multiple algorithms and saves the best one
7
  import pandas as pd
8
  import numpy as np
9
  from sklearn.model_selection import train_test_split
 
10
  from sklearn.feature_extraction.text import TfidfVectorizer
11
  from sklearn.linear_model import LogisticRegression
12
  from sklearn.ensemble import RandomForestClassifier
@@ -169,6 +170,58 @@ def analyze_distribution(df: pd.DataFrame, name: str):
169
  label_name = label_names.get(label, f'Unknown({label})')
170
  print(f" {label} - {label_name:20s}: {count:6,} ({percentage:5.1f}%)")
171
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  def train_single_model(X_train, X_test, y_train, y_test, model_type: str, language: str) -> Dict:
173
  """Train a single model and return results"""
174
  print(f"\n πŸ”§ Training {model_type.upper()}...")
@@ -182,7 +235,10 @@ def train_single_model(X_train, X_test, y_train, y_test, model_type: str, langua
182
  n_jobs=-1
183
  )
184
  elif model_type == 'svm':
185
- model = LinearSVC(
 
 
 
186
  random_state=RANDOM_STATE,
187
  class_weight='balanced',
188
  max_iter=2000
@@ -197,7 +253,7 @@ def train_single_model(X_train, X_test, y_train, y_test, model_type: str, langua
197
  else:
198
  raise ValueError(f"Unknown model type: {model_type}")
199
 
200
- # Train
201
  start_time = time.time()
202
 
203
  model.fit(X_train, y_train)
@@ -213,6 +269,14 @@ def train_single_model(X_train, X_test, y_train, y_test, model_type: str, langua
213
  print(f" βœ“ F1-Score: {f1:.4f}")
214
  print(f" βœ“ Time: {training_time:.2f}s")
215
 
 
 
 
 
 
 
 
 
216
  return {
217
  'model': model,
218
  'accuracy': accuracy,
@@ -220,7 +284,6 @@ def train_single_model(X_train, X_test, y_train, y_test, model_type: str, langua
220
  'training_time': training_time,
221
  'predictions': y_pred
222
  }
223
-
224
  def train_and_compare_models(X_train, X_test, y_train, y_test, language: str) -> Tuple:
225
  """Train multiple models and return the best one"""
226
  print(f"\nπŸ€– Training Multiple Models for {language.upper()}...")
 
7
  import pandas as pd
8
  import numpy as np
9
  from sklearn.model_selection import train_test_split
10
+ from sklearn.svm import SVC
11
  from sklearn.feature_extraction.text import TfidfVectorizer
12
  from sklearn.linear_model import LogisticRegression
13
  from sklearn.ensemble import RandomForestClassifier
 
170
  label_name = label_names.get(label, f'Unknown({label})')
171
  print(f" {label} - {label_name:20s}: {count:6,} ({percentage:5.1f}%)")
172
 
173
+ # def train_single_model(X_train, X_test, y_train, y_test, model_type: str, language: str) -> Dict:
174
+ # """Train a single model and return results"""
175
+ # print(f"\n πŸ”§ Training {model_type.upper()}...")
176
+
177
+ # # Choose model
178
+ # if model_type == 'logistic':
179
+ # model = LogisticRegression(
180
+ # max_iter=1000,
181
+ # random_state=RANDOM_STATE,
182
+ # class_weight='balanced',
183
+ # n_jobs=-1
184
+ # )
185
+ # elif model_type == 'svm':
186
+ # model = LinearSVC(
187
+ # random_state=RANDOM_STATE,
188
+ # class_weight='balanced',
189
+ # max_iter=2000
190
+ # )
191
+ # elif model_type == 'random_forest':
192
+ # model = RandomForestClassifier(
193
+ # n_estimators=100,
194
+ # random_state=RANDOM_STATE,
195
+ # class_weight='balanced',
196
+ # n_jobs=-1
197
+ # )
198
+ # else:
199
+ # raise ValueError(f"Unknown model type: {model_type}")
200
+
201
+ # # Train
202
+ # start_time = time.time()
203
+
204
+ # model.fit(X_train, y_train)
205
+ # y_pred = model.predict(X_test)
206
+
207
+ # training_time = time.time() - start_time
208
+
209
+ # # Evaluate
210
+ # accuracy = accuracy_score(y_test, y_pred)
211
+ # f1 = f1_score(y_test, y_pred, average='weighted')
212
+
213
+ # print(f" βœ“ Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
214
+ # print(f" βœ“ F1-Score: {f1:.4f}")
215
+ # print(f" βœ“ Time: {training_time:.2f}s")
216
+
217
+ # return {
218
+ # 'model': model,
219
+ # 'accuracy': accuracy,
220
+ # 'f1_score': f1,
221
+ # 'training_time': training_time,
222
+ # 'predictions': y_pred
223
+ # }
224
+
225
  def train_single_model(X_train, X_test, y_train, y_test, model_type: str, language: str) -> Dict:
226
  """Train a single model and return results"""
227
  print(f"\n πŸ”§ Training {model_type.upper()}...")
 
235
  n_jobs=-1
236
  )
237
  elif model_type == 'svm':
238
+ # ✅ Use SVC instead of LinearSVC
239
+ model = SVC(
240
+ kernel='linear',
241
+ probability=True, # βœ… CRITICAL: Enable probability estimates
242
  random_state=RANDOM_STATE,
243
  class_weight='balanced',
244
  max_iter=2000
 
253
  else:
254
  raise ValueError(f"Unknown model type: {model_type}")
255
 
256
+ # ✅ ADD THIS: Train and evaluate
257
  start_time = time.time()
258
 
259
  model.fit(X_train, y_train)
 
269
  print(f" βœ“ F1-Score: {f1:.4f}")
270
  print(f" βœ“ Time: {training_time:.2f}s")
271
 
272
+ # ✅ Verify predict_proba works
273
+ if hasattr(model, 'predict_proba'):
274
+ proba = model.predict_proba(X_test[:1])
275
+ print(f" βœ… predict_proba: Available (shape: {proba.shape})")
276
+ else:
277
+ print(f" ⚠️ predict_proba: NOT Available")
278
+
279
+ # ✅ ADD THIS: Return results
280
  return {
281
  'model': model,
282
  'accuracy': accuracy,
 
284
  'training_time': training_time,
285
  'predictions': y_pred
286
  }
 
287
  def train_and_compare_models(X_train, X_test, y_train, y_test, language: str) -> Tuple:
288
  """Train multiple models and return the best one"""
289
  print(f"\nπŸ€– Training Multiple Models for {language.upper()}...")