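"""Train ML models on the engineered feature set.

Loads the engineered training data, builds a shared preprocessing
ColumnTransformer, fits each model from get_ml_models() inside a Pipeline,
reports validation accuracy, and saves every fitted pipeline to
config.MODELS_DIR as <name>.joblib.
"""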
import os
import pandas as pd
import joblib
import config
from models import get_ml_models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
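
# Build the preprocessing ColumnTransformer and split the engineered
# DataFrame into features (X) and the 'label' target (y).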
def prepare_data(df, numerical_cols, categorical_cols):
    print("Preparing data for ML training...")
    # Work on a copy so the fillna calls below do not trigger
    # SettingWithCopyWarning or touch the original DataFrame.
    X = df[numerical_cols + categorical_cols].copy()
    y = df['label']
    numerical_transformer = Pipeline(steps=[
        ('imputer', 'passthrough'),
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', 'passthrough'),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
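    # Missing values are filled with sentinel values up front, which is why the
    # imputer steps above are left as 'passthrough'.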
    X.loc[:, numerical_cols] = X.loc[:, numerical_cols].fillna(-1)
    X.loc[:, categorical_cols] = X.loc[:, categorical_cols].fillna('N/A')
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ],
        remainder='passthrough'
    )
    return preprocessor, X, y

def main():
    print("--- Starting ML Model Training ---")
    os.makedirs(config.MODELS_DIR, exist_ok=True)
    try:
        df = pd.read_csv(config.ENGINEERED_TRAIN_FILE)
    except FileNotFoundError:
        print(f"Error: '{config.ENGINEERED_TRAIN_FILE}' not found.")
        print("Please run `python data_pipeline.py` first.")
        return
    preprocessor, X, y = prepare_data(
        df,
        config.NUMERICAL_FEATURES,
        config.CATEGORICAL_FEATURES
    )
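    # Stratified split: keep the label distribution the same in train and validation.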
    X_train, X_val, y_train, y_val = train_test_split(
        X, y,
        test_size=config.ML_TEST_SIZE,
        random_state=config.ML_MODEL_RANDOM_STATE,
        stratify=y
    )
    print(f"Training on {len(X_train)} samples, validating on {len(X_val)} samples.")
    ml_models = get_ml_models()
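    # Each model is chained behind the shared preprocessor so scaling and
    # encoding are fit on the training split and reused at prediction time.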
    for name, model in ml_models.items():
        print(f"\n--- Training {name} ---")
        model_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])
        model_pipeline.fit(X_train, y_train)
        y_pred = model_pipeline.predict(X_val)
        val_accuracy = accuracy_score(y_val, y_pred)
        print(f"Validation Accuracy for {name}: {val_accuracy:.4f}")
        save_path = os.path.join(config.MODELS_DIR, f"{name}.joblib")
        joblib.dump(model_pipeline, save_path)
        print(f"Model saved to {save_path}")
    print("\n--- ML Model Training Complete ---")

if __name__ == "__main__":
    main()
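
# Reusing a saved pipeline elsewhere (a minimal sketch; the filename below is
# hypothetical and depends on the keys returned by get_ml_models(), and
# new_rows must be a DataFrame with the same feature columns used here):
#
#     import os, joblib
#     import config
#     pipeline = joblib.load(os.path.join(config.MODELS_DIR, "SomeModel.joblib"))
#     predictions = pipeline.predict(new_rows)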