| import os | |
| import pandas as pd | |
| import joblib | |
| import config | |
| from models import get_ml_models | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import StandardScaler, OneHotEncoder | |
| from sklearn.compose import ColumnTransformer | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.metrics import accuracy_score | |
| import warnings | |
# Silence every UserWarning and FutureWarning for the whole process so the
# training output is not interleaved with library warnings.
# NOTE(review): this also hides deprecation notices raised as FutureWarning;
# confirm that is intended.
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
def prepare_data(df, numerical_cols, categorical_cols):
    """Select the model features, fill missing values, and build a preprocessor.

    Args:
        df: DataFrame containing every column named in ``numerical_cols`` and
            ``categorical_cols`` plus a ``'label'`` column. It is not modified.
        numerical_cols: Names of the columns to standardize.
        categorical_cols: Names of the columns to one-hot encode.

    Returns:
        A ``(preprocessor, X, y)`` tuple: an unfitted ``ColumnTransformer``,
        the selected feature frame with missing values filled (``-1`` for
        numerical columns, ``'N/A'`` for categorical ones), and the
        ``'label'`` column of ``df``.

    Raises:
        KeyError: If a requested feature column or ``'label'`` is not in ``df``.
    """
    print("Preparing data for ML training...")

    # Accept any sequence of names (list, tuple, ...) and make sure the two
    # groups can be concatenated into a single column selection.
    numerical_cols = list(numerical_cols)
    categorical_cols = list(categorical_cols)

    # Work on an explicit copy: writing into the bare selection is the
    # chained-assignment pattern that raises SettingWithCopyWarning.
    X = df[numerical_cols + categorical_cols].copy()
    y = df['label']

    # Assign the filled columns back by name instead of through ``.loc`` so
    # the column is replaced rather than written into in place (relevant when
    # 'N/A' is filled into a column that is not already of object dtype).
    if numerical_cols:
        X[numerical_cols] = X[numerical_cols].fillna(-1)
    if categorical_cols:
        X[categorical_cols] = X[categorical_cols].fillna('N/A')

    # The 'imputer' steps are 'passthrough' placeholders: missing values are
    # filled on X above, not inside the transformer, so data given to the
    # returned preprocessor later is not filled by it.
    numerical_transformer = Pipeline(steps=[
        ('imputer', 'passthrough'),
        ('scaler', StandardScaler()),
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', 'passthrough'),
        # Categories unseen during fit are ignored rather than raising.
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols),
        ],
        # Any column not listed above is forwarded unchanged.
        remainder='passthrough',
    )

    return preprocessor, X, y
def main():
    """Train every configured model on the engineered training data and save it.

    Reads ``config.ENGINEERED_TRAIN_FILE``, splits it into stratified
    training and validation sets, fits one preprocessing + classifier
    pipeline per entry returned by ``get_ml_models()``, prints each
    validation accuracy, and writes each fitted pipeline to
    ``<config.MODELS_DIR>/<name>.joblib``.

    Returns:
        None. If the training file does not exist, a hint is printed and the
        function returns without training anything.
    """
    # Local import keeps this block self-contained; scikit-learn is already a
    # dependency of this module.
    from sklearn.base import clone

    print("--- Starting ML Model Training ---")
    os.makedirs(config.MODELS_DIR, exist_ok=True)

    try:
        df = pd.read_csv(config.ENGINEERED_TRAIN_FILE)
    except FileNotFoundError:
        print(f"Error: '{config.ENGINEERED_TRAIN_FILE}' not found.")
        print("Please run `python data_pipeline.py` first.")
        return

    preprocessor, X, y = prepare_data(
        df,
        config.NUMERICAL_FEATURES,
        config.CATEGORICAL_FEATURES,
    )

    # Stratify on the label so both splits keep the same class proportions.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y,
        test_size=config.ML_TEST_SIZE,
        random_state=config.ML_MODEL_RANDOM_STATE,
        stratify=y,
    )
    print(f"Training on {len(X_train)} samples, validating on {len(X_val)} samples.")

    ml_models = get_ml_models()
    for name, model in ml_models.items():
        print(f"\n--- Training {name} ---")

        # Pipeline does not copy its steps, so reusing one preprocessor
        # instance would make every model's pipeline refit and share the same
        # object. Give each pipeline its own unfitted copy instead.
        model_pipeline = Pipeline(steps=[
            ('preprocessor', clone(preprocessor)),
            ('classifier', model),
        ])
        model_pipeline.fit(X_train, y_train)

        y_pred = model_pipeline.predict(X_val)
        val_accuracy = accuracy_score(y_val, y_pred)
        print(f"Validation Accuracy for {name}: {val_accuracy:.4f}")

        # Persist the whole pipeline so the preprocessing and the classifier
        # are saved together in one file.
        save_path = os.path.join(config.MODELS_DIR, f"{name}.joblib")
        joblib.dump(model_pipeline, save_path)
        print(f"Model saved to {save_path}")

    print("\n--- ML Model Training Complete ---")
# Run the training only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()