File size: 2,819 Bytes
e2e0c18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
import pandas as pd
import joblib
import config
from models import get_ml_models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import warnings

warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

def prepare_data(df, numerical_cols, categorical_cols):
    """Split features/target and build the sklearn preprocessing transformer.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column in ``numerical_cols`` and
        ``categorical_cols`` plus a ``'label'`` target column.
    numerical_cols : list[str]
        Columns scaled with StandardScaler; NaNs are filled with -1.
    categorical_cols : list[str]
        Columns one-hot encoded; NaNs are filled with the string 'N/A'.

    Returns
    -------
    tuple
        ``(preprocessor, X, y)`` — an unfitted ColumnTransformer, the
        NaN-filled feature frame, and the label Series.
    """
    print("Preparing data for ML training...")

    # .copy() is essential: df[cols] can be a view, and the fillna
    # assignments below would then be chained assignments that trigger
    # SettingWithCopyWarning and may silently fail to write.
    X = df[numerical_cols + categorical_cols].copy()
    y = df['label']

    # Missing values are imputed up front with sentinel values, so the
    # per-type pipelines below need no imputer step.
    X[numerical_cols] = X[numerical_cols].fillna(-1)
    X[categorical_cols] = X[categorical_cols].fillna('N/A')

    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        # handle_unknown='ignore' keeps inference from crashing on
        # categories never seen during training.
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ],
        remainder='passthrough'
    )

    return preprocessor, X, y

def main():
    """Train each configured ML model on the engineered dataset and save it.

    Loads the engineered training CSV, builds the preprocessing transformer,
    holds out a stratified validation split, then fits, evaluates, and
    persists one sklearn Pipeline per model returned by ``get_ml_models``.
    """
    print("--- Starting ML Model Training ---")
    os.makedirs(config.MODELS_DIR, exist_ok=True)

    # The engineered file is produced by data_pipeline.py; exit with a hint
    # instead of a traceback when it has not been generated yet.
    try:
        df = pd.read_csv(config.ENGINEERED_TRAIN_FILE)
    except FileNotFoundError:
        print(f"Error: '{config.ENGINEERED_TRAIN_FILE}' not found.")
        print("Please run `python data_pipeline.py` first.")
        return

    preprocessor, X, y = prepare_data(
        df,
        config.NUMERICAL_FEATURES,
        config.CATEGORICAL_FEATURES,
    )

    # Stratified split so the label distribution is preserved in validation.
    X_train, X_val, y_train, y_val = train_test_split(
        X,
        y,
        test_size=config.ML_TEST_SIZE,
        random_state=config.ML_MODEL_RANDOM_STATE,
        stratify=y,
    )
    print(f"Training on {len(X_train)} samples, validating on {len(X_val)} samples.")

    for name, model in get_ml_models().items():
        print(f"\n--- Training {name} ---")

        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', model),
        ])
        pipeline.fit(X_train, y_train)

        accuracy = accuracy_score(y_val, pipeline.predict(X_val))
        print(f"Validation Accuracy for {name}: {accuracy:.4f}")

        save_path = os.path.join(config.MODELS_DIR, f"{name}.joblib")
        joblib.dump(pipeline, save_path)
        print(f"Model saved to {save_path}")

    print("\n--- ML Model Training Complete ---")

# Script entry point: run the full training flow when executed directly.
if __name__ == "__main__":
    main()