"""Train the deep-learning models and save their weights and the feature scaler.

Reads the engineered training CSV named by ``config.ENGINEERED_TRAIN_FILE``,
scales the numerical features, trains every model returned by
``models.get_dl_models`` and writes each model's ``state_dict`` plus the
fitted scaler into ``config.MODELS_DIR``.
"""

import os
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader

import config
from models import get_dl_models, PhishingDataset

warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)


def prepare_data(df, numerical_cols):
    """Build a scaled feature matrix and label vector from *df*.

    Missing values in the selected columns are replaced with ``-1`` before a
    ``StandardScaler`` is fitted on, and applied to, all rows of *df*.

    Args:
        df: DataFrame containing the columns in *numerical_cols* and a
            ``'label'`` column.
        numerical_cols: Column labels to use as model features.

    Returns:
        A tuple ``(X_scaled, y, scaler)``: the scaled feature array, the raw
        ``'label'`` values, and the fitted ``StandardScaler``.
    """
    print("Preparing data for DL training...")
    X = df[numerical_cols].fillna(-1).values
    y = df['label'].values

    # NOTE(review): the scaler is fitted on every row passed in. `main` calls
    # this before the train/validation split, so validation rows influence the
    # scaling statistics. Left unchanged to keep the saved scaler identical.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, y, scaler


def train_dl_model(model, train_loader, val_loader, device, epochs=50, lr=0.001):
    """Train *model* with Adam and BCE loss, returning the best-validation weights.

    After every epoch the model is evaluated on *val_loader*. The weights of
    the epoch with the lowest summed validation loss are snapshotted and
    loaded back into the model before it is returned. Progress is printed
    every 10th epoch.

    Args:
        model: ``nn.Module`` whose outputs are fed directly to ``nn.BCELoss``
            and thresholded at 0.5 for accuracy.
        train_loader: Iterable of ``(X_batch, y_batch)`` tensors for training.
        val_loader: Iterable of ``(X_batch, y_batch)`` tensors for validation.
        device: Device the model and batches are moved to.
        epochs: Number of passes over *train_loader*.
        lr: Learning rate for the Adam optimizer.

    Returns:
        The same *model* object, on *device*, holding the weights from the
        epoch with the lowest validation loss (or the final weights if no
        epoch produced a comparable loss, e.g. ``epochs == 0`` or NaN losses).
    """
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    model.to(device)

    # Guard the per-batch averages against empty loaders.
    n_train_batches = max(len(train_loader), 1)
    n_val_batches = max(len(val_loader), 1)

    best_val_loss = float('inf')
    best_state = None

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item()
                predicted = (outputs > 0.5).float()
                total += y_batch.size(0)
                correct += (predicted == y_batch).sum().item()

        val_accuracy = correct / total if total else 0.0

        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {train_loss/n_train_batches:.4f}, Val Loss: {val_loss/n_val_batches:.4f}, Val Acc: {val_accuracy:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() tensors alias the live parameters, so clone them
            # to keep a snapshot that later optimizer steps cannot modify.
            best_state = {
                key: value.detach().clone()
                for key, value in model.state_dict().items()
            }

    if best_state is not None:
        model.load_state_dict(best_state)
    return model


def main():
    """Run the full training pipeline and persist the resulting artifacts.

    Loads the engineered training CSV, prepares and splits the data, trains
    each model from ``get_dl_models`` and saves its ``state_dict`` as
    ``<name>.pt`` together with the scaler as ``dl_scaler.pkl`` under
    ``config.MODELS_DIR``. Prints a hint and returns early when the input CSV
    does not exist.
    """
    print("--- Starting DL Model Training ---")
    os.makedirs(config.MODELS_DIR, exist_ok=True)

    try:
        df = pd.read_csv(config.ENGINEERED_TRAIN_FILE)
    except FileNotFoundError:
        print(f"Error: '{config.ENGINEERED_TRAIN_FILE}' not found.")
        print("Please run `python data_pipeline.py` first.")
        return

    X_scaled, y, scaler = prepare_data(df, config.NUMERICAL_FEATURES)

    X_train, X_val, y_train, y_val = train_test_split(
        X_scaled,
        y,
        test_size=config.ML_TEST_SIZE,
        random_state=config.ML_MODEL_RANDOM_STATE,
        stratify=y,
    )
    print(f"Training on {len(X_train)} samples, validating on {len(X_val)} samples.")

    train_dataset = PhishingDataset(X_train, y_train)
    val_dataset = PhishingDataset(X_val, y_val)
    train_loader = DataLoader(train_dataset, batch_size=config.DL_BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config.DL_BATCH_SIZE, shuffle=False)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    input_dim = X_train.shape[1]
    dl_models = get_dl_models(input_dim)

    for name, model in dl_models.items():
        print(f"\n--- Training {name} ---")
        trained_model = train_dl_model(
            model,
            train_loader,
            val_loader,
            device,
            epochs=config.DL_EPOCHS,
            lr=config.DL_LEARNING_RATE,
        )
        save_path = os.path.join(config.MODELS_DIR, f"{name}.pt")
        torch.save(trained_model.state_dict(), save_path)
        print(f"Model saved to {save_path}")

    scaler_path = os.path.join(config.MODELS_DIR, "dl_scaler.pkl")
    # Imported here, as in the original, so joblib is only needed once
    # training has finished.
    import joblib
    joblib.dump(scaler, scaler_path)
    print(f"Scaler saved to {scaler_path}")

    print("\n--- DL Model Training Complete ---")


if __name__ == "__main__":
    main()