# AEGIS-SECURE-API / train_dl.py
# Akshat Bhatt
# added code
# e2e0c18
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import config
from models import get_dl_models, PhishingDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
# Install "ignore" filters at import time so UserWarning and FutureWarning
# messages do not clutter the training output.
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
def prepare_data(df, numerical_cols):
    """Build the standardised feature matrix and label vector for DL training.

    Args:
        df: DataFrame holding the feature columns and a ``'label'`` column.
        numerical_cols: Column selector passed to ``df[...]`` to pick the
            feature columns.

    Returns:
        A ``(X_scaled, y, scaler)`` tuple: the standardised features, the raw
        ``'label'`` values, and the fitted ``StandardScaler`` so the same
        transformation can be re-applied later.
    """
    print("Preparing data for DL training...")

    # Missing feature values are replaced with the sentinel -1 before scaling.
    filled_features = df[numerical_cols].fillna(-1)
    feature_matrix = filled_features.values
    targets = df['label'].values

    # Fit once, then transform the very same matrix (equivalent to
    # fit_transform) so the fitted scaler can be handed back to the caller.
    scaler = StandardScaler().fit(feature_matrix)
    standardised = scaler.transform(feature_matrix)
    return standardised, targets, scaler
def train_dl_model(model, train_loader, val_loader, device, epochs=50, lr=0.001):
    """Train a binary classifier and return it with its best-validation weights.

    Each epoch runs one Adam optimisation pass over ``train_loader`` against
    ``nn.BCELoss`` and then evaluates on ``val_loader``. A progress line is
    printed every 10th epoch. The weights from the epoch with the lowest
    validation loss are restored into ``model`` before it is returned, so a
    late-epoch regression does not end up in the saved artefact.

    Args:
        model: ``nn.Module`` whose outputs are fed to ``nn.BCELoss`` and
            thresholded at 0.5, i.e. they are expected to be probabilities
            with the same shape as the label batches.
        train_loader: Iterable of ``(X_batch, y_batch)`` tensor pairs that
            also supports ``len()`` (used for the averaged progress line).
        val_loader: Same contract as ``train_loader``; may be empty, in which
            case no best-epoch selection happens and the last-epoch weights
            are returned.
        device: Target passed to ``.to(...)`` for the model and every batch.
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate for the Adam optimiser.

    Returns:
        The same ``model`` object, moved to ``device``.
    """
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    model.to(device)

    best_val_loss = float('inf')
    best_state = None  # snapshot of the weights at the best validation epoch

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        train_loss = 0.0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # ---- validation pass ----
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item()
                predicted = (outputs > 0.5).float()
                total += y_batch.size(0)
                correct += (predicted == y_batch).sum().item()

        # An empty validation loader must not crash the run.
        val_accuracy = correct / total if total else 0.0

        if (epoch + 1) % 10 == 0:
            # max(..., 1) keeps the per-batch averages defined for empty loaders.
            print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {train_loss/max(len(train_loader), 1):.4f}, Val Loss: {val_loss/max(len(val_loader), 1):.4f}, Val Acc: {val_accuracy:.4f}")

        # Only epochs that actually saw validation samples may become "best".
        if total and val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() returns live references to the parameters, which the
            # optimiser keeps updating in place, so clone them for a real snapshot.
            best_state = {
                key: tensor.detach().clone()
                for key, tensor in model.state_dict().items()
            }

    if best_state is not None:
        model.load_state_dict(best_state)
    return model
def main():
    """Run the full DL training pipeline and persist the resulting artefacts.

    Loads the engineered training CSV named by ``config.ENGINEERED_TRAIN_FILE``,
    scales the ``config.NUMERICAL_FEATURES`` columns, makes a stratified
    train/validation split, trains every model returned by ``get_dl_models``
    and writes one ``<name>.pt`` state dict per model plus ``dl_scaler.pkl``
    into ``config.MODELS_DIR``. Returns early with a hint when the CSV is
    missing.
    """
    print("--- Starting DL Model Training ---")
    models_dir = config.MODELS_DIR
    os.makedirs(models_dir, exist_ok=True)

    try:
        frame = pd.read_csv(config.ENGINEERED_TRAIN_FILE)
    except FileNotFoundError:
        print(f"Error: '{config.ENGINEERED_TRAIN_FILE}' not found.")
        print("Please run `python data_pipeline.py` first.")
        return

    # NOTE(review): the scaler is fitted on every row before the split below,
    # so validation rows contribute to the scaling statistics - confirm that
    # this is intended.
    features, labels, scaler = prepare_data(frame, config.NUMERICAL_FEATURES)

    split = train_test_split(
        features,
        labels,
        test_size=config.ML_TEST_SIZE,
        random_state=config.ML_MODEL_RANDOM_STATE,
        stratify=labels,
    )
    X_train, X_val, y_train, y_val = split
    print(f"Training on {len(X_train)} samples, validating on {len(X_val)} samples.")

    train_dataset = PhishingDataset(X_train, y_train)
    val_dataset = PhishingDataset(X_val, y_val)
    # Only the training batches are shuffled; validation order stays fixed.
    train_loader = DataLoader(train_dataset, batch_size=config.DL_BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config.DL_BATCH_SIZE, shuffle=False)

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print(f"Using device: {device}")

    n_features = X_train.shape[1]
    for name, model in get_dl_models(n_features).items():
        print(f"\n--- Training {name} ---")
        fitted = train_dl_model(
            model,
            train_loader,
            val_loader,
            device,
            epochs=config.DL_EPOCHS,
            lr=config.DL_LEARNING_RATE,
        )
        weights_file = os.path.join(config.MODELS_DIR, f"{name}.pt")
        torch.save(fitted.state_dict(), weights_file)
        print(f"Model saved to {weights_file}")

    # The fitted scaler is stored next to the model weights.
    scaler_file = os.path.join(config.MODELS_DIR, "dl_scaler.pkl")
    import joblib
    joblib.dump(scaler, scaler_file)
    print(f"Scaler saved to {scaler_file}")

    print("\n--- DL Model Training Complete ---")
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()