"""Generate an evaluation report for the trained phishing-detection models:
confusion matrices, an accuracy comparison chart, and score-vs-label plots."""

import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

import config
from models import FinetunedBERT, PhishingDataset, get_dl_models

sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 11

COLORS = {
    'primary': '#FF6B6B',
    'secondary': '#4ECDC4',
    'tertiary': '#45B7D1',
    'quaternary': '#FFA07A',
    'quinary': '#98D8C8',
    'bg': '#F7F7F7',
    'text': '#2C3E50'
}

# Per-model decision thresholds applied to the predicted probability of the
# positive (phishing) class. Models absent from this dict fall back to 0.5.
MODEL_THRESHOLDS = {
    'attention_blstm': 0.8,
    'rcnn': 0.8,
    'logistic': 0.5,
    'svm': 0.5,
    'xgboost': 0.5,
    'bert': 0.5
}


def load_sample_data(sample_fraction=0.05):
    """Load a random sample of URLs, preferring engineered test data,
    then engineered train data, then the raw labeled CSVs."""
    print(f"Loading {sample_fraction:.0%} sample from data...")
    if os.path.exists(config.ENGINEERED_TEST_FILE):
        df = pd.read_csv(config.ENGINEERED_TEST_FILE)
        print(f"Loaded test data: {len(df)} samples")
    elif os.path.exists(config.ENGINEERED_TRAIN_FILE):
        df = pd.read_csv(config.ENGINEERED_TRAIN_FILE)
        print(f"Loaded train data: {len(df)} samples")
    else:
        data_files = [
            os.path.join(config.DATA_DIR, 'url_data_labeled.csv'),
            os.path.join(config.DATA_DIR, 'data_bal - 20000.csv')
        ]
        df = None
        for file in data_files:
            if os.path.exists(file):
                df = pd.read_csv(file)
                print(f"Loaded raw data: {len(df)} samples")
                break
        if df is None:
            raise FileNotFoundError("No data file found!")

    # Sample at least REPORT_SAMPLE_SIZE rows, but never more than the dataset holds.
    sample_size = max(int(len(df) * sample_fraction), config.REPORT_SAMPLE_SIZE)
    sample_size = min(sample_size, len(df))
    df_sample = df.sample(n=sample_size, random_state=42)
    print(f"Sampled {len(df_sample)} URLs for report generation")
    return df_sample


def prepare_ml_data(df):
    """Build the feature matrix for the classical ML models."""
    # Copy so the fillna calls below modify our own frame rather than a view
    # of df (avoids pandas' SettingWithCopyWarning).
    X = df[config.NUMERICAL_FEATURES + config.CATEGORICAL_FEATURES].copy()
    y = df['label'].values
    X.loc[:, config.NUMERICAL_FEATURES] = X.loc[:, config.NUMERICAL_FEATURES].fillna(-1)
    X.loc[:, config.CATEGORICAL_FEATURES] = X.loc[:, config.CATEGORICAL_FEATURES].fillna('N/A')
    return X, y


def prepare_dl_data(df):
    """Build the scaled numerical feature matrix for the deep models."""
    X = df[config.NUMERICAL_FEATURES].fillna(-1).values
    y = df['label'].values
    scaler_path = os.path.join(config.MODELS_DIR, "dl_scaler.pkl")
    if os.path.exists(scaler_path):
        scaler = joblib.load(scaler_path)
        X_scaled = scaler.transform(X)
    else:
        # Fallback only: fitting on the evaluation sample leaks its statistics,
        # so results are approximate when the training-time scaler is missing.
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
    return X_scaled, y


def predict_ml_models(X, y):
    """Run every available classical model; return hard predictions and
    positive-class probabilities keyed by model name."""
    predictions = {}
    scores = {}
    ml_models = ['logistic', 'svm', 'xgboost']
    for model_name in ml_models:
        model_path = os.path.join(config.MODELS_DIR, f"{model_name}.joblib")
        if not os.path.exists(model_path):
            print(f"WARNING: Model {model_name} not found, skipping...")
            continue
        print(f"Loading {model_name} model...")
        model = joblib.load(model_path)
        y_pred = model.predict(X)
        y_proba = model.predict_proba(X)[:, 1]
        predictions[model_name] = y_pred
        scores[model_name] = y_proba
        acc = accuracy_score(y, y_pred)
        print(f"  {model_name} accuracy: {acc:.4f}")
    return predictions, scores


def predict_dl_models(X, y):
    """Run every available PyTorch model with its custom decision threshold."""
    predictions = {}
    scores = {}
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    input_dim = X.shape[1]
    dl_models_dict = get_dl_models(input_dim)
    for model_name, model in dl_models_dict.items():
        model_path = os.path.join(config.MODELS_DIR, f"{model_name}.pt")
        if not os.path.exists(model_path):
            print(f"WARNING: Model {model_name} not found, skipping...")
            continue
        print(f"Loading {model_name} model...")
        model.load_state_dict(torch.load(model_path, map_location=device,
                                         weights_only=True))
        model.to(device)
        model.eval()
        X_tensor = torch.tensor(X, dtype=torch.float32).to(device)
        with torch.no_grad():
            outputs = model(X_tensor).cpu().numpy().flatten()
        threshold = MODEL_THRESHOLDS.get(model_name, 0.5)
        y_pred = (outputs > threshold).astype(int)
        predictions[model_name] = y_pred
        scores[model_name] = outputs
        acc = accuracy_score(y, y_pred)
        print(f"  {model_name} accuracy: {acc:.4f} (threshold: {threshold})")
        # Free GPU memory before loading the next model.
        del model, X_tensor
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    return predictions, scores


def predict_bert_model(df, y):
    """Run the fine-tuned BERT model directly on the raw URL strings."""
    bert_path = os.path.join(config.BASE_DIR, 'finetuned_bert')
    if not os.path.exists(bert_path):
        print(f"WARNING: BERT model not found at {bert_path}, skipping...")
        return None, None
    if 'url' not in df.columns:
        print("WARNING: 'url' column not found in data, skipping BERT...")
        return None, None
    try:
        print("Loading BERT model...")
        bert_model = FinetunedBERT(bert_path)
        urls = df['url'].tolist()
        batch_size = 32
        all_preds = []
        all_probas = []
        print(f"Processing {len(urls)} URLs in batches of {batch_size}...")
        for i in range(0, len(urls), batch_size):
            batch_urls = urls[i:i + batch_size]
            batch_preds = bert_model.predict(batch_urls)
            batch_probas = bert_model.predict_proba(batch_urls)[:, 1]
            all_preds.extend(batch_preds)
            all_probas.extend(batch_probas)
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        # The fine-tuned BERT's class indices are inverted relative to the
        # labels used here, so flip predictions and probabilities to match
        # the 1 = phishing convention of the other models.
        y_pred = 1 - np.array(all_preds)
        y_proba = 1 - np.array(all_probas)
        acc = accuracy_score(y, y_pred)
        print(f"  BERT accuracy: {acc:.4f}")
        return y_pred, y_proba
    except torch.cuda.OutOfMemoryError:
        print("WARNING: CUDA out of memory for BERT model, skipping...")
        print("  Try reducing batch size or use CPU by setting CUDA_VISIBLE_DEVICES=''")
        return None, None
    except Exception as e:
        print(f"WARNING: Error loading BERT model: {e}")
        return None, None
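
# The helper below is a sketch, not part of the original pipeline: it uses the
# classification_report import above (otherwise unused) to print per-class
# precision/recall/F1 for every model, which the accuracy prints alone do not
# show. The name print_classification_reports is an assumption; if adopted, it
# could be called from main() once all predictions have been collected.
def print_classification_reports(y_true, all_predictions):
    """Print an sklearn classification report for each model's predictions."""
    for model_name, y_pred in all_predictions.items():
        print(f"\n{model_name.upper()} classification report:")
        print(classification_report(y_true, y_pred,
                                    target_names=['Legitimate', 'Phishing']))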

def plot_confusion_matrices(y_true, all_predictions, save_dir):
    """Draw one confusion-matrix heatmap per model on a shared figure grid."""
    print("\nGenerating confusion matrices...")
    n_models = len(all_predictions)
    if n_models == 0:
        print("No predictions to plot!")
        return
    cols = min(3, n_models)
    rows = (n_models + cols - 1) // cols
    fig, axes = plt.subplots(rows, cols, figsize=(6 * cols, 5 * rows))
    # Normalize axes to a flat sequence regardless of the grid shape.
    if n_models == 1:
        axes = [axes]
    else:
        axes = axes.flatten() if rows > 1 else axes
    cmap = sns.color_palette("RdYlGn_r", as_cmap=True)
    for idx, (model_name, y_pred) in enumerate(all_predictions.items()):
        ax = axes[idx]
        cm = confusion_matrix(y_true, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap=cmap, ax=ax,
                    cbar_kws={'label': 'Count'},
                    annot_kws={'size': 14, 'weight': 'bold'})
        ax.set_title(f'{model_name.upper()} Confusion Matrix',
                     fontsize=14, fontweight='bold', color=COLORS['text'])
        ax.set_xlabel('Predicted Label', fontsize=12, fontweight='bold')
        ax.set_ylabel('True Label', fontsize=12, fontweight='bold')
        ax.set_xticklabels(['Legitimate (0)', 'Phishing (1)'])
        ax.set_yticklabels(['Legitimate (0)', 'Phishing (1)'])
    # Remove any unused subplots in the grid.
    for idx in range(n_models, len(axes)):
        fig.delaxes(axes[idx])
    plt.tight_layout()
    save_path = os.path.join(save_dir, 'confusion_matrices.png')
    plt.savefig(save_path, dpi=300, bbox_inches='tight', facecolor='white')
    print(f"Saved confusion matrices to {save_path}")
    plt.close()


def plot_accuracy_comparison(y_true, all_predictions, save_dir):
    """Plot a labeled bar chart comparing accuracy across models."""
    print("\nGenerating accuracy comparison plot...")
    if len(all_predictions) == 0:
        print("No predictions to plot!")
        return
    accuracies = {}
    for model_name, y_pred in all_predictions.items():
        accuracies[model_name] = accuracy_score(y_true, y_pred)
    models = list(accuracies.keys())
    accs = list(accuracies.values())
    colors_list = [COLORS['primary'], COLORS['secondary'], COLORS['tertiary'],
                   COLORS['quaternary'], COLORS['quinary']]
    bar_colors = [colors_list[i % len(colors_list)] for i in range(len(models))]
    fig, ax = plt.subplots(figsize=(12, 7))
    bars = ax.bar(models, accs, color=bar_colors, edgecolor='black',
                  linewidth=2, alpha=0.8)
    # Annotate each bar with its accuracy value.
    for bar, acc in zip(bars, accs):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., height + 0.01,
                f'{acc:.4f}', ha='center', va='bottom',
                fontsize=13, fontweight='bold')
    ax.set_xlabel('Models', fontsize=14, fontweight='bold', color=COLORS['text'])
    ax.set_ylabel('Accuracy', fontsize=14, fontweight='bold', color=COLORS['text'])
    ax.set_title('Model Accuracy Comparison', fontsize=18, fontweight='bold',
                 color=COLORS['text'], pad=20)
    ax.set_ylim([0, 1.1])
    ax.grid(axis='y', alpha=0.3, linestyle='--')
    ax.set_axisbelow(True)
    plt.xticks(rotation=45, ha='right', fontsize=12)
    plt.tight_layout()
    save_path = os.path.join(save_dir, 'accuracy_comparison.png')
    plt.savefig(save_path, dpi=300, bbox_inches='tight', facecolor='white')
    print(f"Saved accuracy comparison to {save_path}")
    plt.close()


def plot_score_vs_label(y_true, all_scores, save_dir):
    """Scatter each model's prediction scores against sample index,
    colored by true label, with the decision threshold drawn in."""
    print("\nGenerating score vs label scatter plots...")
    if len(all_scores) == 0:
        print("No scores to plot!")
        return
    n_models = len(all_scores)
    cols = min(3, n_models)
    rows = (n_models + cols - 1) // cols
    fig, axes = plt.subplots(rows, cols, figsize=(6 * cols, 5 * rows))
    if n_models == 1:
        axes = [axes]
    else:
        axes = axes.flatten() if rows > 1 else axes
    colors_map = {0: COLORS['secondary'], 1: COLORS['primary']}
    for idx, (model_name, scores) in enumerate(all_scores.items()):
        ax = axes[idx]
        for label in [0, 1]:
            mask = y_true == label
            label_name = 'Legitimate' if label == 0 else 'Phishing'
            ax.scatter(np.where(mask)[0], scores[mask], c=colors_map[label],
                       label=label_name, alpha=0.6, s=50,
                       edgecolors='black', linewidth=0.5)
        threshold = MODEL_THRESHOLDS.get(model_name, 0.5)
        ax.axhline(y=threshold, color='red', linestyle='--', linewidth=2,
                   label=f'Threshold ({threshold})', alpha=0.7)
        ax.set_title(f'{model_name.upper()} Prediction Scores',
                     fontsize=14, fontweight='bold', color=COLORS['text'])
        ax.set_xlabel('Sample Index', fontsize=11, fontweight='bold')
        ax.set_ylabel('Prediction Score', fontsize=11, fontweight='bold')
        ax.set_ylim([-0.1, 1.1])
        ax.legend(loc='best', framealpha=0.9)
        ax.grid(True, alpha=0.3, linestyle='--')
    for idx in range(n_models, len(axes)):
        fig.delaxes(axes[idx])
    plt.tight_layout()
    save_path = os.path.join(save_dir, 'score_vs_label.png')
    plt.savefig(save_path, dpi=300, bbox_inches='tight', facecolor='white')
    print(f"Saved score vs label plots to {save_path}")
    plt.close()
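
# Sketch only, not part of the original report: since all_scores already holds
# continuous probabilities, a threshold-free ROC comparison is cheap to add on
# top of the score plots above. plot_roc_curves is a hypothetical name; it
# mirrors the save/close pattern of the other plotting functions and could be
# called from main() alongside them.
def plot_roc_curves(y_true, all_scores, save_dir):
    """Overlay ROC curves (with AUC) for every model's scores."""
    from sklearn.metrics import auc, roc_curve  # local import keeps the sketch self-contained
    if len(all_scores) == 0:
        return
    fig, ax = plt.subplots(figsize=(8, 7))
    for model_name, scores in all_scores.items():
        fpr, tpr, _ = roc_curve(y_true, scores)
        ax.plot(fpr, tpr, linewidth=2,
                label=f'{model_name} (AUC = {auc(fpr, tpr):.3f})')
    ax.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Chance')
    ax.set_xlabel('False Positive Rate', fontsize=12, fontweight='bold')
    ax.set_ylabel('True Positive Rate', fontsize=12, fontweight='bold')
    ax.set_title('ROC Curves', fontsize=14, fontweight='bold', color=COLORS['text'])
    ax.legend(loc='lower right', framealpha=0.9)
    save_path = os.path.join(save_dir, 'roc_curves.png')
    plt.savefig(save_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close()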

def main():
    print("=" * 60)
    print("PHISHING DETECTION MODEL EVALUATION REPORT")
    print("=" * 60)
    print("\nCustom Thresholds Configuration:")
    for model, threshold in MODEL_THRESHOLDS.items():
        print(f"  • {model}: {threshold}")
    print()

    os.makedirs(config.REPORTS_DIR, exist_ok=True)
    os.makedirs(config.MODELS_DIR, exist_ok=True)

    df = load_sample_data(sample_fraction=0.05)
    all_predictions = {}
    all_scores = {}

    # Classical ML models (engineered numerical + categorical features).
    X_ml, y = prepare_ml_data(df)
    ml_preds, ml_scores = predict_ml_models(X_ml, y)
    all_predictions.update(ml_preds)
    all_scores.update(ml_scores)

    # Deep models (scaled numerical features only).
    X_dl, y_dl = prepare_dl_data(df)
    dl_preds, dl_scores = predict_dl_models(X_dl, y_dl)
    all_predictions.update(dl_preds)
    all_scores.update(dl_scores)

    # BERT operates on the raw URL text.
    bert_pred, bert_score = predict_bert_model(df, y)
    if bert_pred is not None:
        all_predictions['bert'] = bert_pred
        all_scores['bert'] = bert_score

    if len(all_predictions) == 0:
        print("\nWARNING: No models found! Please train models first.")
        print("Run: python train_ml.py && python train_dl.py")
        return

    plot_confusion_matrices(y, all_predictions, config.REPORTS_DIR)
    plot_accuracy_comparison(y, all_predictions, config.REPORTS_DIR)
    plot_score_vs_label(y, all_scores, config.REPORTS_DIR)

    print("\n" + "=" * 60)
    print("REPORT GENERATION COMPLETE!")
    print(f"All visualizations saved to: {config.REPORTS_DIR}")
    print("=" * 60)


if __name__ == "__main__":
    main()
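
# Usage note (assumption: this script lives in the project root next to
# config.py and models.py; the script filename below is hypothetical):
#   python train_ml.py && python train_dl.py   # train the models first
#   python generate_report.py                  # writes PNGs to config.REPORTS_DIR
#
# Expected config.py attributes, inferred from the references above; adjust to
# match the actual module:
#   ENGINEERED_TEST_FILE, ENGINEERED_TRAIN_FILE  - engineered-feature CSVs
#   DATA_DIR, MODELS_DIR, REPORTS_DIR, BASE_DIR  - directory paths
#   NUMERICAL_FEATURES, CATEGORICAL_FEATURES     - feature column name lists
#   REPORT_SAMPLE_SIZE                           - minimum evaluation sample size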