import os import glob import pandas as pd import config from feature_extraction import extract_features_from_dataframe def load_and_sample_raw_data(data_dir, fraction=0.1, random_state=42): raw_data_files = glob.glob(os.path.join(data_dir, "*.csv")) if not raw_data_files: print(f"Error: No .csv files found in '{data_dir}'.") print("Please add your raw data files (e.g., phishing.csv, legit.csv) to the /data/ folder.") return pd.DataFrame() print(f"Found {len(raw_data_files)} raw data files.") all_samples = [] for file_path in raw_data_files: try: print(f"Loading and sampling {os.path.basename(file_path)}...") df = pd.read_csv(file_path, on_bad_lines='skip') if 'label' not in df.columns or 'url' not in df.columns: print(f"Warning: Skipping {file_path}. Must contain 'label' and 'url' columns.") continue sample_df = df.sample(frac=fraction, random_state=random_state) all_samples.append(sample_df) except Exception as e: print(f"Error processing {file_path}: {e}") if not all_samples: print("Error: No valid data could be loaded.") return pd.DataFrame() combined_df = pd.concat(all_samples, ignore_index=True) combined_df = combined_df.sample(frac=0.1, random_state=random_state).reset_index(drop=True) print(f"Total raw training data prepared: {len(combined_df)} samples.") return combined_df def main(): print("--- Starting Data Pipeline ---") raw_df = load_and_sample_raw_data( data_dir=config.DATA_DIR, fraction=config.TRAIN_SAMPLE_FRACTION ) if raw_df.empty: print("Data pipeline failed. Exiting.") return engineered_df = extract_features_from_dataframe(raw_df) engineered_df.to_csv(config.ENGINEERED_TRAIN_FILE, index=False) print(f"\n--- Data Pipeline Complete ---") print(f"Engineered training set saved to: {config.ENGINEERED_TRAIN_FILE}") print(f"Total features: {len(config.ALL_FEATURE_COLUMNS)}") if __name__ == "__main__": os.makedirs(config.DATA_DIR, exist_ok=True) if not glob.glob(os.path.join(config.DATA_DIR, "*.csv")): print("Creating dummy data files...") dummy_phish = pd.DataFrame({ 'label': [1, 1], 'url': ['facebook.com.login-support.ru', 'myetherwallets.kr/wallet'] }) dummy_phish.to_csv(os.path.join(config.DATA_DIR, 'phishing_data_1.csv'), index=False) dummy_legit = pd.DataFrame({ 'label': [0, 0], 'url': ['google.com', 'https://www.millect.com/Plans'] }) dummy_legit.to_csv(os.path.join(config.DATA_DIR, 'legit_data_1.csv'), index=False) print(f"Dummy files created in {config.DATA_DIR}. Please replace them with your real data.") main()