""" Script to download all configs from the HuggingFace dataset. ```bash python download_datasets.py ``` This downloads all dataset configs as CSV files in the `data/` folder: - `data/datasets.csv` - `data/models.csv` - `data/shared_tasks.csv` - `data/events.csv` - `data/initiatives.csv` """ import os import pandas as pd from datasets import load_dataset # Dataset configuration DATASET_NAME = "somosnlp/recursos-pln-es" DATA_FOLDER = "data" # All available configs in the dataset CONFIGS = ["datasets", "models", "shared_tasks", "events", "initiatives"] def ensure_data_folder(): """Create data folder if it doesn't exist.""" if not os.path.exists(DATA_FOLDER): os.makedirs(DATA_FOLDER) print(f"📁 Created {DATA_FOLDER}/ folder") else: print(f"📁 Using existing {DATA_FOLDER}/ folder") def download_config(config_name: str): """Download a specific config and save as CSV.""" try: print(f"📥 Downloading {config_name} config...") # Load the dataset config dataset = load_dataset(DATASET_NAME, config_name, split="train") df = dataset.to_pandas() # Save as CSV csv_filename = os.path.join(DATA_FOLDER, f"{config_name}.csv") df.to_csv(csv_filename, index=False) print(f"✅ Saved {config_name}.csv ({df.shape[0]} rows, {df.shape[1]} columns)") print(f" Columns: {list(df.columns)}") return True except Exception as e: print(f"❌ Failed to download {config_name}: {e}") return False def download_all_datasets(): """Download all dataset configs as CSV files.""" print("🚀 Downloading all dataset configs from HuggingFace...") print(f"Dataset: {DATASET_NAME}") print(f"Target folder: {DATA_FOLDER}/") print() # Ensure data folder exists ensure_data_folder() print() # Download each config successful_downloads = 0 total_configs = len(CONFIGS) for config_name in CONFIGS: if download_config(config_name): successful_downloads += 1 print() # Add spacing between configs # Summary print("=" * 50) print(f"📊 Download Summary:") print(f" Total configs: {total_configs}") print(f" Successfully downloaded: {successful_downloads}") print(f" Failed: {total_configs - successful_downloads}") if successful_downloads == total_configs: print("🎉 All datasets downloaded successfully!") else: print("⚠️ Some downloads failed. Check the errors above.") print() print("📝 You can now edit the CSV files in the data/ folder:") for config_name in CONFIGS: csv_path = os.path.join(DATA_FOLDER, f"{config_name}.csv") if os.path.exists(csv_path): print(f" ✅ {csv_path}") else: print(f" ❌ {csv_path} (not downloaded)") if __name__ == "__main__": download_all_datasets()