recursos-pln-es / upload_datasets.py
mariagrandury's picture
implement scripts to update datasets locally
51f7cd4
"""
Script to upload edited CSV files from the data/ folder back to the HuggingFace dataset.
Upload all configs:
```bash
python upload_datasets.py
```
Upload a single config:
```bash
python upload_datasets.py events
python upload_datasets.py datasets
python upload_datasets.py models
python upload_datasets.py shared_tasks
python upload_datasets.py initiatives
```
"""
import os
import pandas as pd
from datasets import Dataset
# Dataset configuration: Hub repository ID that every config is pushed to.
DATASET_NAME = "somosnlp/recursos-pln-es"
# Local folder (relative to the current working directory) that holds one
# "<config>.csv" file per config.
DATA_FOLDER = "data"
# All available configs in the dataset. Each name is used both as the CSV
# file stem inside DATA_FOLDER and as the `config_name` passed to push_to_hub.
CONFIGS = ["datasets", "models", "shared_tasks", "events", "initiatives"]
def upload_config(config_name: str) -> bool:
    """Upload one CSV file from the data folder as a config of the Hub dataset.

    Reads ``<DATA_FOLDER>/<config_name>.csv`` with pandas, converts the frame
    to a ``datasets.Dataset`` and pushes it to ``DATASET_NAME`` under the
    config ``config_name``. Progress and errors are reported on stdout.

    Args:
        config_name: Name of the config; also the stem of the CSV file.

    Returns:
        True if the push completed; False if the CSV file does not exist or
        if loading/converting/pushing raised an exception.
    """
    csv_filename = os.path.join(DATA_FOLDER, f"{config_name}.csv")
    if not os.path.exists(csv_filename):
        print(f"❌ File not found: {csv_filename}")
        return False
    try:
        print(f"📤 Uploading {config_name} config from {csv_filename}...")
        # Load CSV file
        df = pd.read_csv(csv_filename)
        print(f" 📊 Loaded {df.shape[0]} rows, {df.shape[1]} columns")
        # Convert to Dataset and push to hub
        dataset = Dataset.from_pandas(df)
        dataset.push_to_hub(
            DATASET_NAME,
            config_name=config_name,
            commit_message=f"Update {config_name} dataset",
            # NOTE(review): token=True presumably makes the Hub client use the
            # locally saved login token -- confirm against the installed
            # `datasets` / `huggingface_hub` versions.
            token=True,
        )
        print(f"✅ Successfully uploaded {config_name} config")
        return True
    except Exception as e:
        # Deliberately broad: this is the reporting boundary. A failure in
        # one config is printed and signalled via the return value so that
        # upload_all_datasets can continue with the remaining configs.
        print(f"❌ Failed to upload {config_name}: {e}")
        return False
def upload_all_datasets() -> None:
    """Upload every config listed in ``CONFIGS`` and print a summary.

    Prints a header, checks that ``DATA_FOLDER`` exists (returning early with
    a hint to run ``download_datasets.py`` if it does not), then calls
    ``upload_config`` for each config in order. A failed config does not stop
    the remaining uploads; the final summary reports how many succeeded.

    Returns:
        None. All results are reported on stdout.
    """
    print("🚀 Uploading CSV files to HuggingFace dataset...")
    print(f"Dataset: {DATASET_NAME}")
    print(f"Source folder: {DATA_FOLDER}/")
    print()
    # Check if data folder exists
    if not os.path.exists(DATA_FOLDER):
        print(f"❌ Error: {DATA_FOLDER}/ folder not found!")
        print("Run download_datasets.py first to create the CSV files.")
        return
    # Upload each config, counting the ones that report success
    successful_uploads = 0
    total_configs = len(CONFIGS)
    for config_name in CONFIGS:
        if upload_config(config_name):
            successful_uploads += 1
        print()  # Add spacing between configs
    # Summary
    print("=" * 50)
    print("📊 Upload Summary:")
    print(f" Total configs: {total_configs}")
    print(f" Successfully uploaded: {successful_uploads}")
    print(f" Failed: {total_configs - successful_uploads}")
    if successful_uploads == total_configs:
        print("🎉 All datasets uploaded successfully!")
        print(
            f"🔗 View updated dataset: https://huggingface.co/datasets/{DATASET_NAME}"
        )
    else:
        print("⚠️ Some uploads failed. Check the errors above.")
def upload_single_config(config_name: str) -> None:
    """Validate a config name and upload just that config.

    Args:
        config_name: Must be one of the names in ``CONFIGS``; otherwise the
            list of valid names is printed and nothing is uploaded.

    Returns:
        None. Success or failure is reported on stdout.
    """
    # Reject unknown names up front so a typo yields the list of valid
    # configs instead of a "file not found" message.
    if config_name not in CONFIGS:
        print(f"❌ Invalid config name: {config_name}")
        print(f"Available configs: {', '.join(CONFIGS)}")
        return
    print(f"🚀 Uploading single config: {config_name}")
    print()
    if upload_config(config_name):
        print("🎉 Upload completed successfully!")
        print(
            f"🔗 View updated dataset: https://huggingface.co/datasets/{DATASET_NAME}"
        )
    else:
        print("❌ Upload failed. Check the error above.")
if __name__ == "__main__":
    import sys

    # The first command-line argument, when present, selects a single config;
    # with no arguments every config is uploaded. Extra arguments are ignored.
    cli_args = sys.argv[1:]
    if cli_args:
        upload_single_config(cli_args[0])
    else:
        upload_all_datasets()