Spaces:

mariagrandury
/

recursos-pln-es

Sleeping

App Files Files Community

recursos-pln-es / upload_datasets.py

mariagrandury

implement scripts to update datasets locally

51f7cd4 3 months ago

raw

history blame contribute delete

3.7 kB

	"""
	Script to upload edited CSV files from the data/ folder back to the HuggingFace dataset.

	Upload all configs:
	```bash
	python upload_datasets.py
	```

	Upload a single config:
	```bash
	python upload_datasets.py events
	python upload_datasets.py datasets
	python upload_datasets.py models
	python upload_datasets.py shared_tasks
	python upload_datasets.py initiatives
	```
	"""

	import os

	import pandas as pd
	from datasets import Dataset

	# Dataset configuration
	# Hub repository id: passed to `push_to_hub` and used to build the dataset URL
	# printed after a successful upload.
	DATASET_NAME = "somosnlp/recursos-pln-es"
	# Local folder searched for one `<config>.csv` file per config.
	DATA_FOLDER = "data"

	# All available configs in the dataset. Each name is used both as the CSV
	# basename inside DATA_FOLDER and as the `config_name` pushed to the Hub.
	CONFIGS = ["datasets", "models", "shared_tasks", "events", "initiatives"]


	def upload_config(config_name: str):
	    """Push ``<DATA_FOLDER>/<config_name>.csv`` to the Hub as one dataset config.

	    Args:
	        config_name: Basename of the CSV file (without extension); also used
	            as the config name on the Hub.

	    Returns:
	        True when the push completed, False when the CSV file is missing or
	        any step raised. Errors are printed, never propagated.
	    """
	    csv_path = os.path.join(DATA_FOLDER, f"{config_name}.csv")

	    if not os.path.exists(csv_path):
	        print(f"❌ File not found: {csv_path}")
	        return False

	    try:
	        print(f"📤 Uploading {config_name} config from {csv_path}...")

	        # Read the edited CSV and report its size.
	        frame = pd.read_csv(csv_path)
	        row_count, column_count = frame.shape
	        print(f" 📊 Loaded {row_count} rows, {column_count} columns")

	        # Wrap the frame in a Dataset and publish it under this config.
	        hub_dataset = Dataset.from_pandas(frame)
	        message = f"Update {config_name} dataset"
	        hub_dataset.push_to_hub(
	            DATASET_NAME,
	            config_name=config_name,
	            commit_message=message,
	            token=True,
	        )

	        print(f"✅ Successfully uploaded {config_name} config")
	        return True

	    except Exception as error:
	        # Best-effort boundary: report the failure so callers can continue
	        # with the remaining configs.
	        print(f"❌ Failed to upload {config_name}: {error}")
	        return False


	def upload_all_datasets():
	    """Upload every config listed in CONFIGS and print a summary.

	    Prints a header, verifies that DATA_FOLDER is a directory, calls
	    ``upload_config`` once per config, and finally reports how many uploads
	    succeeded and failed. Returns None in all cases.
	    """
	    print("🚀 Uploading CSV files to HuggingFace dataset...")
	    print(f"Dataset: {DATASET_NAME}")
	    print(f"Source folder: {DATA_FOLDER}/")
	    print()

	    # Use isdir rather than exists: a regular file that happens to share the
	    # folder's name must also be reported as a missing folder, instead of
	    # producing one misleading "File not found" error per config.
	    if not os.path.isdir(DATA_FOLDER):
	        print(f"❌ Error: {DATA_FOLDER}/ folder not found!")
	        print("Run download_datasets.py first to create the CSV files.")
	        return

	    # Upload each config, counting the successes.
	    successful_uploads = 0
	    total_configs = len(CONFIGS)

	    for config_name in CONFIGS:
	        if upload_config(config_name):
	            successful_uploads += 1
	        print()  # Add spacing between configs

	    # Summary
	    print("=" * 50)
	    print("📊 Upload Summary:")
	    print(f" Total configs: {total_configs}")
	    print(f" Successfully uploaded: {successful_uploads}")
	    print(f" Failed: {total_configs - successful_uploads}")

	    if successful_uploads == total_configs:
	        print("🎉 All datasets uploaded successfully!")
	        print(
	            f"🔗 View updated dataset: https://huggingface.co/datasets/{DATASET_NAME}"
	        )
	    else:
	        print("⚠️ Some uploads failed. Check the errors above.")


	def upload_single_config(config_name: str):
	    """Validate a config name against CONFIGS and upload just that config.

	    Args:
	        config_name: Name of the config to upload; must be one of CONFIGS.

	    Prints the list of valid names and returns without uploading when the
	    name is unknown. Returns None in all cases.
	    """
	    is_known = config_name in CONFIGS
	    if not is_known:
	        print(f"❌ Invalid config name: {config_name}")
	        print(f"Available configs: {', '.join(CONFIGS)}")
	        return

	    print(f"🚀 Uploading single config: {config_name}")
	    print()

	    uploaded = upload_config(config_name)
	    if not uploaded:
	        print("❌ Upload failed. Check the error above.")
	        return

	    print("🎉 Upload completed successfully!")
	    dataset_url = f"https://huggingface.co/datasets/{DATASET_NAME}"
	    print(f"🔗 View updated dataset: {dataset_url}")


	if __name__ == "__main__":
	    import sys

	    # The first positional argument, when given, selects a single config;
	    # with no arguments every config is uploaded.
	    if len(sys.argv) <= 1:
	        upload_all_datasets()
	    else:
	        upload_single_config(sys.argv[1])