Spaces:

mariagrandury
/

recursos-pln-es

Sleeping

App Files Files Community

recursos-pln-es / datasets_resource.py

mariagrandury

fix oauth config

1af5fea 3 months ago

raw

history blame contribute delete

22.4 kB

	from datetime import datetime, timezone

	import gradio as gr
	import pandas as pd
	from datasets import Dataset, load_dataset

	from constants import (
	COUNTRIES,
	DATASET_TYPES,
	DOMAINS,
	LANGUAGES,
	TASKS,
	format_dataframe_for_display,
	format_dataframe_for_html_display,
	)

	# Dataset configuration
	# Hub repository id handed to load_dataset() and push_to_hub() throughout this module.
	DATASET_NAME = "somosnlp/recursos-pln-es"
	# Configuration name passed together with DATASET_NAME when loading and pushing.
	CONFIG_NAME = "datasets"
	# Machine-facing key: used to build tab ids, OAuth button element ids,
	# the search placeholder and the load-failure log message.
	RESOURCE_TYPE = "datasets"
	# Human-facing plural label used in headings; the UI code drops the last
	# character (RESOURCE_TITLE[:-1]) to obtain the singular form.
	RESOURCE_TITLE = "Datasets"


	def load_data() -> pd.DataFrame:
	    """Fetch the resource table from the Hub as a pandas DataFrame.

	    The ``train`` split of ``DATASET_NAME`` / ``CONFIG_NAME`` is loaded and
	    converted with ``to_pandas()``. If anything goes wrong, the error is
	    printed and an empty DataFrame carrying the expected columns is returned
	    so that callers can still index those columns.
	    """
	    expected_columns = [
	        "name",
	        "github_url",
	        "huggingface_url",
	        "zenodo_url",
	        "paper_url",
	        "dataset_type",
	        "task",
	        "domain",
	        "website_url",
	        "countries",
	        "languages",
	        "submitted_by",
	        "date_submitted",
	    ]

	    try:
	        return load_dataset(DATASET_NAME, CONFIG_NAME, split="train").to_pandas()
	    except Exception as error:
	        print(f"Could not load {RESOURCE_TYPE} dataset: {error}")
	        # Fall back to an empty table that still has every required column.
	        return pd.DataFrame(columns=expected_columns)


	def search_and_filter_data(df: pd.DataFrame, search_query: str) -> pd.DataFrame:
	    """Return the rows of ``df`` in which any cell contains ``search_query``.

	    Matching is case-insensitive and *literal*: the query is not interpreted
	    as a regular expression, so input such as ``"c++"`` or ``"("`` is safe.

	    Args:
	        df: Table to filter. Every cell is compared via its string form.
	        search_query: Text typed by the user. Empty or ``None`` disables
	            filtering.

	    Returns:
	        ``df`` itself when there is nothing to filter, otherwise the subset
	        of rows with at least one matching cell.
	    """
	    if not search_query or df.empty:
	        return df

	    # Column-wise matching keeps the work vectorised instead of building a
	    # Series per row.
	    hits = df.astype(str).apply(
	        lambda column: column.str.contains(
	            search_query, case=False, na=False, regex=False
	        )
	    )
	    return df[hits.any(axis=1)]


	def validate_url(url: str) -> bool:
	    """Check whether ``url`` is empty or a well-formed http(s) URL.

	    Args:
	        url: Value of a URL form field. Empty (or ``None``) is accepted
	            because the URL fields are optional.

	    Returns:
	        ``True`` for an empty value or for a URL whose scheme is ``http`` or
	        ``https`` and that names a host; ``False`` otherwise (for example
	        ``"https://"`` alone, another scheme, or embedded whitespace).
	    """
	    from urllib.parse import urlparse

	    if not url:
	        return True  # Empty URLs are allowed for optional fields

	    candidate = url.strip()
	    # Whitespace inside a URL is never valid; reject it before parsing.
	    if any(ch.isspace() for ch in candidate):
	        return False

	    try:
	        parsed = urlparse(candidate)
	    except ValueError:
	        # urlparse raises on malformed bracketed (IPv6) hosts.
	        return False

	    # A scheme prefix alone is not enough: the URL must also name a host.
	    return parsed.scheme in ("http", "https") and bool(parsed.hostname)


	def submit_resource(
	    name: str,
	    github_url: str,
	    huggingface_url: str,
	    zenodo_url: str,
	    paper_url: str,
	    dataset_type: str,
	    task: list,
	    domain: list,
	    website_url: str,
	    countries: list,
	    languages: list,
	    profile: gr.OAuthProfile | None,
	):
	    """Validate a submission and append it to the Hub dataset.

	    Args:
	        name: Dataset name (required).
	        github_url, huggingface_url, zenodo_url: Location of the dataset; at
	            least one must be given.
	        paper_url, website_url: Optional links.
	        dataset_type: Selected dataset type (required).
	        task, domain, countries, languages: Multi-select values; each list is
	            stored as a single ``", "``-joined string.
	        profile: OAuth profile of the logged-in user, or ``None`` when nobody
	            is logged in.

	    Returns:
	        A message starting with ✅ on success or ❌ on any validation or
	        upload failure. Exceptions are reported in the message, not raised.
	    """
	    # Login required
	    if profile is None:
	        return "❌ Error: You need to be logged in to submit a resource."

	    # Normalise the free-text inputs once so that validation and storage
	    # agree (a whitespace-only name must not count as "provided").
	    name = (name or "").strip()
	    github_url = (github_url or "").strip()
	    huggingface_url = (huggingface_url or "").strip()
	    zenodo_url = (zenodo_url or "").strip()
	    paper_url = (paper_url or "").strip()
	    website_url = (website_url or "").strip()

	    # Validate required fields
	    if not name:
	        return "❌ Error: Name is required."

	    if not dataset_type:
	        return "❌ Error: Dataset type is required."

	    # Validate that at least one URL is provided
	    if not any([github_url, huggingface_url, zenodo_url]):
	        return "❌ Error: At least one of GitHub URL, Hugging Face URL, or Zenodo URL must be provided."

	    # Validate URLs
	    urls_to_check = [
	        ("GitHub URL", github_url),
	        ("Hugging Face URL", huggingface_url),
	        ("Zenodo URL", zenodo_url),
	        ("Paper URL", paper_url),
	        ("Website URL", website_url),
	    ]

	    for url_name, url_value in urls_to_check:
	        if url_value and not validate_url(url_value):
	            return f"❌ Error: {url_name} must be a valid URL starting with http:// or https://"

	    try:
	        username = profile.username
	        current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

	        # Create new row data
	        new_data = {
	            "name": name,
	            "github_url": github_url,
	            "huggingface_url": huggingface_url,
	            "zenodo_url": zenodo_url,
	            "paper_url": paper_url,
	            "dataset_type": dataset_type,
	            "task": ", ".join(task) if task else "",
	            "domain": ", ".join(domain) if domain else "",
	            "website_url": website_url,
	            "countries": ", ".join(countries) if countries else "",
	            "languages": ", ".join(languages) if languages else "",
	            "submitted_by": username,
	            "date_submitted": current_time,
	        }
	        new_row = pd.DataFrame([new_data])

	        # Only start a fresh table when the dataset/config is really absent.
	        # Any other load failure (e.g. a network error) must abort instead:
	        # pushing a one-row table would overwrite the existing data.
	        # NOTE(review): assumes `datasets` reports a missing repo as a
	        # FileNotFoundError subclass and a missing config as ValueError -
	        # confirm against the installed version.
	        try:
	            existing_dataset = load_dataset(DATASET_NAME, CONFIG_NAME, split="train")
	        except (FileNotFoundError, ValueError):
	            updated_df = new_row
	        else:
	            updated_df = pd.concat(
	                [existing_dataset.to_pandas(), new_row], ignore_index=True
	            )

	        # Convert back to Dataset and push to hub
	        updated_dataset = Dataset.from_pandas(updated_df)
	        updated_dataset.push_to_hub(
	            DATASET_NAME,
	            config_name=CONFIG_NAME,
	            commit_message=f"Add {name} by {username}",
	            # NOTE(review): token=True presumably resolves to the token
	            # configured for this app, not the logged-in user's OAuth
	            # token (none is passed to this function) - confirm.
	            token=True,
	        )

	        return f"✅ Success: {name} has been submitted successfully!"

	    except Exception as e:
	        return f"❌ Error: Failed to submit resource. {str(e)}"


	def create_all_tab():
	    """Build the 'All' tab: a search box, the resource table and a refresh button.

	    Returns:
	        The ``gr.Dataframe`` component that displays the table.
	    """
	    link_columns = (
	        "github_url",
	        "huggingface_url",
	        "zenodo_url",
	        "paper_url",
	        "website_url",
	    )
	    hidden_columns = ("date_submitted",)

	    def render(frame):
	        # One place for the display formatting shared by both callbacks.
	        return format_dataframe_for_display(
	            frame,
	            url_columns=list(link_columns),
	            hide_columns=list(hidden_columns),
	        )

	    def get_formatted_data():
	        # Reload everything from the Hub and format it for display.
	        return render(load_data())

	    def search_and_format(query):
	        # Reload, narrow down to the rows matching the query, then format.
	        return render(search_and_filter_data(load_data(), query))

	    with gr.TabItem("📋 All", id=f"{RESOURCE_TYPE}_all"):
	        gr.Markdown(f"### All {RESOURCE_TITLE}")

	        search_box = gr.Textbox(
	            placeholder=f"Search {RESOURCE_TYPE}...",
	            label="Filter the table",
	            show_label=False,
	        )

	        table = gr.Dataframe(
	            value=get_formatted_data(),
	            label=RESOURCE_TITLE,
	            show_label=False,
	            interactive=False,
	            wrap=False,  # keep each cell on a single line
	            datatype="markdown",  # cells are treated as markdown
	        )

	        # Re-filter the table whenever the search text changes.
	        search_box.change(
	            fn=search_and_format,
	            inputs=search_box,
	            outputs=table,
	        )

	        # Manual reload of the full table.
	        refresh_btn = gr.Button("🔄 Refresh Data", variant="secondary")
	        refresh_btn.click(fn=get_formatted_data, outputs=table)

	    return table


	def create_contribute_tab():
	    """Build the 'Contribute' tab with the submission form.

	    The form collects the fields expected by ``submit_resource`` and wires
	    the submit button to it; the outcome message is shown in a Markdown
	    component below the button.

	    Returns:
	        A tuple with the eleven input components (in ``submit_resource``
	        argument order), the submit button and the result message component.
	    """
	    with gr.TabItem("➕ Contribute", id=f"{RESOURCE_TYPE}_contribute"):
	        gr.Markdown(f"### Contribute a New {RESOURCE_TITLE[:-1]}")

	        # Login section
	        gr.Markdown("Please log in to contribute resources:")
	        gr.LoginButton(elem_id=f"{RESOURCE_TYPE}-oauth-button")

	        gr.Markdown("Please fill in the information below to add a new dataset:")

	        with gr.Column():
	            # Required fields
	            name_input = gr.Textbox(
	                label="Name *",
	                placeholder="Enter the name of the dataset",
	                info="The name or title of the dataset (required)",
	            )

	            dataset_type_input = gr.Dropdown(
	                label="Dataset Type *",
	                choices=DATASET_TYPES,
	                info="Type of dataset (required)",
	                multiselect=False,
	            )

	            # URL fields (at least one required)
	            gr.Markdown("URLs (at least one required)")
	            with gr.Row():
	                github_url_input = gr.Textbox(
	                    label="GitHub URL",
	                    placeholder="https://github.com/...",
	                    info="GitHub repository URL",
	                )
	                huggingface_url_input = gr.Textbox(
	                    label="Hugging Face URL",
	                    placeholder="https://huggingface.co/datasets/...",
	                    info="Hugging Face dataset URL",
	                )

	            zenodo_url_input = gr.Textbox(
	                label="Zenodo URL",
	                placeholder="https://zenodo.org/...",
	                info="Zenodo repository URL",
	            )

	            # Optional fields
	            gr.Markdown("Optional Information")
	            paper_url_input = gr.Textbox(
	                label="Paper URL",
	                placeholder="https://...",
	                info="Link to associated research paper",
	            )

	            website_url_input = gr.Textbox(
	                label="Website URL",
	                placeholder="https://...",
	                info="Project or dataset website",
	            )

	            # Multi-select fields
	            task_input = gr.CheckboxGroup(
	                label="Tasks",
	                choices=TASKS,
	                info="What tasks is this dataset suitable for?",
	            )

	            domain_input = gr.CheckboxGroup(
	                label="Domain",
	                choices=DOMAINS,
	                info="Specific domains covered by the dataset",
	            )

	            # Use the shared COUNTRIES / LANGUAGES constants (the same ones
	            # the edit form uses) so that every value that can be submitted
	            # here can also be selected again when the entry is edited.
	            countries_input = gr.CheckboxGroup(
	                label="Countries",
	                choices=COUNTRIES,
	                info="Countries where Spanish or Portuguese are spoken",
	            )

	            languages_input = gr.CheckboxGroup(
	                label="Languages",
	                choices=LANGUAGES,
	                info="Languages included in the dataset",
	            )

	        submit_btn = gr.Button(f"Submit {RESOURCE_TITLE[:-1]}", variant="primary")
	        result_msg = gr.Markdown()

	        # The OAuth profile argument of submit_resource is not listed here;
	        # presumably Gradio injects it from the type annotation - verify.
	        submit_btn.click(
	            fn=submit_resource,
	            inputs=[
	                name_input,
	                github_url_input,
	                huggingface_url_input,
	                zenodo_url_input,
	                paper_url_input,
	                dataset_type_input,
	                task_input,
	                domain_input,
	                website_url_input,
	                countries_input,
	                languages_input,
	            ],
	            outputs=[result_msg],
	        )

	    return (
	        name_input,
	        github_url_input,
	        huggingface_url_input,
	        zenodo_url_input,
	        paper_url_input,
	        dataset_type_input,
	        task_input,
	        domain_input,
	        website_url_input,
	        countries_input,
	        languages_input,
	        submit_btn,
	        result_msg,
	    )


	def search_entries(query: str) -> pd.DataFrame:
	    """Find entries whose name or primary URL contains ``query``.

	    The comparison is case-insensitive and literal (the query is not a
	    regular expression). Surrounding whitespace in the query is ignored.

	    Args:
	        query: Text to look for; empty, whitespace-only or ``None`` yields
	            no results.

	    Returns:
	        The matching rows of the loaded table, or an empty DataFrame when
	        the query is blank or no data could be loaded.
	    """
	    needle = (query or "").strip()
	    if not needle:
	        return pd.DataFrame()

	    df = load_data()
	    if df.empty:
	        return df

	    # Search in name, github_url, huggingface_url, and zenodo_url columns
	    mask = pd.Series(False, index=df.index)
	    for column in ("name", "github_url", "huggingface_url", "zenodo_url"):
	        # regex=False: user input such as "c++" must not be parsed as a pattern.
	        mask = mask | df[column].str.contains(
	            needle, case=False, na=False, regex=False
	        )

	    return df[mask]


	def load_entry_for_edit(selected_entry: str) -> tuple:
	    """Load the stored values of one entry for the edit form.

	    Args:
	        selected_entry: Name of the entry to load (matched exactly against
	            the ``name`` column). When several rows share the name, the
	            first one is used.

	    Returns:
	        An 11-tuple ordered as: name, github_url, huggingface_url,
	        zenodo_url, paper_url, dataset_type, task list, domain list,
	        website_url, countries list, languages list. When nothing is
	        selected, no data is available, or the entry cannot be found, an
	        "empty form" is returned: ``""`` for text fields, ``None`` for the
	        dataset type and ``[]`` for the multi-select fields.
	    """
	    empty_form = ("", "", "", "", "", None, [], [], "", [], [])

	    def as_text(value):
	        # Missing cells may come back as None or NaN; treat both as empty.
	        return value if isinstance(value, str) else ""

	    def as_list(value):
	        # Multi-select values are stored as one comma-separated string.
	        return [part.strip() for part in as_text(value).split(",") if part.strip()]

	    if not selected_entry:
	        return empty_form

	    df = load_data()
	    if df.empty:
	        return empty_form

	    # Find the entry by name; it may be gone if the data changed since the search.
	    matches = df[df["name"] == selected_entry]
	    if matches.empty:
	        return empty_form
	    entry = matches.iloc[0]

	    return (
	        as_text(entry["name"]),
	        as_text(entry["github_url"]),
	        as_text(entry["huggingface_url"]),
	        as_text(entry["zenodo_url"]),
	        as_text(entry["paper_url"]),
	        as_text(entry["dataset_type"]) or None,
	        as_list(entry["task"]),
	        as_list(entry["domain"]),
	        as_text(entry["website_url"]),
	        as_list(entry["countries"]),
	        as_list(entry["languages"]),
	    )


	def update_entry(
	    original_name: str,
	    name: str,
	    github_url: str,
	    huggingface_url: str,
	    zenodo_url: str,
	    paper_url: str,
	    dataset_type: str,
	    task: list,
	    domain: list,
	    website_url: str,
	    countries: list,
	    languages: list,
	    profile: gr.OAuthProfile | None,
	):
	    """Overwrite an existing entry in the Hub dataset with the form values.

	    Args:
	        original_name: Name of the entry as it was when loaded into the
	            form; used to locate the row(s) to update.
	        name: New name (required).
	        github_url, huggingface_url, zenodo_url: At least one is required.
	        paper_url, website_url: Optional links.
	        dataset_type: Selected dataset type (required).
	        task, domain, countries, languages: Multi-select values; each list is
	            stored as a single ``", "``-joined string.
	        profile: OAuth profile of the logged-in user, or ``None``.

	    Returns:
	        A message starting with ✅ on success or ❌ on failure. Exceptions
	        raised while loading or pushing are reported in the message.

	    Note:
	        Every row whose ``name`` equals ``original_name`` is updated, and
	        ``date_submitted`` is replaced by the time of the edit.
	    """
	    # Login required
	    if profile is None:
	        return "❌ Error: You need to be logged in to edit entries."

	    username = profile.username
	    if not username:
	        return "❌ Could not get username from profile."

	    if not original_name:
	        return "❌ No entry selected to edit."

	    # Normalise the free-text inputs once so that the values that are
	    # validated are exactly the values that get stored.
	    name = (name or "").strip()
	    github_url = (github_url or "").strip()
	    huggingface_url = (huggingface_url or "").strip()
	    zenodo_url = (zenodo_url or "").strip()
	    paper_url = (paper_url or "").strip()
	    website_url = (website_url or "").strip()

	    if not name:
	        return "❌ Name is required."

	    # The form marks the dataset type as required, like the submission form.
	    if not dataset_type:
	        return "❌ Dataset type is required."

	    # Validate that at least one URL is provided
	    if not any([github_url, huggingface_url, zenodo_url]):
	        return "❌ At least one URL (GitHub, Hugging Face, or Zenodo) is required."

	    # Validate URLs
	    for url_field, url_value in [
	        ("GitHub URL", github_url),
	        ("Hugging Face URL", huggingface_url),
	        ("Zenodo URL", zenodo_url),
	        ("Paper URL", paper_url),
	        ("Website URL", website_url),
	    ]:
	        if url_value and not validate_url(url_value):
	            return f"❌ Invalid {url_field}. Please provide a valid URL."

	    try:
	        # Load existing dataset
	        existing_dataset = load_dataset(DATASET_NAME, CONFIG_NAME, split="train")
	        existing_df = existing_dataset.to_pandas()

	        # Find and update the entry
	        mask = existing_df["name"] == original_name
	        if not mask.any():
	            return f"❌ Entry '{original_name}' not found."

	        current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

	        new_values = {
	            "name": name,
	            "github_url": github_url,
	            "huggingface_url": huggingface_url,
	            "zenodo_url": zenodo_url,
	            "paper_url": paper_url,
	            "dataset_type": dataset_type,
	            "task": ", ".join(task) if task else "",
	            "domain": ", ".join(domain) if domain else "",
	            "website_url": website_url,
	            "countries": ", ".join(countries) if countries else "",
	            "languages": ", ".join(languages) if languages else "",
	            "date_submitted": current_time,
	        }
	        for column, value in new_values.items():
	            existing_df.loc[mask, column] = value

	        # Convert back to Dataset and push to hub
	        updated_dataset = Dataset.from_pandas(existing_df)
	        updated_dataset.push_to_hub(
	            DATASET_NAME,
	            config_name=CONFIG_NAME,
	            commit_message=f"Update dataset entry: {name} (edited by {username})",
	            token=True,  # same authentication setting as submit_resource
	        )

	        return f"✅ Successfully updated '{name}'!"

	    except Exception as e:
	        return f"❌ Error updating entry: {str(e)}"


	def create_edit_tab():
	    """Build the 'Edit' tab: search for an entry, load it into a form, update it.

	    Flow: the search button fills the dropdown with matching entry names;
	    choosing a name reveals the form pre-filled with the stored values and
	    remembers the original name; the update button sends the form values and
	    the remembered name to ``update_entry``.

	    Returns:
	        A tuple with the search box, search button, results dropdown, the
	        form column, the eleven form inputs, the update button and the
	        result message component.
	    """
	    with gr.TabItem("✏️ Edit", id=f"{RESOURCE_TYPE}_edit"):
	        gr.Markdown(f"### Edit Existing {RESOURCE_TITLE}")
	        gr.Markdown("Please log in to edit entries:")
	        gr.LoginButton(elem_id=f"{RESOURCE_TYPE}-edit-oauth-button")

	        gr.Markdown("Search for an entry to edit:")

	        with gr.Row():
	            search_input = gr.Textbox(
	                label="Search by name or URL",
	                placeholder="Enter dataset name or URL to search...",
	                scale=3,
	            )
	            search_btn = gr.Button("🔍 Search", scale=1)

	        search_results = gr.Dropdown(
	            label="Select entry to edit", choices=[], interactive=True
	        )

	        gr.Markdown("---")
	        gr.Markdown("Edit the selected entry:")

	        # Hidden until an entry has been selected.
	        with gr.Column(visible=False) as edit_form:
	            name_input = gr.Textbox(label="Name *", placeholder="Dataset name")
	            dataset_type_input = gr.Dropdown(
	                label="Dataset Type *",
	                choices=DATASET_TYPES,
	                value="benchmark",
	            )

	            gr.Markdown("URLs (at least one required)")
	            with gr.Row():
	                github_url_input = gr.Textbox(
	                    label="GitHub URL", placeholder="https://github.com/..."
	                )
	                huggingface_url_input = gr.Textbox(
	                    label="Hugging Face URL",
	                    placeholder="https://huggingface.co/datasets/...",
	                )
	                zenodo_url_input = gr.Textbox(
	                    label="Zenodo URL", placeholder="https://zenodo.org/..."
	                )

	            gr.Markdown("Optional Information")
	            paper_url_input = gr.Textbox(
	                label="Paper URL", placeholder="https://arxiv.org/..."
	            )
	            website_url_input = gr.Textbox(
	                label="Website URL", placeholder="https://..."
	            )

	            task_input = gr.CheckboxGroup(
	                label="Tasks",
	                choices=TASKS,
	            )

	            domain_input = gr.CheckboxGroup(
	                label="Domain",
	                choices=DOMAINS,
	            )

	            countries_input = gr.CheckboxGroup(
	                label="Countries",
	                choices=COUNTRIES,
	            )

	            languages_input = gr.CheckboxGroup(
	                label="Languages",
	                choices=LANGUAGES,
	            )

	            update_btn = gr.Button("💾 Update Entry", variant="primary")
	            result_msg = gr.Markdown()

	        # Store the original name for updating
	        original_name_state = gr.State("")

	        # Form inputs in the order load_entry_for_edit returns them and
	        # update_entry expects them (after the original name).
	        form_fields = [
	            name_input,
	            github_url_input,
	            huggingface_url_input,
	            zenodo_url_input,
	            paper_url_input,
	            dataset_type_input,
	            task_input,
	            domain_input,
	            website_url_input,
	            countries_input,
	            languages_input,
	        ]

	        def search_and_update_dropdown(query):
	            results_df = search_entries(query)
	            if results_df.empty:
	                choices = []
	            else:
	                # dict.fromkeys drops duplicate names while keeping order.
	                choices = list(dict.fromkeys(results_df["name"].tolist()))
	            return gr.Dropdown(choices=choices, value=None)

	        def load_entry_and_show_form(selected_entry):
	            if not selected_entry:
	                # One value per output: hidden form, cleared original name,
	                # then an empty value of the right type for each form field
	                # (text -> "", dropdown -> None, checkbox groups -> []).
	                return (
	                    gr.Column(visible=False),
	                    "",
	                    "",
	                    "",
	                    "",
	                    "",
	                    "",
	                    None,
	                    [],
	                    [],
	                    "",
	                    [],
	                    [],
	                )

	            entry_data = load_entry_for_edit(selected_entry)
	            return (gr.Column(visible=True), selected_entry, *entry_data)

	        # Event handlers
	        search_btn.click(
	            fn=search_and_update_dropdown,
	            inputs=[search_input],
	            outputs=[search_results],
	        )

	        search_results.change(
	            fn=load_entry_and_show_form,
	            inputs=[search_results],
	            outputs=[edit_form, original_name_state, *form_fields],
	        )

	        update_btn.click(
	            fn=update_entry,
	            inputs=[original_name_state, *form_fields],
	            outputs=[result_msg],
	        )

	    return (
	        search_input,
	        search_btn,
	        search_results,
	        edit_form,
	        name_input,
	        dataset_type_input,
	        github_url_input,
	        huggingface_url_input,
	        zenodo_url_input,
	        paper_url_input,
	        website_url_input,
	        task_input,
	        domain_input,
	        countries_input,
	        languages_input,
	        update_btn,
	        result_msg,
	    )


	def create_tab():
	    """Assemble the top-level tab for this resource type.

	    The tab hosts a nested tab group holding the 'All', 'Contribute' and
	    'Edit' sub-tabs, created in that order.

	    Returns:
	        A 3-tuple: the table component of the 'All' tab, the components
	        returned by the 'Contribute' tab, and those returned by the 'Edit' tab.
	    """
	    with gr.TabItem(f"📊 {RESOURCE_TITLE}", id=RESOURCE_TYPE), gr.Tabs():
	        all_table = create_all_tab()
	        contribute_components = create_contribute_tab()
	        edit_components = create_edit_tab()

	    return all_table, contribute_components, edit_components