# Source: recursos-pln-es / datasets_resource.py
# Last commit: "fix oauth config" (1af5fea), author avatar: mariagrandury
from datetime import datetime, timezone
import gradio as gr
import pandas as pd
from datasets import Dataset, load_dataset
from constants import (
COUNTRIES,
DATASET_TYPES,
DOMAINS,
LANGUAGES,
TASKS,
format_dataframe_for_display,
format_dataframe_for_html_display,
)
# Hub repository and configuration name passed to load_dataset / push_to_hub.
DATASET_NAME: str = "somosnlp/recursos-pln-es"
CONFIG_NAME: str = "datasets"

# Short identifier (component ids, log messages) and display title for the UI.
RESOURCE_TYPE: str = "datasets"
RESOURCE_TITLE: str = "Datasets"
def load_data() -> pd.DataFrame:
    """Fetch the resource table from the Hub as a pandas DataFrame.

    When loading fails for any reason the error is printed and an empty
    frame that only carries the expected column names is returned instead.
    """
    expected_columns = [
        "name",
        "github_url",
        "huggingface_url",
        "zenodo_url",
        "paper_url",
        "dataset_type",
        "task",
        "domain",
        "website_url",
        "countries",
        "languages",
        "submitted_by",
        "date_submitted",
    ]
    try:
        return load_dataset(DATASET_NAME, CONFIG_NAME, split="train").to_pandas()
    except Exception as e:
        print(f"Could not load {RESOURCE_TYPE} dataset: {e}")
        # Fall back to an empty table that still has the required columns.
        return pd.DataFrame(columns=expected_columns)
def search_and_filter_data(df: pd.DataFrame, search_query: str) -> pd.DataFrame:
    """Return the rows of ``df`` that contain ``search_query`` in any column.

    Every cell is compared through its string form using a case-insensitive,
    *literal* substring match, so user input such as ``"c++"`` or ``"("`` is
    never interpreted as a regular expression.

    Args:
        df: Table to filter.
        search_query: Text typed by the user; empty (or ``None``) disables
            filtering.

    Returns:
        ``df`` itself when there is nothing to filter, otherwise the subset of
        rows with at least one matching cell.
    """
    if not search_query or df.empty:
        return df

    def column_matches(column: pd.Series) -> pd.Series:
        # regex=False: treat the query as plain text, not as a pattern.
        return column.astype(str).str.contains(
            search_query, case=False, regex=False, na=False
        )

    # Column-wise matching, then keep the rows where any column matched.
    row_mask = df.apply(column_matches).any(axis=1)
    return df[row_mask]
def validate_url(url: str) -> bool:
    """Check that ``url`` is empty or an http(s) URL with a host part.

    Args:
        url: Value of a URL form field.

    Returns:
        ``True`` for an empty value (URL fields are optional) and for strings
        that start with ``http://`` or ``https://`` and contain a network
        location; ``False`` otherwise (e.g. ``"https://"`` or ``"ftp://x"``).
    """
    from urllib.parse import urlsplit

    if not url:
        return True  # Empty URLs are allowed for optional fields
    if not url.startswith(("http://", "https://")):
        return False
    try:
        # A bare scheme such as "https://" has no netloc and is not a usable URL.
        return bool(urlsplit(url).netloc)
    except ValueError:
        # urlsplit rejects some malformed inputs (e.g. broken IPv6 brackets).
        return False
def submit_resource(
    name: str,
    github_url: str,
    huggingface_url: str,
    zenodo_url: str,
    paper_url: str,
    dataset_type: str,
    task: list,
    domain: list,
    website_url: str,
    countries: list,
    languages: list,
    profile: gr.OAuthProfile | None,
):
    """Validate a new resource and append it to the Hub dataset.

    Args:
        name: Resource name (required).
        github_url, huggingface_url, zenodo_url: Locations of the resource;
            at least one must be given.
        paper_url, website_url: Optional links.
        dataset_type: Selected dataset type (required).
        task, domain, countries, languages: Multi-select values; stored as
            comma-separated strings.
        profile: OAuth profile of the logged-in user, or ``None``.

    Returns:
        A status message for the UI (success or a description of the error).
    """
    # Login required
    if profile is None:
        return "❌ Error: You need to be logged in to submit a resource."

    # Normalise text inputs once so validation and storage see the same value.
    name = (name or "").strip()
    github_url = (github_url or "").strip()
    huggingface_url = (huggingface_url or "").strip()
    zenodo_url = (zenodo_url or "").strip()
    paper_url = (paper_url or "").strip()
    website_url = (website_url or "").strip()

    # Validate required fields
    if not name:
        return "❌ Error: Name is required."
    if not dataset_type:
        return "❌ Error: Dataset type is required."

    # Validate that at least one URL is provided
    if not any([github_url, huggingface_url, zenodo_url]):
        return "❌ Error: At least one of GitHub URL, Hugging Face URL, or Zenodo URL must be provided."

    # Validate URLs
    urls_to_check = [
        ("GitHub URL", github_url),
        ("Hugging Face URL", huggingface_url),
        ("Zenodo URL", zenodo_url),
        ("Paper URL", paper_url),
        ("Website URL", website_url),
    ]
    for url_name, url_value in urls_to_check:
        if url_value and not validate_url(url_value):
            return f"❌ Error: {url_name} must be a valid URL starting with http:// or https://"

    try:
        username = profile.username
        current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

        new_row = pd.DataFrame(
            [
                {
                    "name": name,
                    "github_url": github_url,
                    "huggingface_url": huggingface_url,
                    "zenodo_url": zenodo_url,
                    "paper_url": paper_url,
                    "dataset_type": dataset_type,
                    "task": ", ".join(task) if task else "",
                    "domain": ", ".join(domain) if domain else "",
                    "website_url": website_url,
                    "countries": ", ".join(countries) if countries else "",
                    "languages": ", ".join(languages) if languages else "",
                    "submitted_by": username,
                    "date_submitted": current_time,
                }
            ]
        )

        try:
            existing_df = load_dataset(
                DATASET_NAME, CONFIG_NAME, split="train"
            ).to_pandas()
        except (FileNotFoundError, ValueError) as load_error:
            # Only a missing repository/config may start a fresh table. Any
            # other failure (network, auth, ...) propagates to the outer
            # handler so a transient error can never overwrite the existing
            # dataset with a single row.
            print(f"No existing {RESOURCE_TYPE} dataset found: {load_error}")
            updated_df = new_row
        else:
            updated_df = pd.concat([existing_df, new_row], ignore_index=True)

        # Convert back to Dataset and push to hub
        Dataset.from_pandas(updated_df).push_to_hub(
            DATASET_NAME,
            config_name=CONFIG_NAME,
            commit_message=f"Add {name} by {username}",
            # NOTE(review): token=True selects the locally configured Hub
            # token, not a token tied to `profile` - confirm this is intended.
            token=True,
        )
        return f"✅ Success: {name} has been submitted successfully!"
    except Exception as e:
        return f"❌ Error: Failed to submit resource. {str(e)}"
def create_all_tab():
    """Create the 'All' tab: a searchable, refreshable table of resources.

    Returns:
        The ``gr.Dataframe`` component that displays the table.
    """
    # Columns rendered as links and columns hidden from the table; shared by
    # the initial load, the search handler and the refresh handler.
    url_columns = (
        "github_url",
        "huggingface_url",
        "zenodo_url",
        "paper_url",
        "website_url",
    )
    hidden_columns = ("date_submitted",)

    def format_for_table(df):
        return format_dataframe_for_display(
            df,
            url_columns=list(url_columns),
            hide_columns=list(hidden_columns),
        )

    def get_formatted_data():
        return format_for_table(load_data())

    def search_and_format(query):
        # Data is reloaded on every change so results reflect the current
        # dataset contents.
        return format_for_table(search_and_filter_data(load_data(), query))

    with gr.TabItem("📋 All", id=f"{RESOURCE_TYPE}_all"):
        gr.Markdown(f"### All {RESOURCE_TITLE}")
        search_box = gr.Textbox(
            placeholder=f"Search {RESOURCE_TYPE}...",
            label="Filter the table",
            show_label=False,
        )

        table = gr.Dataframe(
            value=get_formatted_data(),
            label=RESOURCE_TITLE,
            show_label=False,
            interactive=False,
            wrap=False,  # Disable wrapping to show full text in single lines
            datatype="markdown",  # Cells are rendered as markdown (clickable links)
        )

        # Connect search functionality
        search_box.change(
            fn=search_and_format,
            inputs=search_box,
            outputs=table,
        )

        # Refresh button to reload data
        refresh_btn = gr.Button("🔄 Refresh Data", variant="secondary")
        refresh_btn.click(fn=get_formatted_data, outputs=table)

    return table
def create_contribute_tab():
    """Create the 'Contribute' tab with the submission form.

    The country and language choices come from the shared ``COUNTRIES`` and
    ``LANGUAGES`` constants so that this form and the edit form always offer
    the same options.

    Returns:
        Tuple of the form components, in this order: name, GitHub URL,
        Hugging Face URL, Zenodo URL, paper URL, dataset type, tasks, domain,
        website URL, countries, languages, submit button, result message.
    """
    with gr.TabItem("➕ Contribute", id=f"{RESOURCE_TYPE}_contribute"):
        gr.Markdown(f"### Contribute a New {RESOURCE_TITLE[:-1]}")

        # Login section
        gr.Markdown("Please log in to contribute resources:")
        gr.LoginButton(elem_id=f"{RESOURCE_TYPE}-oauth-button")

        gr.Markdown("Please fill in the information below to add a new dataset:")
        with gr.Column():
            # Required fields
            name_input = gr.Textbox(
                label="Name *",
                placeholder="Enter the name of the dataset",
                info="The name or title of the dataset (required)",
            )
            dataset_type_input = gr.Dropdown(
                label="Dataset Type *",
                choices=DATASET_TYPES,
                info="Type of dataset (required)",
                multiselect=False,
            )

            # URL fields (at least one required)
            gr.Markdown("**URLs** (at least one required)")
            with gr.Row():
                github_url_input = gr.Textbox(
                    label="GitHub URL",
                    placeholder="https://github.com/...",
                    info="GitHub repository URL",
                )
                huggingface_url_input = gr.Textbox(
                    label="Hugging Face URL",
                    placeholder="https://huggingface.co/datasets/...",
                    info="Hugging Face dataset URL",
                )
                zenodo_url_input = gr.Textbox(
                    label="Zenodo URL",
                    placeholder="https://zenodo.org/...",
                    info="Zenodo repository URL",
                )

            # Optional fields
            gr.Markdown("**Optional Information**")
            paper_url_input = gr.Textbox(
                label="Paper URL",
                placeholder="https://...",
                info="Link to associated research paper",
            )
            website_url_input = gr.Textbox(
                label="Website URL",
                placeholder="https://...",
                info="Project or dataset website",
            )

            # Multi-select fields
            task_input = gr.CheckboxGroup(
                label="Tasks",
                choices=TASKS,
                info="What tasks is this dataset suitable for?",
            )
            domain_input = gr.CheckboxGroup(
                label="Domain",
                choices=DOMAINS,
                info="Specific domains covered by the dataset",
            )
            countries_input = gr.CheckboxGroup(
                label="Countries",
                choices=COUNTRIES,
                info="Countries where Spanish or Portuguese are spoken",
            )
            languages_input = gr.CheckboxGroup(
                label="Languages",
                choices=LANGUAGES,
                info="Languages included in the dataset",
            )

        submit_btn = gr.Button(f"Submit {RESOURCE_TITLE[:-1]}", variant="primary")
        result_msg = gr.Markdown()

        # The `profile` argument of submit_resource is not listed here;
        # presumably Gradio supplies it from the OAuth type annotation.
        submit_btn.click(
            fn=submit_resource,
            inputs=[
                name_input,
                github_url_input,
                huggingface_url_input,
                zenodo_url_input,
                paper_url_input,
                dataset_type_input,
                task_input,
                domain_input,
                website_url_input,
                countries_input,
                languages_input,
            ],
            outputs=[result_msg],
        )

    return (
        name_input,
        github_url_input,
        huggingface_url_input,
        zenodo_url_input,
        paper_url_input,
        dataset_type_input,
        task_input,
        domain_input,
        website_url_input,
        countries_input,
        languages_input,
        submit_btn,
        result_msg,
    )
def search_entries(query: str) -> pd.DataFrame:
    """Search for entries whose name or repository URL contains ``query``.

    The match is a case-insensitive, literal substring match (the query is
    not interpreted as a regular expression) against the ``name``,
    ``github_url``, ``huggingface_url`` and ``zenodo_url`` columns.

    Args:
        query: Text typed by the user; surrounding whitespace is ignored.

    Returns:
        An empty DataFrame for a blank query, otherwise the matching rows of
        the loaded data.
    """
    needle = (query or "").strip()
    if not needle:
        return pd.DataFrame()

    df = load_data()
    if df.empty:
        return df

    mask = pd.Series(False, index=df.index)
    for column in ("name", "github_url", "huggingface_url", "zenodo_url"):
        if column not in df.columns:
            continue
        # Missing values become "" so columns without any string still work.
        values = df[column].fillna("").astype(str)
        mask |= values.str.contains(needle, case=False, regex=False)
    return df[mask]
def load_entry_for_edit(selected_entry: str) -> tuple:
    """Load the form values of the entry named ``selected_entry``.

    Args:
        selected_entry: Name of the entry to load; empty means no selection.

    Returns:
        An 11-tuple ordered as: name, GitHub URL, Hugging Face URL, Zenodo
        URL, paper URL, dataset type, tasks, domains, website URL, countries,
        languages. Text fields are strings, the dataset type is a string or
        ``None``, and multi-select fields are lists. Blank values of those
        types are returned when nothing is selected, no data is available, or
        the name is not found.
    """
    empty_entry = ("", "", "", "", "", None, [], [], "", [], [])

    if not selected_entry:
        return empty_entry

    df = load_data()
    if df.empty:
        return empty_entry

    # Find the entry by name; the first match wins if names are duplicated.
    matches = df[df["name"] == selected_entry]
    if matches.empty:
        return empty_entry
    entry = matches.iloc[0]

    def as_text(value) -> str:
        # Missing cells (None / NaN) are not strings and become "".
        return value if isinstance(value, str) else ""

    def as_list(value) -> list:
        # Convert a comma-separated string back into a list of clean items.
        return [item.strip() for item in as_text(value).split(",") if item.strip()]

    return (
        as_text(entry["name"]),
        as_text(entry["github_url"]),
        as_text(entry["huggingface_url"]),
        as_text(entry["zenodo_url"]),
        as_text(entry["paper_url"]),
        as_text(entry["dataset_type"]) or None,
        as_list(entry["task"]),
        as_list(entry["domain"]),
        as_text(entry["website_url"]),
        as_list(entry["countries"]),
        as_list(entry["languages"]),
    )
def update_entry(
    original_name: str,
    name: str,
    github_url: str,
    huggingface_url: str,
    zenodo_url: str,
    paper_url: str,
    dataset_type: str,
    task: list,
    domain: list,
    website_url: str,
    countries: list,
    languages: list,
    profile: gr.OAuthProfile | None,
):
    """Validate the edited values and overwrite an existing entry on the Hub.

    Every row whose ``name`` equals ``original_name`` is updated and its
    ``date_submitted`` is set to the current UTC time.

    Args:
        original_name: Name of the entry as it was when loaded for editing.
        name: New name (required).
        github_url, huggingface_url, zenodo_url: At least one is required.
        paper_url, website_url: Optional links.
        dataset_type: Selected dataset type (required).
        task, domain, countries, languages: Multi-select values; stored as
            comma-separated strings.
        profile: OAuth profile of the logged-in user, or ``None``.

    Returns:
        A status message for the UI (success or a description of the error).
    """
    # Login required
    if profile is None:
        return "❌ Error: You need to be logged in to edit entries."
    username = profile.username
    if not username:
        return "❌ Could not get username from profile."
    if not original_name:
        return "❌ No entry selected to edit."

    # Normalise text inputs; form fields may arrive as None for empty cells.
    name = (name or "").strip()
    github_url = (github_url or "").strip()
    huggingface_url = (huggingface_url or "").strip()
    zenodo_url = (zenodo_url or "").strip()
    paper_url = (paper_url or "").strip()
    website_url = (website_url or "").strip()

    if not name:
        return "❌ Name is required."
    if not dataset_type:
        return "❌ Dataset type is required."

    # Validate that at least one URL is provided
    if not any([github_url, huggingface_url, zenodo_url]):
        return "❌ At least one URL (GitHub, Hugging Face, or Zenodo) is required."

    # Validate URLs
    for url_field, url_value in [
        ("GitHub URL", github_url),
        ("Hugging Face URL", huggingface_url),
        ("Zenodo URL", zenodo_url),
        ("Paper URL", paper_url),
        ("Website URL", website_url),
    ]:
        if url_value and not validate_url(url_value):
            return f"❌ Invalid {url_field}. Please provide a valid URL."

    try:
        # Load existing dataset
        existing_df = load_dataset(
            DATASET_NAME, CONFIG_NAME, split="train"
        ).to_pandas()

        # Find the entry
        mask = existing_df["name"] == original_name
        if not mask.any():
            return f"❌ Entry '{original_name}' not found."

        updates = {
            "name": name,
            "github_url": github_url,
            "huggingface_url": huggingface_url,
            "zenodo_url": zenodo_url,
            "paper_url": paper_url,
            "dataset_type": dataset_type,
            "task": ", ".join(task) if task else "",
            "domain": ", ".join(domain) if domain else "",
            "website_url": website_url,
            "countries": ", ".join(countries) if countries else "",
            "languages": ", ".join(languages) if languages else "",
            "date_submitted": datetime.now(timezone.utc).strftime(
                "%Y-%m-%dT%H:%M:%SZ"
            ),
        }
        for column, value in updates.items():
            existing_df.loc[mask, column] = value

        # Convert back to Dataset and push to hub
        Dataset.from_pandas(existing_df).push_to_hub(
            DATASET_NAME,
            config_name=CONFIG_NAME,
            commit_message=f"Update dataset entry: {name} (edited by {username})",
            token=True,  # Same token handling as submit_resource
        )
        return f"✅ Successfully updated '{name}'!"
    except Exception as e:
        return f"❌ Error updating entry: {str(e)}"
def create_edit_tab():
    """Create the 'Edit' tab for searching and modifying existing entries.

    Returns:
        Tuple of the tab's components, in this order: search input, search
        button, results dropdown, edit form column, name, dataset type,
        GitHub URL, Hugging Face URL, Zenodo URL, paper URL, website URL,
        tasks, domain, countries, languages, update button, result message.
    """
    with gr.TabItem("✏️ Edit", id=f"{RESOURCE_TYPE}_edit"):
        gr.Markdown(f"### Edit Existing {RESOURCE_TITLE}")
        gr.Markdown("Please log in to edit entries:")
        gr.LoginButton(elem_id=f"{RESOURCE_TYPE}-edit-oauth-button")

        gr.Markdown("Search for an entry to edit:")
        with gr.Row():
            search_input = gr.Textbox(
                label="Search by name or URL",
                placeholder="Enter dataset name or URL to search...",
                scale=3,
            )
            search_btn = gr.Button("🔍 Search", scale=1)
        search_results = gr.Dropdown(
            label="Select entry to edit", choices=[], interactive=True
        )

        gr.Markdown("---")
        gr.Markdown("**Edit the selected entry:**")
        # Hidden until an entry is selected from the search results.
        with gr.Column(visible=False) as edit_form:
            name_input = gr.Textbox(label="Name *", placeholder="Dataset name")
            dataset_type_input = gr.Dropdown(
                label="Dataset Type *",
                choices=DATASET_TYPES,
                value="benchmark",
            )

            gr.Markdown("**URLs** (at least one required)")
            with gr.Row():
                github_url_input = gr.Textbox(
                    label="GitHub URL", placeholder="https://github.com/..."
                )
                huggingface_url_input = gr.Textbox(
                    label="Hugging Face URL",
                    placeholder="https://huggingface.co/datasets/...",
                )
                zenodo_url_input = gr.Textbox(
                    label="Zenodo URL", placeholder="https://zenodo.org/..."
                )

            gr.Markdown("**Optional Information**")
            paper_url_input = gr.Textbox(
                label="Paper URL", placeholder="https://arxiv.org/..."
            )
            website_url_input = gr.Textbox(
                label="Website URL", placeholder="https://..."
            )
            task_input = gr.CheckboxGroup(
                label="Tasks",
                choices=TASKS,
            )
            domain_input = gr.CheckboxGroup(
                label="Domain",
                choices=DOMAINS,
            )
            countries_input = gr.CheckboxGroup(
                label="Countries",
                choices=COUNTRIES,
            )
            languages_input = gr.CheckboxGroup(
                label="Languages",
                choices=LANGUAGES,
            )

            update_btn = gr.Button("💾 Update Entry", variant="primary")
            result_msg = gr.Markdown()

        # Store the original name for updating
        original_name_state = gr.State("")

        def search_and_update_dropdown(query):
            results_df = search_entries(query)
            # An empty result may have no "name" column at all, so it is
            # handled before the column is accessed.
            choices = [] if results_df.empty else results_df["name"].tolist()
            return gr.Dropdown(choices=choices, value=None)

        def load_entry_and_show_form(selected_entry):
            if not selected_entry:
                # One value per output component (13 in total): hide the form,
                # clear the stored name, then blank every field with a value
                # of the matching type.
                return (
                    gr.Column(visible=False),
                    "",  # original_name_state
                    "",  # name
                    "",  # github_url
                    "",  # huggingface_url
                    "",  # zenodo_url
                    "",  # paper_url
                    None,  # dataset_type
                    [],  # task
                    [],  # domain
                    "",  # website_url
                    [],  # countries
                    [],  # languages
                )
            entry_data = load_entry_for_edit(selected_entry)
            return (gr.Column(visible=True), selected_entry, *entry_data)

        # Event handlers
        search_btn.click(
            fn=search_and_update_dropdown,
            inputs=[search_input],
            outputs=[search_results],
        )
        search_results.change(
            fn=load_entry_and_show_form,
            inputs=[search_results],
            outputs=[
                edit_form,
                original_name_state,
                name_input,
                github_url_input,
                huggingface_url_input,
                zenodo_url_input,
                paper_url_input,
                dataset_type_input,
                task_input,
                domain_input,
                website_url_input,
                countries_input,
                languages_input,
            ],
        )
        # The `profile` argument of update_entry is not listed here;
        # presumably Gradio supplies it from the OAuth type annotation.
        update_btn.click(
            fn=update_entry,
            inputs=[
                original_name_state,
                name_input,
                github_url_input,
                huggingface_url_input,
                zenodo_url_input,
                paper_url_input,
                dataset_type_input,
                task_input,
                domain_input,
                website_url_input,
                countries_input,
                languages_input,
            ],
            outputs=[result_msg],
        )

    return (
        search_input,
        search_btn,
        search_results,
        edit_form,
        name_input,
        dataset_type_input,
        github_url_input,
        huggingface_url_input,
        zenodo_url_input,
        paper_url_input,
        website_url_input,
        task_input,
        domain_input,
        countries_input,
        languages_input,
        update_btn,
        result_msg,
    )
def create_tab():
    """Create the complete tab for this resource type.

    The tab nests three sub-tabs: the table view, the contribution form and
    the edit form.

    Returns:
        A 3-tuple ``(table, inputs, edit_components)`` holding the return
        values of ``create_all_tab``, ``create_contribute_tab`` and
        ``create_edit_tab`` respectively.
    """
    with gr.TabItem(f"📊 {RESOURCE_TITLE}", id=RESOURCE_TYPE):
        with gr.Tabs():
            table = create_all_tab()
            inputs = create_contribute_tab()
            edit_components = create_edit_tab()
    return table, inputs, edit_components