from datetime import datetime, timezone

import gradio as gr
import pandas as pd
from datasets import Dataset, load_dataset

from constants import (
    COUNTRIES,
    DATASET_TYPES,
    DOMAINS,
    LANGUAGES,
    TASKS,
    format_dataframe_for_display,
    format_dataframe_for_html_display,
)

# Dataset configuration
DATASET_NAME = "somosnlp/recursos-pln-es"
CONFIG_NAME = "datasets"
RESOURCE_TYPE = "datasets"
RESOURCE_TITLE = "Datasets"

# Columns of the resource table, in storage order.
_COLUMNS = [
    "name",
    "github_url",
    "huggingface_url",
    "zenodo_url",
    "paper_url",
    "dataset_type",
    "task",
    "domain",
    "website_url",
    "countries",
    "languages",
    "submitted_by",
    "date_submitted",
]

# Columns that hold URLs and are rendered as clickable links in the table.
_URL_COLUMNS = [
    "github_url",
    "huggingface_url",
    "zenodo_url",
    "paper_url",
    "website_url",
]

# Columns searched when looking up an entry to edit.
_ENTRY_SEARCH_COLUMNS = ("name", "github_url", "huggingface_url", "zenodo_url")


def _clean(value) -> str:
    """Return *value* without surrounding whitespace; non-strings become ""."""
    return value.strip() if isinstance(value, str) else ""


def _join_choices(values) -> str:
    """Serialize a multi-select value as a comma-separated string."""
    return ", ".join(values) if values else ""


def _split_choices(value) -> list:
    """Parse a comma-separated cell back into a list, skipping blank items."""
    if not isinstance(value, str):
        # Missing cells arrive as None/NaN; NaN is truthy, so test the type.
        return []
    return [item.strip() for item in value.split(",") if item.strip()]


def _empty_entry() -> tuple:
    """Return blank values for the 11 form fields, typed per component."""
    return ("", "", "", "", "", None, [], [], "", [], [])


def _first_invalid_url(labelled_urls) -> str | None:
    """Return the label of the first non-empty URL that fails validation."""
    for label, url in labelled_urls:
        if url and not validate_url(url):
            return label
    return None


def _format_for_display(df: pd.DataFrame):
    """Format *df* for the table: link the URL columns, hide the timestamp."""
    return format_dataframe_for_display(
        df,
        url_columns=_URL_COLUMNS,
        hide_columns=["date_submitted"],
    )


def load_data() -> pd.DataFrame:
    """Load the resource table from the Hugging Face Hub.

    Returns:
        The "train" split of the configured dataset as a DataFrame, or an
        empty DataFrame with the expected columns when loading fails (the
        error is printed rather than raised).
    """
    try:
        dataset = load_dataset(DATASET_NAME, CONFIG_NAME, split="train")
        return dataset.to_pandas()
    except Exception as e:
        print(f"Could not load {RESOURCE_TYPE} dataset: {e}")
        # Return empty DataFrame with required columns
        return pd.DataFrame(columns=list(_COLUMNS))


def search_and_filter_data(df: pd.DataFrame, search_query: str) -> pd.DataFrame:
    """Keep the rows in which any cell contains *search_query*.

    Matching is case-insensitive and literal: the query is user-typed text,
    so it is not interpreted as a regular expression.

    Args:
        df: Table to filter.
        search_query: Text to look for; empty or None returns *df* unchanged.

    Returns:
        The matching rows of *df*.
    """
    if not search_query or df.empty:
        return df
    row_matches = df.apply(
        lambda row: row.astype(str)
        .str.contains(search_query, case=False, regex=False)
        .any(),
        axis=1,
    )
    return df[row_matches]


def validate_url(url: str) -> bool:
    """Check that *url* is blank or starts with ``http://`` / ``https://``.

    Blank values (None, empty, whitespace-only) are accepted because most
    URL fields are optional. Surrounding whitespace is ignored.
    """
    cleaned = _clean(url)
    if not cleaned:
        return True  # Empty URLs are allowed for optional fields
    return cleaned.startswith(("http://", "https://"))


def submit_resource(
    name: str,
    github_url: str,
    huggingface_url: str,
    zenodo_url: str,
    paper_url: str,
    dataset_type: str,
    task: list,
    domain: list,
    website_url: str,
    countries: list,
    languages: list,
    profile: gr.OAuthProfile | None,
):
    """Append a new resource to the dataset and push it to the Hub.

    Args:
        name: Resource name (required).
        github_url: GitHub URL; one of the first three URLs is required.
        huggingface_url: Hugging Face URL.
        zenodo_url: Zenodo URL.
        paper_url: Optional paper URL.
        dataset_type: Dataset type (required).
        task: Selected tasks.
        domain: Selected domains.
        website_url: Optional website URL.
        countries: Selected countries.
        languages: Selected languages.
        profile: OAuth profile of the logged-in user, or None.

    Returns:
        A status message starting with "✅" on success or "❌" on failure.
    """
    # Login required
    if profile is None:
        return "❌ Error: You need to be logged in to submit a resource."

    # Normalize text once so validation and storage see the same values.
    name = _clean(name)
    github_url = _clean(github_url)
    huggingface_url = _clean(huggingface_url)
    zenodo_url = _clean(zenodo_url)
    paper_url = _clean(paper_url)
    website_url = _clean(website_url)

    # Validate required fields
    if not name:
        return "❌ Error: Name is required."
    if not dataset_type:
        return "❌ Error: Dataset type is required."

    # Validate that at least one URL is provided
    if not any([github_url, huggingface_url, zenodo_url]):
        return "❌ Error: At least one of GitHub URL, Hugging Face URL, or Zenodo URL must be provided."

    # Validate URLs
    invalid_label = _first_invalid_url(
        [
            ("GitHub URL", github_url),
            ("Hugging Face URL", huggingface_url),
            ("Zenodo URL", zenodo_url),
            ("Paper URL", paper_url),
            ("Website URL", website_url),
        ]
    )
    if invalid_label:
        return f"❌ Error: {invalid_label} must be a valid URL starting with http:// or https://"

    try:
        username = profile.username
        current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

        # Create new row data
        new_row = pd.DataFrame(
            [
                {
                    "name": name,
                    "github_url": github_url,
                    "huggingface_url": huggingface_url,
                    "zenodo_url": zenodo_url,
                    "paper_url": paper_url,
                    "dataset_type": dataset_type,
                    "task": _join_choices(task),
                    "domain": _join_choices(domain),
                    "website_url": website_url,
                    "countries": _join_choices(countries),
                    "languages": _join_choices(languages),
                    "submitted_by": username,
                    "date_submitted": current_time,
                }
            ]
        )

        # Try to load existing dataset, or create new one
        try:
            existing_dataset = load_dataset(DATASET_NAME, CONFIG_NAME, split="train")
            updated_df = pd.concat(
                [existing_dataset.to_pandas(), new_row], ignore_index=True
            )
        except Exception:
            # NOTE(review): any load failure (not only "dataset does not
            # exist yet") lands here, and the push below then replaces the
            # stored table with this single row. Consider narrowing this to
            # the not-found error once the exact exception is confirmed.
            updated_df = new_row

        # Convert back to Dataset and push to hub
        updated_dataset = Dataset.from_pandas(updated_df)
        updated_dataset.push_to_hub(
            DATASET_NAME,
            config_name=CONFIG_NAME,
            commit_message=f"Add {name} by {username}",
            token=True,  # Use the user's token
        )

        return f"✅ Success: {name} has been submitted successfully!"

    except Exception as e:
        return f"❌ Error: Failed to submit resource. {str(e)}"


def create_all_tab():
    """Create the 'All' tab: a searchable, refreshable table of resources.

    Returns:
        The ``gr.Dataframe`` component showing the table.
    """
    with gr.TabItem("📋 All", id=f"{RESOURCE_TYPE}_all"):
        gr.Markdown(f"### All {RESOURCE_TITLE}")

        search_box = gr.Textbox(
            placeholder=f"Search {RESOURCE_TYPE}...",
            label="Filter the table",
            show_label=False,
        )

        # Load and format initial data with clickable links
        def get_formatted_data():
            return _format_for_display(load_data())

        table = gr.Dataframe(
            value=get_formatted_data(),
            label=RESOURCE_TITLE,
            show_label=False,
            interactive=False,
            wrap=False,  # Disable wrapping to show full text in single lines
            datatype="markdown",  # Render cells as Markdown so links work
        )

        # Connect search functionality
        def search_and_format(query):
            # Data is reloaded on every change so the table reflects the
            # current state of the Hub dataset.
            return _format_for_display(search_and_filter_data(load_data(), query))

        search_box.change(
            fn=search_and_format,
            inputs=search_box,
            outputs=table,
        )

        # Refresh button to reload data
        refresh_btn = gr.Button("🔄 Refresh Data", variant="secondary")
        refresh_btn.click(fn=get_formatted_data, outputs=table)

    return table


def create_contribute_tab():
    """Create the 'Contribute' tab with the submission form.

    Returns:
        A tuple of the eleven input components (in ``submit_resource``
        argument order), the submit button and the result message.
    """
    with gr.TabItem("➕ Contribute", id=f"{RESOURCE_TYPE}_contribute"):
        gr.Markdown(f"### Contribute a New {RESOURCE_TITLE[:-1]}")

        # Login section
        gr.Markdown("Please log in to contribute resources:")
        gr.LoginButton(elem_id=f"{RESOURCE_TYPE}-oauth-button")

        gr.Markdown("Please fill in the information below to add a new dataset:")

        with gr.Column():
            # Required fields
            name_input = gr.Textbox(
                label="Name *",
                placeholder="Enter the name of the dataset",
                info="The name or title of the dataset (required)",
            )

            dataset_type_input = gr.Dropdown(
                label="Dataset Type *",
                choices=DATASET_TYPES,
                info="Type of dataset (required)",
                multiselect=False,
            )

            # URL fields (at least one required)
            gr.Markdown("**URLs** (at least one required)")
            with gr.Row():
                github_url_input = gr.Textbox(
                    label="GitHub URL",
                    placeholder="https://github.com/...",
                    info="GitHub repository URL",
                )
                huggingface_url_input = gr.Textbox(
                    label="Hugging Face URL",
                    placeholder="https://huggingface.co/datasets/...",
                    info="Hugging Face dataset URL",
                )
                zenodo_url_input = gr.Textbox(
                    label="Zenodo URL",
                    placeholder="https://zenodo.org/...",
                    info="Zenodo repository URL",
                )

            # Optional fields
            gr.Markdown("**Optional Information**")

            paper_url_input = gr.Textbox(
                label="Paper URL",
                placeholder="https://...",
                info="Link to associated research paper",
            )

            website_url_input = gr.Textbox(
                label="Website URL",
                placeholder="https://...",
                info="Project or dataset website",
            )

            # Multi-select fields
            task_input = gr.CheckboxGroup(
                label="Tasks",
                choices=TASKS,
                info="What tasks is this dataset suitable for?",
            )

            domain_input = gr.CheckboxGroup(
                label="Domain",
                choices=DOMAINS,
                info="Specific domains covered by the dataset",
            )

            # Use the shared constants so this form and the edit form offer
            # the same choices.
            countries_input = gr.CheckboxGroup(
                label="Countries",
                choices=COUNTRIES,
                info="Countries where Spanish or Portuguese are spoken",
            )

            languages_input = gr.CheckboxGroup(
                label="Languages",
                choices=LANGUAGES,
                info="Languages included in the dataset",
            )

            submit_btn = gr.Button(f"Submit {RESOURCE_TITLE[:-1]}", variant="primary")
            result_msg = gr.Markdown()

        form_inputs = [
            name_input,
            github_url_input,
            huggingface_url_input,
            zenodo_url_input,
            paper_url_input,
            dataset_type_input,
            task_input,
            domain_input,
            website_url_input,
            countries_input,
            languages_input,
        ]

        # The handler's `profile` argument is not listed as an input here.
        submit_btn.click(
            fn=submit_resource,
            inputs=form_inputs,
            outputs=[result_msg],
        )

    return (*form_inputs, submit_btn, result_msg)


def search_entries(query: str) -> pd.DataFrame:
    """Find entries whose name or main URLs contain *query*.

    Matching is case-insensitive and literal (not a regular expression).

    Args:
        query: Text to search for; blank or None yields an empty DataFrame.

    Returns:
        The matching rows, or an empty DataFrame when nothing is loaded.
    """
    query = _clean(query)
    if not query:
        return pd.DataFrame()

    df = load_data()
    if df.empty:
        return df

    # Search in name, github_url, huggingface_url, and zenodo_url columns
    mask = pd.Series(False, index=df.index)
    for column in _ENTRY_SEARCH_COLUMNS:
        if column not in df.columns:
            continue
        # Blank out missing cells first so they never match as "None"/"nan".
        text = df[column].where(df[column].notna(), "").astype(str)
        mask |= text.str.contains(query, case=False, regex=False)

    return df[mask]


def load_entry_for_edit(selected_entry: str) -> tuple:
    """Load one entry's values for the edit form.

    Args:
        selected_entry: Exact name of the entry to load.

    Returns:
        An 11-tuple in form-field order (name, GitHub URL, Hugging Face URL,
        Zenodo URL, paper URL, dataset type, tasks, domains, website URL,
        countries, languages). Blank values are returned when nothing is
        selected, the data cannot be loaded, or the name is not found.
    """
    if not selected_entry:
        return _empty_entry()

    df = load_data()
    if df.empty:
        return _empty_entry()

    # Find the entry by name
    matches = df[df["name"] == selected_entry]
    if matches.empty:
        return _empty_entry()
    entry = matches.iloc[0]

    return (
        _clean(entry.get("name")),
        _clean(entry.get("github_url")),
        _clean(entry.get("huggingface_url")),
        _clean(entry.get("zenodo_url")),
        _clean(entry.get("paper_url")),
        _clean(entry.get("dataset_type")) or None,
        # Convert comma-separated strings back to lists for multi-selects
        _split_choices(entry.get("task")),
        _split_choices(entry.get("domain")),
        _clean(entry.get("website_url")),
        _split_choices(entry.get("countries")),
        _split_choices(entry.get("languages")),
    )


def update_entry(
    original_name: str,
    name: str,
    github_url: str,
    huggingface_url: str,
    zenodo_url: str,
    paper_url: str,
    dataset_type: str,
    task: list,
    domain: list,
    website_url: str,
    countries: list,
    languages: list,
    profile: gr.OAuthProfile | None,
):
    """Overwrite an existing entry and push the dataset to the Hub.

    Every row whose name equals *original_name* is updated, and its
    ``date_submitted`` is set to the current UTC time.

    Args:
        original_name: Name of the entry as it was when loaded for editing.
        name: New name (required).
        github_url: GitHub URL; one of the first three URLs is required.
        huggingface_url: Hugging Face URL.
        zenodo_url: Zenodo URL.
        paper_url: Optional paper URL.
        dataset_type: Dataset type (required).
        task: Selected tasks.
        domain: Selected domains.
        website_url: Optional website URL.
        countries: Selected countries.
        languages: Selected languages.
        profile: OAuth profile of the logged-in user, or None.

    Returns:
        A status message starting with "✅" on success or "❌" on failure.
    """
    # Login required
    if profile is None:
        return "❌ Error: You need to be logged in to edit entries."

    username = profile.username
    if not username:
        return "❌ Could not get username from profile."

    if not original_name:
        return "❌ No entry selected to edit."

    # Normalize text once; form fields may arrive as None.
    name = _clean(name)
    github_url = _clean(github_url)
    huggingface_url = _clean(huggingface_url)
    zenodo_url = _clean(zenodo_url)
    paper_url = _clean(paper_url)
    website_url = _clean(website_url)

    if not name:
        return "❌ Name is required."

    # The form marks the type as required, as the submission form does.
    if not dataset_type:
        return "❌ Dataset type is required."

    # Validate that at least one URL is provided
    if not any([github_url, huggingface_url, zenodo_url]):
        return "❌ At least one URL (GitHub, Hugging Face, or Zenodo) is required."

    # Validate URLs
    invalid_label = _first_invalid_url(
        [
            ("GitHub URL", github_url),
            ("Hugging Face URL", huggingface_url),
            ("Zenodo URL", zenodo_url),
            ("Paper URL", paper_url),
            ("Website URL", website_url),
        ]
    )
    if invalid_label:
        return f"❌ Invalid {invalid_label}. Please provide a valid URL."

    try:
        # Load existing dataset
        existing_dataset = load_dataset(DATASET_NAME, CONFIG_NAME, split="train")
        existing_df = existing_dataset.to_pandas()

        # Find and update the entry
        mask = existing_df["name"] == original_name
        if not mask.any():
            return f"❌ Entry '{original_name}' not found."

        current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        new_values = {
            "name": name,
            "github_url": github_url,
            "huggingface_url": huggingface_url,
            "zenodo_url": zenodo_url,
            "paper_url": paper_url,
            "dataset_type": dataset_type,
            "task": _join_choices(task),
            "domain": _join_choices(domain),
            "website_url": website_url,
            "countries": _join_choices(countries),
            "languages": _join_choices(languages),
            "date_submitted": current_time,
        }
        for column, value in new_values.items():
            existing_df.loc[mask, column] = value

        # Convert back to Dataset and push to hub
        updated_dataset = Dataset.from_pandas(existing_df)
        updated_dataset.push_to_hub(
            DATASET_NAME,
            config_name=CONFIG_NAME,
            commit_message=f"Update dataset entry: {name} (edited by {username})",
        )

        return f"✅ Successfully updated '{name}'!"

    except Exception as e:
        return f"❌ Error updating entry: {str(e)}"


def create_edit_tab():
    """Create the 'Edit' tab: search for an entry, load it and update it.

    Returns:
        A tuple of the search box, search button, results dropdown, form
        column, the eleven form components, the update button and the
        result message.
    """
    with gr.TabItem("✏️ Edit", id=f"{RESOURCE_TYPE}_edit"):
        gr.Markdown(f"### Edit Existing {RESOURCE_TITLE}")
        gr.Markdown("Please log in to edit entries:")
        gr.LoginButton(elem_id=f"{RESOURCE_TYPE}-edit-oauth-button")

        gr.Markdown("Search for an entry to edit:")

        with gr.Row():
            search_input = gr.Textbox(
                label="Search by name or URL",
                placeholder="Enter dataset name or URL to search...",
                scale=3,
            )
            search_btn = gr.Button("🔍 Search", scale=1)

        search_results = gr.Dropdown(
            label="Select entry to edit", choices=[], interactive=True
        )

        gr.Markdown("---")
        gr.Markdown("**Edit the selected entry:**")

        # The form stays hidden until an entry is selected.
        with gr.Column(visible=False) as edit_form:
            name_input = gr.Textbox(label="Name *", placeholder="Dataset name")
            dataset_type_input = gr.Dropdown(
                label="Dataset Type *",
                choices=DATASET_TYPES,
                value="benchmark",
            )

            gr.Markdown("**URLs** (at least one required)")
            with gr.Row():
                github_url_input = gr.Textbox(
                    label="GitHub URL", placeholder="https://github.com/..."
                )
                huggingface_url_input = gr.Textbox(
                    label="Hugging Face URL",
                    placeholder="https://huggingface.co/datasets/...",
                )
                zenodo_url_input = gr.Textbox(
                    label="Zenodo URL", placeholder="https://zenodo.org/..."
                )

            gr.Markdown("**Optional Information**")
            paper_url_input = gr.Textbox(
                label="Paper URL", placeholder="https://arxiv.org/..."
            )
            website_url_input = gr.Textbox(
                label="Website URL", placeholder="https://..."
            )
            task_input = gr.CheckboxGroup(
                label="Tasks",
                choices=TASKS,
            )
            domain_input = gr.CheckboxGroup(
                label="Domain",
                choices=DOMAINS,
            )
            countries_input = gr.CheckboxGroup(
                label="Countries",
                choices=COUNTRIES,
            )
            languages_input = gr.CheckboxGroup(
                label="Languages",
                choices=LANGUAGES,
            )

            update_btn = gr.Button("💾 Update Entry", variant="primary")
            result_msg = gr.Markdown()

        # Store the original name for updating
        original_name_state = gr.State("")

        # Form components in the order update_entry/load_entry_for_edit use.
        form_fields = [
            name_input,
            github_url_input,
            huggingface_url_input,
            zenodo_url_input,
            paper_url_input,
            dataset_type_input,
            task_input,
            domain_input,
            website_url_input,
            countries_input,
            languages_input,
        ]

        def search_and_update_dropdown(query):
            results_df = search_entries(query)
            # An empty result may have no "name" column at all.
            names = [] if results_df.empty else results_df["name"].tolist()
            return gr.Dropdown(choices=names, value=None)

        def load_entry_and_show_form(selected_entry):
            # Always return 13 values: form visibility, the stored original
            # name, then one value per form field.
            if not selected_entry:
                return (gr.Column(visible=False), "", *_empty_entry())
            entry_data = load_entry_for_edit(selected_entry)
            return (gr.Column(visible=True), selected_entry, *entry_data)

        # Event handlers
        search_btn.click(
            fn=search_and_update_dropdown,
            inputs=[search_input],
            outputs=[search_results],
        )

        search_results.change(
            fn=load_entry_and_show_form,
            inputs=[search_results],
            outputs=[edit_form, original_name_state, *form_fields],
        )

        # The handler's `profile` argument is not listed as an input here.
        update_btn.click(
            fn=update_entry,
            inputs=[original_name_state, *form_fields],
            outputs=[result_msg],
        )

    return (
        search_input,
        search_btn,
        search_results,
        edit_form,
        name_input,
        dataset_type_input,
        github_url_input,
        huggingface_url_input,
        zenodo_url_input,
        paper_url_input,
        website_url_input,
        task_input,
        domain_input,
        countries_input,
        languages_input,
        update_btn,
        result_msg,
    )


def create_tab():
    """Create the complete tab for this resource type.

    Returns:
        A tuple ``(table, inputs, edit_components)`` holding the return
        values of the 'All', 'Contribute' and 'Edit' sub-tabs.
    """
    with gr.TabItem(f"📊 {RESOURCE_TITLE}", id=RESOURCE_TYPE):
        with gr.Tabs():
            table = create_all_tab()
            inputs = create_contribute_tab()
            edit_components = create_edit_tab()

    return table, inputs, edit_components