from datetime import datetime, timezone

import gradio as gr
import pandas as pd
from datasets import Dataset, load_dataset

from constants import (
    COUNTRIES,
    DATASET_TYPES,
    DOMAINS,
    LANGUAGES,
    TASKS,
    format_dataframe_for_display,
    format_dataframe_for_html_display,
)

# Dataset configuration
DATASET_NAME = "somosnlp/recursos-pln-es"
CONFIG_NAME = "datasets"
RESOURCE_TYPE = "datasets"
RESOURCE_TITLE = "Datasets"


def load_data() -> pd.DataFrame:
    """Load data from the HuggingFace dataset or return an empty DataFrame."""
    try:
        dataset = load_dataset(DATASET_NAME, CONFIG_NAME, split="train")
        return dataset.to_pandas()
    except Exception as e:
        print(f"Could not load {RESOURCE_TYPE} dataset: {e}")
        # Return an empty DataFrame with the required columns
        return pd.DataFrame(
            columns=[
                "name",
                "github_url",
                "huggingface_url",
                "zenodo_url",
                "paper_url",
                "dataset_type",
                "task",
                "domain",
                "website_url",
                "countries",
                "languages",
                "submitted_by",
                "date_submitted",
            ]
        )


def search_and_filter_data(df: pd.DataFrame, search_query: str) -> pd.DataFrame:
    """Filter the dataframe to rows matching the search query in any column."""
    if search_query == "":
        return df
    filtered_df = df[
        df.apply(
            lambda row: row.astype(str)
            .str.contains(search_query, case=False)
            .any(),
            axis=1,
        )
    ]
    return filtered_df


def validate_url(url: str) -> bool:
    """Validate that a string is a URL with an http(s) scheme."""
    if not url:
        return True  # Empty URLs are allowed for optional fields
    return url.startswith(("http://", "https://"))


def submit_resource(
    name: str,
    github_url: str,
    huggingface_url: str,
    zenodo_url: str,
    paper_url: str,
    dataset_type: str,
    task: list,
    domain: list,
    website_url: str,
    countries: list,
    languages: list,
    profile: gr.OAuthProfile | None,
):
    """Submit a new resource to the corresponding dataset."""
    # Login required
    if profile is None:
        return "❌ Error: You need to be logged in to submit a resource."

    # Validate required fields
    if not name:
        return "❌ Error: Name is required."
    if not dataset_type:
        return "❌ Error: Dataset type is required."

    # Validate that at least one URL is provided
    if not any([github_url, huggingface_url, zenodo_url]):
        return "❌ Error: At least one of GitHub URL, Hugging Face URL, or Zenodo URL must be provided."

    # Validate URLs
    urls_to_check = [
        ("GitHub URL", github_url),
        ("Hugging Face URL", huggingface_url),
        ("Zenodo URL", zenodo_url),
        ("Paper URL", paper_url),
        ("Website URL", website_url),
    ]
    for url_name, url_value in urls_to_check:
        if url_value and not validate_url(url_value):
            return f"❌ Error: {url_name} must be a valid URL starting with http:// or https://"

    try:
        username = profile.username
        current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

        # Create the new row data
        new_data = {
            "name": name,
            "github_url": github_url,
            "huggingface_url": huggingface_url,
            "zenodo_url": zenodo_url,
            "paper_url": paper_url,
            "dataset_type": dataset_type,
            "task": ", ".join(task) if task else "",
            "domain": ", ".join(domain) if domain else "",
            "website_url": website_url,
            "countries": ", ".join(countries) if countries else "",
            "languages": ", ".join(languages) if languages else "",
            "submitted_by": username,
            "date_submitted": current_time,
        }

        # Try to load the existing dataset, or start a new one
        try:
            existing_dataset = load_dataset(DATASET_NAME, CONFIG_NAME, split="train")
            existing_df = existing_dataset.to_pandas()
            # Append the new row
            updated_df = pd.concat(
                [existing_df, pd.DataFrame([new_data])], ignore_index=True
            )
        except Exception:
            # Create a new dataset if it doesn't exist yet
            updated_df = pd.DataFrame([new_data])

        # Convert back to a Dataset and push to the Hub
        updated_dataset = Dataset.from_pandas(updated_df)
        updated_dataset.push_to_hub(
            DATASET_NAME,
            config_name=CONFIG_NAME,
            commit_message=f"Add {name} by {username}",
            token=True,  # Use the user's token
        )
        return f"✅ Success: {name} has been submitted successfully!"
    except Exception as e:
        return f"❌ Error: Failed to submit resource. {str(e)}"


def create_all_tab():
    """Create the 'All' tab for this resource type."""
    with gr.TabItem("📊 All", id=f"{RESOURCE_TYPE}_all"):
        gr.Markdown(f"### All {RESOURCE_TITLE}")

        search_box = gr.Textbox(
            placeholder=f"Search {RESOURCE_TYPE}...",
            label="Filter the table",
            show_label=False,
        )

        # Load and format the initial data with clickable links
        def get_formatted_data():
            df = load_data()
            return format_dataframe_for_display(
                df,
                url_columns=[
                    "github_url",
                    "huggingface_url",
                    "zenodo_url",
                    "paper_url",
                    "website_url",
                ],
                hide_columns=["date_submitted"],
            )

        # Use a Dataframe component with HTML rendering enabled
        table = gr.Dataframe(
            value=get_formatted_data(),
            label=RESOURCE_TITLE,
            show_label=False,
            interactive=False,
            wrap=False,  # Disable wrapping to show full text on single lines
            datatype="markdown",  # Enable HTML rendering
        )

        # Connect the search functionality
        def search_and_format(query):
            initial_df = load_data()
            filtered_df = search_and_filter_data(initial_df, query)
            return format_dataframe_for_display(
                filtered_df,
                url_columns=[
                    "github_url",
                    "huggingface_url",
                    "zenodo_url",
                    "paper_url",
                    "website_url",
                ],
                hide_columns=["date_submitted"],
            )

        search_box.change(
            fn=search_and_format,
            inputs=search_box,
            outputs=table,
        )

        # Refresh button to reload the data
        refresh_btn = gr.Button("🔄 Refresh Data", variant="secondary")
        refresh_btn.click(fn=get_formatted_data, outputs=table)

    return table


def create_contribute_tab():
    """Create the 'Contribute' tab for this resource type."""
    with gr.TabItem("➕ Contribute", id=f"{RESOURCE_TYPE}_contribute"):
        gr.Markdown(f"### Contribute a New {RESOURCE_TITLE[:-1]}")

        # Login section
        gr.Markdown("Please log in to contribute resources:")
        login_button = gr.LoginButton(elem_id=f"{RESOURCE_TYPE}-oauth-button")

        gr.Markdown("Please fill in the information below to add a new dataset:")

        with gr.Column():
            # Required fields
            name_input = gr.Textbox(
                label="Name *",
                placeholder="Enter the name of the dataset",
                info="The name or title of the dataset (required)",
            )
            dataset_type_input = gr.Dropdown(
                label="Dataset Type *",
                choices=DATASET_TYPES,
                info="Type of dataset (required)",
                multiselect=False,
            )

            # URL fields (at least one required)
            gr.Markdown("**URLs** (at least one required)")
            with gr.Row():
                github_url_input = gr.Textbox(
                    label="GitHub URL",
                    placeholder="https://github.com/...",
                    info="GitHub repository URL",
                )
                huggingface_url_input = gr.Textbox(
                    label="Hugging Face URL",
                    placeholder="https://huggingface.co/datasets/...",
                    info="Hugging Face dataset URL",
                )
                zenodo_url_input = gr.Textbox(
                    label="Zenodo URL",
                    placeholder="https://zenodo.org/...",
                    info="Zenodo repository URL",
                )

            # Optional fields
            gr.Markdown("**Optional Information**")
            paper_url_input = gr.Textbox(
                label="Paper URL",
                placeholder="https://...",
                info="Link to the associated research paper",
            )
            website_url_input = gr.Textbox(
                label="Website URL",
                placeholder="https://...",
                info="Project or dataset website",
            )

            # Multi-select fields
            task_input = gr.CheckboxGroup(
                label="Tasks",
                choices=TASKS,
                info="What tasks is this dataset suitable for?",
            )
            domain_input = gr.CheckboxGroup(
                label="Domain",
                choices=DOMAINS,
                info="Specific domains covered by the dataset",
            )
            countries_input = gr.CheckboxGroup(
                label="Countries",
                choices=[
                    "Spain",
                    "Mexico",
                    "Argentina",
                    "Colombia",
                    "Peru",
                    "Venezuela",
                    "Chile",
                    "Ecuador",
                    "Guatemala",
                    "Cuba",
                    "Bolivia",
                    "Dominican Republic",
                    "Honduras",
                    "Paraguay",
                    "El Salvador",
                    "Nicaragua",
                    "Costa Rica",
                    "Panama",
                    "Uruguay",
                    "Puerto Rico",
                    "Brazil",
                    "Portugal",
                ],
                info="Countries where Spanish or Portuguese are spoken",
            )
            languages_input = gr.CheckboxGroup(
                label="Languages",
                choices=[
                    "spanish",
                    "portuguese",
                    "basque",
                    "catalan",
                    "galician",
                    "guarani",
                    "quechua",
                ],
                info="Languages included in the dataset",
            )

            submit_btn = gr.Button(f"Submit {RESOURCE_TITLE[:-1]}", variant="primary")
            result_msg = gr.Markdown()

            # Submit function
            submit_btn.click(
                fn=submit_resource,
                inputs=[
                    name_input,
                    github_url_input,
                    huggingface_url_input,
                    zenodo_url_input,
                    paper_url_input,
                    dataset_type_input,
                    task_input,
                    domain_input,
                    website_url_input,
                    countries_input,
                    languages_input,
                ],
                outputs=[result_msg],
            )

    return (
        name_input,
        github_url_input,
        huggingface_url_input,
        zenodo_url_input,
        paper_url_input,
        dataset_type_input,
        task_input,
        domain_input,
        website_url_input,
        countries_input,
        languages_input,
        submit_btn,
        result_msg,
    )


def search_entries(query: str) -> pd.DataFrame:
    """Search for entries by name or URL."""
    if not query.strip():
        return pd.DataFrame()

    df = load_data()
    if df.empty:
        return df

    # Search in the name, github_url, huggingface_url, and zenodo_url columns
    mask = (
        df["name"].str.contains(query, case=False, na=False)
        | df["github_url"].str.contains(query, case=False, na=False)
        | df["huggingface_url"].str.contains(query, case=False, na=False)
        | df["zenodo_url"].str.contains(query, case=False, na=False)
    )
    return df[mask]


def load_entry_for_edit(selected_entry: str) -> tuple:
    """Load a specific entry for editing."""
    if not selected_entry:
        return ("",) * 11  # Return empty values for all fields

    df = load_data()
    if df.empty:
        return ("",) * 11

    # Find the entry by name
    entry = df[df["name"] == selected_entry].iloc[0]

    # Convert comma-separated strings back to lists for the multi-select components
    task_list = [t.strip() for t in entry["task"].split(",")] if entry["task"] else []
    domain_list = (
        [d.strip() for d in entry["domain"].split(",")] if entry["domain"] else []
    )
    countries_list = (
        [c.strip() for c in entry["countries"].split(",")] if entry["countries"] else []
    )
    languages_list = (
        [lang.strip() for lang in entry["languages"].split(",")]
        if entry["languages"]
        else []
    )

    return (
        entry["name"],
        entry["github_url"],
        entry["huggingface_url"],
        entry["zenodo_url"],
        entry["paper_url"],
        entry["dataset_type"],
        task_list,
        domain_list,
        entry["website_url"],
        countries_list,
        languages_list,
    )


def update_entry(
    original_name: str,
    name: str,
    github_url: str,
    huggingface_url: str,
    zenodo_url: str,
    paper_url: str,
    dataset_type: str,
    task: list,
    domain: list,
    website_url: str,
    countries: list,
    languages: list,
    profile: gr.OAuthProfile | None,
):
    """Update an existing entry."""
    # Login required
    if profile is None:
        return "❌ Error: You need to be logged in to edit entries."

    username = profile.username
    if not username:
        return "❌ Could not get username from profile."

    if not original_name:
        return "❌ No entry selected to edit."
    if not name.strip():
        return "❌ Name is required."

    # Validate that at least one URL is provided
    urls = [github_url.strip(), huggingface_url.strip(), zenodo_url.strip()]
    if not any(urls):
        return "❌ At least one URL (GitHub, Hugging Face, or Zenodo) is required."

    # Validate URLs
    for url_field, url_value in [
        ("GitHub URL", github_url),
        ("Hugging Face URL", huggingface_url),
        ("Zenodo URL", zenodo_url),
        ("Paper URL", paper_url),
        ("Website URL", website_url),
    ]:
        if url_value.strip() and not validate_url(url_value):
            return f"❌ Invalid {url_field}. Please provide a valid URL."

    try:
        # Load the existing dataset
        existing_dataset = load_dataset(DATASET_NAME, CONFIG_NAME, split="train")
        existing_df = existing_dataset.to_pandas()

        # Find the entry to update
        mask = existing_df["name"] == original_name
        if not mask.any():
            return f"❌ Entry '{original_name}' not found."

        # Update the entry
        current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        existing_df.loc[mask, "name"] = name
        existing_df.loc[mask, "github_url"] = github_url
        existing_df.loc[mask, "huggingface_url"] = huggingface_url
        existing_df.loc[mask, "zenodo_url"] = zenodo_url
        existing_df.loc[mask, "paper_url"] = paper_url
        existing_df.loc[mask, "dataset_type"] = dataset_type
        existing_df.loc[mask, "task"] = ", ".join(task) if task else ""
        existing_df.loc[mask, "domain"] = ", ".join(domain) if domain else ""
        existing_df.loc[mask, "website_url"] = website_url
        existing_df.loc[mask, "countries"] = ", ".join(countries) if countries else ""
        existing_df.loc[mask, "languages"] = ", ".join(languages) if languages else ""
        existing_df.loc[mask, "date_submitted"] = current_time

        # Convert back to a Dataset and push to the Hub
        updated_dataset = Dataset.from_pandas(existing_df)
        updated_dataset.push_to_hub(
            DATASET_NAME,
            config_name=CONFIG_NAME,
            commit_message=f"Update dataset entry: {name} (edited by {username})",
        )
        return f"✅ Successfully updated '{name}'!"
    except Exception as e:
        return f"❌ Error updating entry: {str(e)}"


def create_edit_tab():
    """Create the edit tab for modifying existing entries."""
    with gr.TabItem("✏️ Edit", id=f"{RESOURCE_TYPE}_edit"):
        gr.Markdown(f"### Edit Existing {RESOURCE_TITLE}")

        gr.Markdown("Please log in to edit entries:")
        login_button = gr.LoginButton(elem_id=f"{RESOURCE_TYPE}-edit-oauth-button")

        gr.Markdown("Search for an entry to edit:")
        with gr.Row():
            search_input = gr.Textbox(
                label="Search by name or URL",
                placeholder="Enter dataset name or URL to search...",
                scale=3,
            )
            search_btn = gr.Button("🔍 Search", scale=1)

        search_results = gr.Dropdown(
            label="Select entry to edit", choices=[], interactive=True
        )

        gr.Markdown("---")
        gr.Markdown("**Edit the selected entry:**")

        with gr.Column(visible=False) as edit_form:
            name_input = gr.Textbox(label="Name *", placeholder="Dataset name")
            dataset_type_input = gr.Dropdown(
                label="Dataset Type *",
                choices=DATASET_TYPES,
                value="benchmark",
            )

            gr.Markdown("**URLs** (at least one required)")
            with gr.Row():
                github_url_input = gr.Textbox(
                    label="GitHub URL", placeholder="https://github.com/..."
                )
                huggingface_url_input = gr.Textbox(
                    label="Hugging Face URL",
                    placeholder="https://huggingface.co/datasets/...",
                )
                zenodo_url_input = gr.Textbox(
                    label="Zenodo URL", placeholder="https://zenodo.org/..."
                )

            gr.Markdown("**Optional Information**")
            paper_url_input = gr.Textbox(
                label="Paper URL", placeholder="https://arxiv.org/..."
            )
            website_url_input = gr.Textbox(
                label="Website URL", placeholder="https://..."
            )
            task_input = gr.CheckboxGroup(
                label="Tasks",
                choices=TASKS,
            )
            domain_input = gr.CheckboxGroup(
                label="Domain",
                choices=DOMAINS,
            )
            countries_input = gr.CheckboxGroup(
                label="Countries",
                choices=COUNTRIES,
            )
            languages_input = gr.CheckboxGroup(
                label="Languages",
                choices=LANGUAGES,
            )

            update_btn = gr.Button("💾 Update Entry", variant="primary")
            result_msg = gr.Markdown()

        # Store the original name for updating
        original_name_state = gr.State("")

        def search_and_update_dropdown(query):
            results_df = search_entries(query)
            if results_df.empty:
                return gr.Dropdown(choices=[], value=None)
            choices = results_df["name"].tolist()
            return gr.Dropdown(choices=choices, value=None)

        def load_entry_and_show_form(selected_entry):
            if not selected_entry:
                # Hide the form and clear the original-name state plus all 11 fields
                return (gr.Column(visible=False), "", *([""] * 11))
            entry_data = load_entry_for_edit(selected_entry)
            return (gr.Column(visible=True), selected_entry, *entry_data)

        # Event handlers
        search_btn.click(
            fn=search_and_update_dropdown,
            inputs=[search_input],
            outputs=[search_results],
        )

        search_results.change(
            fn=load_entry_and_show_form,
            inputs=[search_results],
            outputs=[
                edit_form,
                original_name_state,
                name_input,
                github_url_input,
                huggingface_url_input,
                zenodo_url_input,
                paper_url_input,
                dataset_type_input,
                task_input,
                domain_input,
                website_url_input,
                countries_input,
                languages_input,
            ],
        )

        update_btn.click(
            fn=update_entry,
            inputs=[
                original_name_state,
                name_input,
                github_url_input,
                huggingface_url_input,
                zenodo_url_input,
                paper_url_input,
                dataset_type_input,
                task_input,
                domain_input,
                website_url_input,
                countries_input,
                languages_input,
            ],
            outputs=[result_msg],
        )

    return (
        search_input,
        search_btn,
        search_results,
        edit_form,
        name_input,
        dataset_type_input,
        github_url_input,
        huggingface_url_input,
        zenodo_url_input,
        paper_url_input,
        website_url_input,
        task_input,
        domain_input,
        countries_input,
        languages_input,
        update_btn,
        result_msg,
    )


def create_tab():
    """Create the complete tab for this resource type."""
    with gr.TabItem(f"📊 {RESOURCE_TITLE}", id=RESOURCE_TYPE):
        with gr.Tabs():
            table = create_all_tab()
            inputs = create_contribute_tab()
            edit_components = create_edit_tab()
    return table, inputs, edit_components
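

# Minimal sketch of how this module could be mounted for local testing. The
# project's real entry point (e.g. an app.py that combines several resource
# modules) is assumed to live elsewhere and may wire things up differently;
# this block is only an illustrative example and is not part of that app.
if __name__ == "__main__":
    with gr.Blocks(title="Spanish NLP Resources") as demo:
        with gr.Tabs():
            # Registers the Datasets tab with its All / Contribute / Edit sub-tabs
            create_tab()
    demo.launch()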