Spaces:

mariagrandury
/

recursos-pln-es

Sleeping

File size: 22,380 Bytes

from datetime import datetime, timezone

import gradio as gr
import pandas as pd
from datasets import Dataset, load_dataset

from constants import (
    COUNTRIES,
    DATASET_TYPES,
    DOMAINS,
    LANGUAGES,
    TASKS,
    format_dataframe_for_display,
    format_dataframe_for_html_display,
)

# Dataset configuration
# Hub repository id passed to load_dataset() and push_to_hub() in this module.
DATASET_NAME = "somosnlp/recursos-pln-es"
# Configuration name of that repository which this module reads and writes.
CONFIG_NAME = "datasets"
# Machine-friendly identifier: used in tab/element ids, the search placeholder
# and the load-failure log message.
RESOURCE_TYPE = "datasets"
# Human-readable plural title: used in tab labels, headings and button text
# (the singular form is derived elsewhere by dropping the last character).
RESOURCE_TITLE = "Datasets"


def load_data() -> pd.DataFrame:
    """Fetch the resource table from the Hub as a pandas DataFrame.

    If loading fails for any reason the error is printed and an empty
    DataFrame carrying the expected column names is returned instead, so
    callers always receive a frame with the same columns.
    """
    fallback_columns = [
        "name",
        "github_url",
        "huggingface_url",
        "zenodo_url",
        "paper_url",
        "dataset_type",
        "task",
        "domain",
        "website_url",
        "countries",
        "languages",
        "submitted_by",
        "date_submitted",
    ]

    try:
        hub_split = load_dataset(DATASET_NAME, CONFIG_NAME, split="train")
        frame = hub_split.to_pandas()
    except Exception as error:
        # Best effort: the UI should still render when the Hub is unreachable.
        print(f"Could not load {RESOURCE_TYPE} dataset: {error}")
        frame = pd.DataFrame(columns=fallback_columns)
    return frame


def search_and_filter_data(df: pd.DataFrame, search_query: str) -> pd.DataFrame:
    """Return the rows of *df* that contain *search_query* in any column.

    Matching is a case-insensitive, literal substring test against the string
    form of every cell.

    Args:
        df: Table to filter.
        search_query: Text typed by the user. An empty (or ``None``) query
            returns *df* unchanged.

    Returns:
        *df* itself when there is no query, otherwise the matching rows.
    """
    if not search_query:
        return df

    haystack = df.astype(str)
    mask = pd.Series(False, index=df.index)
    for _, column in haystack.items():
        # regex=False: the query is user input, so characters such as "+",
        # "(" or "?" must be matched literally instead of being compiled as
        # a regular expression (which raises on e.g. "c++").
        hits = column.str.contains(search_query, case=False, na=False, regex=False)
        mask = mask | hits
    return df[mask]


def validate_url(url: str) -> bool:
    """Return True when *url* is empty or a well-formed http(s) URL.

    Args:
        url: Value of a URL form field.

    Returns:
        True for an empty value (optional fields may be left blank) and for
        strings that start with ``http://`` or ``https://`` and also contain
        a host part; False otherwise.
    """
    # Local import keeps this function usable without touching file imports.
    from urllib.parse import urlparse

    if not url:
        return True  # Empty URLs are allowed for optional fields
    if not url.startswith(("http://", "https://")):
        return False
    try:
        # A bare scheme such as "https://" has no network location and is
        # not a usable link, so require one.
        return bool(urlparse(url).netloc)
    except ValueError:
        # urlparse rejects some malformed hosts (e.g. unbalanced brackets).
        return False


def submit_resource(
    name: str,
    github_url: str,
    huggingface_url: str,
    zenodo_url: str,
    paper_url: str,
    dataset_type: str,
    task: list,
    domain: list,
    website_url: str,
    countries: list,
    languages: list,
    profile: gr.OAuthProfile | None,
):
    """Validate a submission and append it to the Hub dataset.

    Args:
        name: Name of the resource (required).
        github_url, huggingface_url, zenodo_url: Location of the resource; at
            least one must be given.
        paper_url, website_url: Optional extra links.
        dataset_type: Selected dataset type (required).
        task, domain, countries, languages: Selected options; stored as
            comma-separated strings.
        profile: OAuth profile of the logged-in user, or None when nobody is
            logged in.

    Returns:
        A status message starting with "✅" on success or "❌" on failure.
        Errors are reported through the message rather than raised.
    """

    # Login required
    if profile is None:
        return "❌ Error: You need to be logged in to submit a resource."

    # Normalise text inputs once so the same value is validated and stored.
    name = (name or "").strip()
    github_url = (github_url or "").strip()
    huggingface_url = (huggingface_url or "").strip()
    zenodo_url = (zenodo_url or "").strip()
    paper_url = (paper_url or "").strip()
    website_url = (website_url or "").strip()

    # Validate required fields
    if not name:
        return "❌ Error: Name is required."

    if not dataset_type:
        return "❌ Error: Dataset type is required."

    # Validate that at least one URL is provided
    if not (github_url or huggingface_url or zenodo_url):
        return "❌ Error: At least one of GitHub URL, Hugging Face URL, or Zenodo URL must be provided."

    # Validate URLs
    urls_to_check = [
        ("GitHub URL", github_url),
        ("Hugging Face URL", huggingface_url),
        ("Zenodo URL", zenodo_url),
        ("Paper URL", paper_url),
        ("Website URL", website_url),
    ]

    for url_name, url_value in urls_to_check:
        if url_value and not validate_url(url_value):
            return f"❌ Error: {url_name} must be a valid URL starting with http:// or https://"

    try:
        username = profile.username
        current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

        # Create new row data
        new_row = pd.DataFrame(
            [
                {
                    "name": name,
                    "github_url": github_url,
                    "huggingface_url": huggingface_url,
                    "zenodo_url": zenodo_url,
                    "paper_url": paper_url,
                    "dataset_type": dataset_type,
                    "task": ", ".join(task) if task else "",
                    "domain": ", ".join(domain) if domain else "",
                    "website_url": website_url,
                    "countries": ", ".join(countries) if countries else "",
                    "languages": ", ".join(languages) if languages else "",
                    "submitted_by": username,
                    "date_submitted": current_time,
                }
            ]
        )

        # Try to load existing dataset, or create new one
        try:
            existing_df = load_dataset(
                DATASET_NAME, CONFIG_NAME, split="train"
            ).to_pandas()
        except (FileNotFoundError, ValueError):
            # Only a missing repository/config may start a fresh table.
            # Any other failure (network, auth, ...) must NOT fall through
            # here: pushing a one-row table would overwrite existing data.
            # NOTE(review): these are the exception types load_dataset is
            # expected to raise for a missing dataset or config — confirm
            # against the installed `datasets` version.
            updated_df = new_row
        else:
            # Add new row
            updated_df = pd.concat([existing_df, new_row], ignore_index=True)

        # Convert back to Dataset and push to hub
        updated_dataset = Dataset.from_pandas(updated_df)
        updated_dataset.push_to_hub(
            DATASET_NAME,
            config_name=CONFIG_NAME,
            commit_message=f"Add {name} by {username}",
            # NOTE(review): `True` presumably resolves to the token configured
            # for the running process, not the OAuth token of `profile` —
            # confirm this is the intended credential.
            token=True,
        )

        return f"✅ Success: {name} has been submitted successfully!"

    except Exception as e:
        return f"❌ Error: Failed to submit resource. {str(e)}"


def create_all_tab():
    """Build the 'All' sub-tab: search box, read-only table, refresh button.

    Returns:
        The Dataframe component that displays the resources.
    """
    link_columns = (
        "github_url",
        "huggingface_url",
        "zenodo_url",
        "paper_url",
        "website_url",
    )
    hidden_columns = ("date_submitted",)

    def render(frame):
        # Fresh lists on every call, exactly as separate literals would be.
        return format_dataframe_for_display(
            frame,
            url_columns=list(link_columns),
            hide_columns=list(hidden_columns),
        )

    def get_formatted_data():
        """Reload the table from the Hub and format it for display."""
        return render(load_data())

    def search_and_format(query):
        """Reload the table, keep rows matching *query*, format for display."""
        fresh_df = load_data()
        return render(search_and_filter_data(fresh_df, query))

    with gr.TabItem("📋 All", id=f"{RESOURCE_TYPE}_all"):
        gr.Markdown(f"### All {RESOURCE_TITLE}")

        search_box = gr.Textbox(
            placeholder=f"Search {RESOURCE_TYPE}...",
            label="Filter the table",
            show_label=False,
        )

        # The table is filled once when the UI is built; cells use the
        # markdown datatype so the formatted link columns can render as links.
        table = gr.Dataframe(
            value=get_formatted_data(),
            label=RESOURCE_TITLE,
            show_label=False,
            interactive=False,
            wrap=False,  # keep each cell on a single line
            datatype="markdown",
        )

        # Re-filter on every change of the search text.
        search_box.change(
            fn=search_and_format,
            inputs=search_box,
            outputs=table,
        )

        # Manual reload of the unfiltered data.
        refresh_btn = gr.Button("🔄 Refresh Data", variant="secondary")
        refresh_btn.click(fn=get_formatted_data, outputs=table)

    return table


def create_contribute_tab():
    """Build the 'Contribute' sub-tab with the submission form.

    The form collects the fields expected by ``submit_resource`` and wires
    the submit button to it; the OAuth profile argument of that function is
    not listed among the inputs.

    Returns:
        A tuple with the eleven input components (in ``submit_resource``
        argument order), followed by the submit button and the result
        message component.
    """
    with gr.TabItem("➕ Contribute", id=f"{RESOURCE_TYPE}_contribute"):
        gr.Markdown(f"### Contribute a New {RESOURCE_TITLE[:-1]}")

        # Login section
        gr.Markdown("Please log in to contribute resources:")
        gr.LoginButton(elem_id=f"{RESOURCE_TYPE}-oauth-button")

        gr.Markdown("Please fill in the information below to add a new dataset:")

        with gr.Column():
            # Required fields
            name_input = gr.Textbox(
                label="Name *",
                placeholder="Enter the name of the dataset",
                info="The name or title of the dataset (required)",
            )

            dataset_type_input = gr.Dropdown(
                label="Dataset Type *",
                choices=DATASET_TYPES,
                info="Type of dataset (required)",
                multiselect=False,
            )

            # URL fields (at least one required)
            gr.Markdown("**URLs** (at least one required)")
            with gr.Row():
                github_url_input = gr.Textbox(
                    label="GitHub URL",
                    placeholder="https://github.com/...",
                    info="GitHub repository URL",
                )
                huggingface_url_input = gr.Textbox(
                    label="Hugging Face URL",
                    placeholder="https://huggingface.co/datasets/...",
                    info="Hugging Face dataset URL",
                )

            zenodo_url_input = gr.Textbox(
                label="Zenodo URL",
                placeholder="https://zenodo.org/...",
                info="Zenodo repository URL",
            )

            # Optional fields
            gr.Markdown("**Optional Information**")
            paper_url_input = gr.Textbox(
                label="Paper URL",
                placeholder="https://...",
                info="Link to associated research paper",
            )

            website_url_input = gr.Textbox(
                label="Website URL",
                placeholder="https://...",
                info="Project or dataset website",
            )

            # Multi-select fields
            task_input = gr.CheckboxGroup(
                label="Tasks",
                choices=TASKS,
                info="What tasks is this dataset suitable for?",
            )

            domain_input = gr.CheckboxGroup(
                label="Domain",
                choices=DOMAINS,
                info="Specific domains covered by the dataset",
            )

            # Use the shared constants (the same ones the edit form uses) so
            # values submitted here are always valid choices when the entry
            # is later opened for editing.
            countries_input = gr.CheckboxGroup(
                label="Countries",
                choices=COUNTRIES,
                info="Countries where Spanish or Portuguese are spoken",
            )

            languages_input = gr.CheckboxGroup(
                label="Languages",
                choices=LANGUAGES,
                info="Languages included in the dataset",
            )

        submit_btn = gr.Button(f"Submit {RESOURCE_TITLE[:-1]}", variant="primary")
        result_msg = gr.Markdown()

        # Submit function
        submit_btn.click(
            fn=submit_resource,
            inputs=[
                name_input,
                github_url_input,
                huggingface_url_input,
                zenodo_url_input,
                paper_url_input,
                dataset_type_input,
                task_input,
                domain_input,
                website_url_input,
                countries_input,
                languages_input,
            ],
            outputs=[result_msg],
        )

        return (
            name_input,
            github_url_input,
            huggingface_url_input,
            zenodo_url_input,
            paper_url_input,
            dataset_type_input,
            task_input,
            domain_input,
            website_url_input,
            countries_input,
            languages_input,
            submit_btn,
            result_msg,
        )


def search_entries(query: str) -> pd.DataFrame:
    """Find entries whose name or main URLs contain *query*.

    The comparison is a case-insensitive, literal substring match on the
    ``name``, ``github_url``, ``huggingface_url`` and ``zenodo_url`` columns.

    Args:
        query: Search text; surrounding whitespace is ignored.

    Returns:
        The matching rows. A blank (or ``None``) query yields an empty
        DataFrame without loading any data; if the loaded table is empty it
        is returned as is.
    """
    needle = (query or "").strip()
    if not needle:
        return pd.DataFrame()

    df = load_data()
    if df.empty:
        return df

    mask = pd.Series(False, index=df.index)
    for column in ("name", "github_url", "huggingface_url", "zenodo_url"):
        # regex=False: URLs and names routinely contain "?", "+", "(" etc.,
        # which must match literally instead of being compiled as a regex.
        hits = df[column].str.contains(needle, case=False, na=False, regex=False)
        mask = mask | hits

    return df[mask]


def _empty_entry_fields() -> tuple:
    """Return blank values for the eleven edit-form fields, in form order."""
    # Text fields -> "", single-select dataset type -> None, multi-selects -> [].
    return ("", "", "", "", "", None, [], [], "", [], [])


def _text_or_empty(value) -> str:
    """Return *value* when it is a string, otherwise an empty string."""
    # Missing cells can surface as None or NaN; neither is usable form text.
    return value if isinstance(value, str) else ""


def _split_multi_value(value) -> list:
    """Split a comma-separated cell into a list of trimmed, non-empty items."""
    if not isinstance(value, str):
        return []
    return [item.strip() for item in value.split(",") if item.strip()]


def load_entry_for_edit(selected_entry: str) -> tuple:
    """Load the field values of one entry for the edit form.

    Args:
        selected_entry: Name of the entry to load.

    Returns:
        An 11-tuple ordered as: name, github_url, huggingface_url,
        zenodo_url, paper_url, dataset_type, task list, domain list,
        website_url, countries list, languages list. Blank values are
        returned when nothing is selected, the table is empty, or no row has
        that name. When several rows share the name, the first one is used.
    """
    if not selected_entry:
        return _empty_entry_fields()

    df = load_data()
    if df.empty:
        return _empty_entry_fields()

    # Find the entry by name; it may have disappeared since it was listed.
    matches = df[df["name"] == selected_entry]
    if matches.empty:
        return _empty_entry_fields()
    entry = matches.iloc[0]

    return (
        _text_or_empty(entry["name"]),
        _text_or_empty(entry["github_url"]),
        _text_or_empty(entry["huggingface_url"]),
        _text_or_empty(entry["zenodo_url"]),
        _text_or_empty(entry["paper_url"]),
        # None (no selection) rather than "" for a missing dataset type.
        _text_or_empty(entry["dataset_type"]) or None,
        # Comma-separated strings become lists for the multi-select inputs.
        _split_multi_value(entry["task"]),
        _split_multi_value(entry["domain"]),
        _text_or_empty(entry["website_url"]),
        _split_multi_value(entry["countries"]),
        _split_multi_value(entry["languages"]),
    )


def update_entry(
    original_name: str,
    name: str,
    github_url: str,
    huggingface_url: str,
    zenodo_url: str,
    paper_url: str,
    dataset_type: str,
    task: list,
    domain: list,
    website_url: str,
    countries: list,
    languages: list,
    profile: gr.OAuthProfile | None,
):
    """Validate edited values and write them back to the Hub dataset.

    Every row whose ``name`` equals *original_name* is overwritten with the
    new values and its ``date_submitted`` is set to the current UTC time.

    Args:
        original_name: Name of the entry as it was when loaded for editing.
        name: New name (required).
        github_url, huggingface_url, zenodo_url: Location of the resource; at
            least one must be given.
        paper_url, website_url: Optional extra links.
        dataset_type: Selected dataset type (required).
        task, domain, countries, languages: Selected options; stored as
            comma-separated strings.
        profile: OAuth profile of the logged-in user, or None when nobody is
            logged in.

    Returns:
        A status message starting with "✅" on success or "❌" on failure.
        Errors are reported through the message rather than raised.
    """
    # Login required
    if profile is None:
        return "❌ Error: You need to be logged in to edit entries."

    username = profile.username
    if not username:
        return "❌ Could not get username from profile."

    if not original_name:
        return "❌ No entry selected to edit."

    # Normalise text inputs once so the same value is validated and stored
    # (and a None from the UI cannot crash on .strip()).
    name = (name or "").strip()
    github_url = (github_url or "").strip()
    huggingface_url = (huggingface_url or "").strip()
    zenodo_url = (zenodo_url or "").strip()
    paper_url = (paper_url or "").strip()
    website_url = (website_url or "").strip()

    if not name:
        return "❌ Name is required."

    # The form marks the dataset type as required, same as for submissions.
    if not dataset_type:
        return "❌ Dataset type is required."

    # Validate that at least one URL is provided
    if not (github_url or huggingface_url or zenodo_url):
        return "❌ At least one URL (GitHub, Hugging Face, or Zenodo) is required."

    # Validate URLs
    for url_field, url_value in [
        ("GitHub URL", github_url),
        ("Hugging Face URL", huggingface_url),
        ("Zenodo URL", zenodo_url),
        ("Paper URL", paper_url),
        ("Website URL", website_url),
    ]:
        if url_value and not validate_url(url_value):
            return f"❌ Invalid {url_field}. Please provide a valid URL."

    try:
        # Load existing dataset
        existing_dataset = load_dataset(DATASET_NAME, CONFIG_NAME, split="train")
        existing_df = existing_dataset.to_pandas()

        # Find the entry
        mask = existing_df["name"] == original_name
        if not mask.any():
            return f"❌ Entry '{original_name}' not found."

        current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

        # Update the entry column by column
        new_values = {
            "name": name,
            "github_url": github_url,
            "huggingface_url": huggingface_url,
            "zenodo_url": zenodo_url,
            "paper_url": paper_url,
            "dataset_type": dataset_type,
            "task": ", ".join(task) if task else "",
            "domain": ", ".join(domain) if domain else "",
            "website_url": website_url,
            "countries": ", ".join(countries) if countries else "",
            "languages": ", ".join(languages) if languages else "",
            "date_submitted": current_time,
        }
        for column, value in new_values.items():
            existing_df.loc[mask, column] = value

        # Convert back to Dataset and push to hub
        updated_dataset = Dataset.from_pandas(existing_df)
        updated_dataset.push_to_hub(
            DATASET_NAME,
            config_name=CONFIG_NAME,
            commit_message=f"Update dataset entry: {name} (edited by {username})",
        )

        return f"✅ Successfully updated '{name}'!"

    except Exception as e:
        return f"❌ Error updating entry: {str(e)}"


def create_edit_tab():
    """Build the 'Edit' sub-tab for modifying existing entries.

    The flow is: search -> pick a result from the dropdown -> the (initially
    hidden) form is shown pre-filled -> the update button calls
    ``update_entry`` with the originally selected name plus the form values.

    Returns:
        A tuple with the search input, search button, results dropdown, form
        column, the eleven form inputs, the update button and the result
        message component.
    """
    with gr.TabItem("✏️ Edit", id=f"{RESOURCE_TYPE}_edit"):
        gr.Markdown(f"### Edit Existing {RESOURCE_TITLE}")
        gr.Markdown("Please log in to edit entries:")
        gr.LoginButton(elem_id=f"{RESOURCE_TYPE}-edit-oauth-button")

        gr.Markdown("Search for an entry to edit:")

        with gr.Row():
            search_input = gr.Textbox(
                label="Search by name or URL",
                placeholder="Enter dataset name or URL to search...",
                scale=3,
            )
            search_btn = gr.Button("🔍 Search", scale=1)

        search_results = gr.Dropdown(
            label="Select entry to edit", choices=[], interactive=True
        )

        gr.Markdown("---")
        gr.Markdown("**Edit the selected entry:**")

        with gr.Column(visible=False) as edit_form:
            name_input = gr.Textbox(label="Name *", placeholder="Dataset name")
            dataset_type_input = gr.Dropdown(
                label="Dataset Type *",
                choices=DATASET_TYPES,
                value="benchmark",
            )

            gr.Markdown("**URLs** (at least one required)")
            with gr.Row():
                github_url_input = gr.Textbox(
                    label="GitHub URL", placeholder="https://github.com/..."
                )
                huggingface_url_input = gr.Textbox(
                    label="Hugging Face URL",
                    placeholder="https://huggingface.co/datasets/...",
                )
            zenodo_url_input = gr.Textbox(
                label="Zenodo URL", placeholder="https://zenodo.org/..."
            )

            gr.Markdown("**Optional Information**")
            paper_url_input = gr.Textbox(
                label="Paper URL", placeholder="https://arxiv.org/..."
            )
            website_url_input = gr.Textbox(
                label="Website URL", placeholder="https://..."
            )

            task_input = gr.CheckboxGroup(
                label="Tasks",
                choices=TASKS,
            )

            domain_input = gr.CheckboxGroup(
                label="Domain",
                choices=DOMAINS,
            )

            countries_input = gr.CheckboxGroup(
                label="Countries",
                choices=COUNTRIES,
            )

            languages_input = gr.CheckboxGroup(
                label="Languages",
                choices=LANGUAGES,
            )

            update_btn = gr.Button("💾 Update Entry", variant="primary")
            result_msg = gr.Markdown()

        # Store the original name for updating
        original_name_state = gr.State("")

        def search_and_update_dropdown(query):
            """Fill the dropdown with the names matching *query*."""
            results_df = search_entries(query)
            # An empty result may have no "name" column at all, so guard
            # before indexing.
            choices = [] if results_df.empty else results_df["name"].tolist()
            return gr.Dropdown(choices=choices, value=None)

        def load_entry_and_show_form(selected_entry):
            """Return one value per output: form visibility, name, 11 fields."""
            if not selected_entry:
                # Must yield exactly 13 values to match the outputs list
                # below: text fields -> "", dataset type -> None,
                # multi-selects -> [].
                blank_fields = ("", "", "", "", "", None, [], [], "", [], [])
                return (gr.Column(visible=False), "", *blank_fields)

            entry_data = load_entry_for_edit(selected_entry)
            return (gr.Column(visible=True), selected_entry, *entry_data)

        # Event handlers
        search_btn.click(
            fn=search_and_update_dropdown,
            inputs=[search_input],
            outputs=[search_results],
        )

        search_results.change(
            fn=load_entry_and_show_form,
            inputs=[search_results],
            outputs=[
                edit_form,
                original_name_state,
                name_input,
                github_url_input,
                huggingface_url_input,
                zenodo_url_input,
                paper_url_input,
                dataset_type_input,
                task_input,
                domain_input,
                website_url_input,
                countries_input,
                languages_input,
            ],
        )

        update_btn.click(
            fn=update_entry,
            inputs=[
                original_name_state,
                name_input,
                github_url_input,
                huggingface_url_input,
                zenodo_url_input,
                paper_url_input,
                dataset_type_input,
                task_input,
                domain_input,
                website_url_input,
                countries_input,
                languages_input,
            ],
            outputs=[result_msg],
        )

        return (
            search_input,
            search_btn,
            search_results,
            edit_form,
            name_input,
            dataset_type_input,
            github_url_input,
            huggingface_url_input,
            zenodo_url_input,
            paper_url_input,
            website_url_input,
            task_input,
            domain_input,
            countries_input,
            languages_input,
            update_btn,
            result_msg,
        )


def create_tab():
    """Assemble the top-level tab for this resource type.

    The tab nests three sub-tabs, created in order: All, Contribute, Edit.

    Returns:
        A 3-tuple with whatever each sub-tab builder returned, in that order.
    """
    with gr.TabItem(f"📊 {RESOURCE_TITLE}", id=RESOURCE_TYPE), gr.Tabs():
        all_table = create_all_tab()
        contribute_components = create_contribute_tab()
        edit_tab_components = create_edit_tab()
    return all_table, contribute_components, edit_tab_components