"""
Shared constants for the NLP resources application.
Contains lists of countries, languages, domains, tasks, and utility functions.
"""

import re
from urllib.parse import urlparse

import pandas as pd

# Country options. Most entries are Spanish-speaking countries, but the list
# also includes Portugal and Brazil, plus an "International" entry that is not
# a country.
COUNTRIES = [
    "Spain",
    "Mexico",
    "Argentina",
    "Colombia",
    "Peru",
    "Venezuela",
    "Chile",
    "Ecuador",
    "Guatemala",
    "Cuba",
    "Bolivia",
    "Dominican Republic",
    "Honduras",
    "Paraguay",
    "El Salvador",
    "Nicaragua",
    "Costa Rica",
    "Panama",
    "Uruguay",
    "Puerto Rico",
    "Portugal",
    "Brazil",
    "International",
]

# Language options (all lowercase).
# NOTE(review): "spain", "latam" and "all" are not language names; presumably
# they act as regional / catch-all selectors - confirm against the callers.
LANGUAGES = [
    "spanish",
    "catalan",
    "basque",
    "galician",
    "guarani",
    "quechua",
    "aymara",
    # NOTE(review): looks like a misspelling of "nahuatl". Left unchanged because
    # existing data may already contain this exact value - confirm before fixing.
    "nauhatl",
    "mapudungun",
    "spain",
    "latam",
    "portuguese",
    "all",
]

# NLP task options (lowercase, human-readable labels)
TASKS = [
    "text classification",
    "sentiment analysis",
    "named entity recognition",
    "part-of-speech tagging",
    "question answering",
    "text summarization",
    "machine translation",
    "language modeling",
    "text generation",
    "information extraction",
    "semantic similarity",
    "natural language inference",
]

# Subject-matter domain options for datasets (lowercase labels)
DOMAINS = [
    "clinical",
    "legal",
    "financial",
    "scientific",
    "news",
    "social media",
    "literature",
    "general",
    "academic",
]

# Dataset type options
DATASET_TYPES = ["pretraining", "benchmark", "supervised fine-tuning", "alignment"]

# Event type options
EVENT_TYPES = ["workshop", "talk", "AMA", "round table"]

# Initiative type options
# NOTE(review): "OS company" presumably means "open-source company" - confirm.
INITIATIVE_TYPES = [
    "project",
    "event",
    "research group",
    "community",
    "research institute",
    "non-profit",
    "OS company",
]

# Technical levels (for events). The levels "1" to "5" are stored as strings,
# not integers.
TECHNICAL_LEVELS = ["1", "2", "3", "4", "5"]


def format_url_for_display(url: str) -> str:
    """
    Format URL for display in tables - show only the meaningful part.

    The scheme is dropped and a handful of well-known hosts get a compact
    label:

    - huggingface.co: the path after ``/datasets/``, ``/models/`` or
      ``/collections/``, otherwise everything after ``huggingface.co/``
    - github.com: ``owner/repo``
    - zenodo.org: ``zenodo:<record id>`` (``/record/`` and ``/records/`` URLs)
    - arxiv.org: ``arXiv:<id>`` (``/abs/`` and ``/pdf/`` URLs)
    - youtube.com / youtu.be: ``YouTube:<first 8 chars of the video id>...``

    Any other URL is reduced to ``domain/first-path-segment``. Hosts are
    detected with a substring match on the whole URL. When a known host does
    not match any of its patterns, the URL without its scheme is returned.

    Args:
        url: Full URL string

    Returns:
        Shortened, readable version of the URL, or an empty string for
        empty/blank input
    """
    if not url or not url.strip():
        return ""

    url = url.strip()

    # Remove protocol
    if url.startswith(("http://", "https://")):
        url = url.split("://", 1)[1]

    if "huggingface.co" in url:
        # Markers are tried in priority order; the bare host is the catch-all.
        for marker in ("/datasets/", "/models/", "/collections/", "huggingface.co/"):
            if marker in url:
                # Fall back to the URL itself so the label is never empty
                # (an empty label would render as an invisible link).
                return url.split(marker)[-1] or url
        return url

    if "github.com" in url:
        if "github.com/" in url:
            # Drop empty segments so a trailing or doubled slash does not
            # produce labels such as "owner/".
            segments = [
                part for part in url.split("github.com/")[-1].split("/") if part
            ]
            if len(segments) >= 2:
                return f"{segments[0]}/{segments[1]}"
        return url

    if "zenodo.org" in url:
        # Zenodo serves records under both the legacy "/record/<id>" and the
        # current "/records/<id>" path.
        for marker in ("/record/", "/records/"):
            if marker in url:
                return f"zenodo:{url.split(marker)[-1].split('/')[0]}"
        return url

    if "arxiv.org" in url:
        if "/abs/" in url:
            return f"arXiv:{url.split('/abs/')[-1]}"
        if "/pdf/" in url:
            return f"arXiv:{url.split('/pdf/')[-1].replace('.pdf', '')}"
        return url

    if "youtube.com" in url or "youtu.be" in url:
        # Only the first 8 characters of the video id are shown.
        if "watch?v=" in url:
            video_id = url.split("watch?v=")[-1].split("&")[0]
            return f"YouTube:{video_id[:8]}..."
        if "youtu.be/" in url:
            video_id = url.split("youtu.be/")[-1].split("?")[0]
            return f"YouTube:{video_id[:8]}..."
        return url

    # For other URLs, show the domain plus the first path segment.
    try:
        parsed = urlparse(
            f"https://{url}" if not url.startswith(("http://", "https://")) else url
        )
    except ValueError:
        # urlparse rejects malformed hosts (e.g. an unbalanced "[").
        # Fallback: limit length
        return url[:30] + "..." if len(url) > 30 else url

    domain = parsed.netloc
    first_segment = parsed.path.strip("/").split("/", 1)[0]
    return f"{domain}/{first_segment}" if first_segment else domain


def make_url_clickable(url: str, display_text: str = None) -> str:
    """
    Convert URL to clickable HTML link.

    Missing values (``None``, float NaN, blank strings and the literal texts
    "nan" / "none" in any letter case) produce an empty string. URLs without
    an ``http://`` or ``https://`` prefix get ``https://`` prepended.

    Both the URL and the link text are HTML-escaped before being placed in
    the markup, so quotes or angle brackets in the data cannot break out of
    the ``href`` attribute or inject elements.

    Args:
        url: Full URL
        display_text: Plain text to display for the link (optional). When
            omitted or empty, a shortened form of the URL is used.

    Returns:
        HTML link string, or an empty string when there is no URL
    """
    # Imported locally so this function does not depend on a new
    # module-level import.
    from html import escape

    # Handle non-string types (like float NaN values)
    if url is None or (isinstance(url, float) and pd.isna(url)):
        return ""

    url = str(url).strip()

    # Stringified missing values are treated like an absent URL.
    if not url or url.lower() in ("nan", "none"):
        return ""

    # Ensure URL has protocol
    if not url.startswith(("http://", "https://")):
        url = f"https://{url}"

    # Use provided display text or format the URL
    text = display_text if display_text else format_url_for_display(url)

    href = escape(url, quote=True)
    label = escape(str(text), quote=False)

    return f'<a target="_blank" href="{href}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;">{label}</a>'


def get_column_display_names():
    """
    Build the lookup table from raw column names to human-friendly headers.

    The table is assembled from one small mapping per resource kind so each
    group of fields can be read on its own.

    Returns:
        A freshly built dictionary mapping column names to display names
    """
    shared_fields = {
        "name": "Name",
        "submitted_by": "Submitted By",
        "date_submitted": "Date Submitted",
    }
    dataset_fields = {
        "github_url": "GitHub",
        "huggingface_url": "HF Dataset",
        "zenodo_url": "Zenodo",
        "paper_url": "Paper",
        "website_url": "Website",
        "dataset_type": "Type",
        "task": "Tasks",
        "domain": "Domain",
        "countries": "Countries",
        "languages": "Languages",
    }
    model_fields = {
        "familia": "Family",
        "available_sizes": "Sizes (B)",
        "hf_collection_url": "HF Collection",
    }
    event_fields = {
        "titulo": "Title",
        "ponente": "Speaker",
        "bio": "Bio",
        "tipo": "Type",
        "tema": "Topic",
        "nivel_tecnico": "Tech Level",
        "fecha": "Date",
        "youtube": "YouTube",
    }
    shared_task_fields = {
        "conference_name": "Conference",
        "workshop_date": "Workshop Date",
        "registration_deadline": "Registration",
        "data_available_date": "Data Available",
        "submission_deadline": "Submission",
        "more_info_url": "More Info",
    }
    initiative_fields = {
        "type": "Type",
    }

    # Merge in declaration order so the resulting key order is stable.
    display_names = {}
    for group in (
        shared_fields,
        dataset_fields,
        model_fields,
        event_fields,
        shared_task_fields,
        initiative_fields,
    ):
        display_names.update(group)
    return display_names


def _flatten_cell_text(value):
    """
    Return a cell value as single-line text.

    Missing or falsy values become an empty string; any run of whitespace
    (including line breaks) is collapsed into a single space.
    """
    # Missing markers are tested explicitly: NaN is truthy (so it used to be
    # rendered as the text "nan") and the truth value of pd.NA raises TypeError.
    if value is None or value is pd.NA or value is pd.NaT:
        return ""
    if isinstance(value, float) and pd.isna(value):
        return ""
    if not value:
        return ""
    # str.split() without arguments splits on whitespace runs and drops
    # leading/trailing whitespace, so "\r\n" yields one space instead of two.
    return " ".join(str(value).split())


def format_dataframe_for_display(df, url_columns=None, hide_columns=None):
    """
    Format a DataFrame for better display in Gradio tables with clickable URLs.

    Steps, applied to a copy of ``df``:

    1. drop the columns listed in ``hide_columns`` (unknown names are ignored)
    2. turn every value of the ``url_columns`` into an HTML link
       (missing/blank values become an empty string)
    3. flatten the first column to single-line text
    4. rename columns using ``get_column_display_names()``

    Args:
        df: Pandas DataFrame
        url_columns: List of column names that contain URLs
        hide_columns: List of column names to hide

    Returns:
        Formatted DataFrame. An empty ``df`` is returned unchanged (same
        object, no hiding or renaming applied).
    """
    if df.empty:
        return df

    # Make a copy to avoid modifying original
    display_df = df.copy()

    # Hide specified columns
    if hide_columns:
        display_df = display_df.drop(
            columns=[col for col in hide_columns if col in display_df.columns]
        )

    # Format URL columns with clickable links
    if url_columns:
        for col in url_columns:
            if col in display_df.columns:
                display_df[col] = display_df[col].apply(
                    lambda x: (
                        make_url_clickable(x) if pd.notna(x) and str(x).strip() else ""
                    )
                )

    # Ensure first column content doesn't wrap (for name/title columns)
    first_col = display_df.columns[0] if len(display_df.columns) > 0 else None
    if first_col:
        # Keep the full text but collapse line breaks and whitespace runs so
        # the cell is displayed on a single line.
        display_df[first_col] = display_df[first_col].apply(_flatten_cell_text)

    # Rename columns to pretty names
    column_names = get_column_display_names()
    display_df = display_df.rename(columns=column_names)

    return display_df


def format_dataframe_for_html_display(df, url_columns=None, hide_columns=None):
    """
    Format a DataFrame for HTML display with clickable links.

    Column hiding, link creation, first-column clean-up and header renaming
    are delegated to ``format_dataframe_for_display`` so both renderers stay
    in sync. The table is rendered with ``escape=False`` so the generated
    links survive; to keep that safe, string cells of every column that is
    not listed in ``url_columns`` are HTML-escaped first.

    Args:
        df: Pandas DataFrame
        url_columns: List of column names that contain URLs
        hide_columns: List of column names to hide

    Returns:
        HTML string (a ``<style>`` block followed by the table), or
        ``"<p>No data available</p>"`` when ``df`` is empty
    """
    # Imported locally so this function does not depend on a new
    # module-level import.
    from html import escape

    if df.empty:
        return "<p>No data available</p>"

    link_columns = url_columns or []

    # Work on a copy: escaping must not leak into the caller's DataFrame.
    safe_df = df.copy()
    for col in safe_df.columns:
        if col in link_columns:
            # Link cells are turned into HTML by the shared formatter.
            continue
        # Only text can carry markup; numbers, dates and NaN pass through.
        safe_df[col] = safe_df[col].apply(
            lambda cell: escape(cell) if isinstance(cell, str) else cell
        )

    display_df = format_dataframe_for_display(
        safe_df, url_columns=url_columns, hide_columns=hide_columns
    )

    # Convert to HTML with custom styling
    table_html = display_df.to_html(
        escape=False,  # Keep the generated <a> tags; other text was escaped above
        index=False,  # Don't show row indices
        classes="dataframe-table",
        table_id="resources-table",
    )

    # Add custom CSS styling
    styled_html = f"""
    <style>
    .dataframe-table {{
        border-collapse: collapse;
        margin: 25px 0;
        font-size: 0.9em;
        font-family: sans-serif;
        min-width: 400px;
        box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
        width: 100%;
    }}
    .dataframe-table thead tr {{
        background-color: #009879;
        color: #ffffff;
        text-align: left;
    }}
    .dataframe-table th,
    .dataframe-table td {{
        padding: 12px 15px;
        border: 1px solid #dddddd;
    }}
    .dataframe-table tbody tr {{
        border-bottom: 1px solid #dddddd;
    }}
    .dataframe-table tbody tr:nth-of-type(even) {{
        background-color: #f3f3f3;
    }}
    .dataframe-table tbody tr:hover {{
        background-color: #f5f5f5;
    }}
    .dataframe-table a {{
        color: #009879;
        text-decoration: none;
    }}
    .dataframe-table a:hover {{
        text-decoration: underline;
    }}
    </style>
    {table_html}
    """

    return styled_html