"""
Shared constants for the NLP resources application.

Contains lists of countries, languages, domains, tasks, and utility functions
for turning URLs and DataFrames into display-friendly values.
"""

import html
import re
from typing import Optional
from urllib.parse import urlparse

import pandas as pd

# Countries covered by the application. The list is mostly Spanish-speaking
# countries but also includes Portugal, Brazil and an "International" bucket.
COUNTRIES = [
    "Spain",
    "Mexico",
    "Argentina",
    "Colombia",
    "Peru",
    "Venezuela",
    "Chile",
    "Ecuador",
    "Guatemala",
    "Cuba",
    "Bolivia",
    "Dominican Republic",
    "Honduras",
    "Paraguay",
    "El Salvador",
    "Nicaragua",
    "Costa Rica",
    "Panama",
    "Uruguay",
    "Puerto Rico",
    "Portugal",
    "Brazil",
    "International",
]

# Languages
# NOTE(review): "nauhatl" looks like a misspelling of "nahuatl". It is kept
# unchanged here because already-stored records may use this exact value.
LANGUAGES = [
    "spanish",
    "catalan",
    "basque",
    "galician",
    "guarani",
    "quechua",
    "aymara",
    "nauhatl",
    "mapudungun",
    "spain",
    "latam",
    "portuguese",
    "all",
]

# NLP tasks
TASKS = [
    "text classification",
    "sentiment analysis",
    "named entity recognition",
    "part-of-speech tagging",
    "question answering",
    "text summarization",
    "machine translation",
    "language modeling",
    "text generation",
    "information extraction",
    "semantic similarity",
    "natural language inference",
]

# Domains for datasets
DOMAINS = [
    "clinical",
    "legal",
    "financial",
    "scientific",
    "news",
    "social media",
    "literature",
    "general",
    "academic",
]

# Dataset types
DATASET_TYPES = ["pretraining", "benchmark", "supervised fine-tuning", "alignment"]

# Event types
EVENT_TYPES = ["workshop", "talk", "AMA", "round table"]

# Initiative types
INITIATIVE_TYPES = [
    "project",
    "event",
    "research group",
    "community",
    "research institute",
    "non-profit",
    "OS company",
]

# Technical levels (for events)
TECHNICAL_LEVELS = ["1", "2", "3", "4", "5"]


def _is_missing(value) -> bool:
    """Return True for None and scalar pandas "missing" markers (NaN, NA, NaT)."""
    # is_scalar() guards pd.isna(), which returns an array for list-like input.
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def format_url_for_display(url: str) -> str:
    """
    Format URL for display in tables - show only the meaningful part.

    Args:
        url: Full URL string. Non-string or blank input yields "".

    Returns:
        Shortened, readable version of the URL
    """
    # Non-string cells (e.g. float NaN from pandas) have no .strip().
    if not isinstance(url, str) or not url.strip():
        return ""

    url = url.strip()

    # Remove protocol
    if url.startswith(("http://", "https://")):
        url = url.split("://", 1)[1]

    # Special handling for common domains
    if "huggingface.co" in url:
        # Keep whatever follows the first matching resource prefix.
        for marker in ("/datasets/", "/models/", "/collections/"):
            if marker in url:
                return url.split(marker)[-1]
        # Otherwise return everything after huggingface.co/
        parts = url.split("huggingface.co/")
        return parts[-1] if len(parts) > 1 else url

    if "github.com" in url:
        # Extract repo name (owner/repo)
        parts = url.split("github.com/")
        if len(parts) > 1:
            repo_path = parts[-1].split("/")
            if len(repo_path) >= 2:
                return f"{repo_path[0]}/{repo_path[1]}"
        return url

    if "zenodo.org" in url:
        # Extract record ID; both "/records/" and "/record/" paths are accepted.
        for marker in ("/records/", "/record/"):
            if marker in url:
                record_id = url.split(marker)[-1].split("/")[0]
                return f"zenodo:{record_id}"
        return url

    if "arxiv.org" in url:
        # Extract arXiv ID
        if "/abs/" in url:
            return f"arXiv:{url.split('/abs/')[-1]}"
        if "/pdf/" in url:
            return f"arXiv:{url.split('/pdf/')[-1].replace('.pdf', '')}"
        return url

    if "youtube.com" in url or "youtu.be" in url:
        # Show only the first 8 characters of the video ID.
        if "watch?v=" in url:
            video_id = url.split("watch?v=")[-1].split("&")[0]
            return f"YouTube:{video_id[:8]}..."
        if "youtu.be/" in url:
            video_id = url.split("youtu.be/")[-1].split("?")[0]
            return f"YouTube:{video_id[:8]}..."
        return url

    # For other URLs, try to extract domain and first path segment.
    try:
        parsed = urlparse(
            f"https://{url}" if not url.startswith(("http://", "https://")) else url
        )
    except ValueError:
        # urlparse rejects some malformed hosts; fall back to a length limit.
        return url[:30] + "..." if len(url) > 30 else url

    domain = parsed.netloc
    path = parsed.path.strip("/")
    if path:
        first_segment = path.split("/")[0]
        if first_segment:
            return f"{domain}/{first_segment}"
    return domain


def make_url_clickable(url: str, display_text: Optional[str] = None) -> str:
    """
    Convert URL to clickable HTML link.

    Args:
        url: Full URL. None, NaN/NA, blank, "nan" and "none" yield "".
        display_text: Text to display for the link (optional). When omitted,
            the result of format_url_for_display(url) is used.

    Returns:
        HTML anchor string, or "" when there is no usable URL. Both the href
        and the visible text are HTML-escaped.
    """
    # Handle non-string types (like float NaN values)
    if _is_missing(url):
        return ""

    url = str(url).strip()
    if not url or url.lower() in ("nan", "none"):
        return ""

    # Ensure URL has protocol
    if not url.startswith(("http://", "https://")):
        url = f"https://{url}"

    # Use provided display text or format the URL
    text = display_text if display_text else format_url_for_display(url)

    # Escape both parts so a quote or "<" in the data cannot break the markup.
    safe_href = html.escape(url, quote=True)
    safe_text = html.escape(text)
    return (
        f'<a href="{safe_href}" target="_blank" rel="noopener noreferrer">'
        f"{safe_text}</a>"
    )


def get_column_display_names():
    """
    Return mapping of column names to pretty display names.

    Several source columns intentionally share a display name (for example
    "dataset_type", "tipo" and "type" all map to "Type").

    Returns:
        Dictionary mapping column names to display names
    """
    return {
        # Common fields
        "name": "Name",
        "submitted_by": "Submitted By",
        "date_submitted": "Date Submitted",
        # Dataset fields
        "github_url": "GitHub",
        "huggingface_url": "HF Dataset",
        "zenodo_url": "Zenodo",
        "paper_url": "Paper",
        "website_url": "Website",
        "dataset_type": "Type",
        "task": "Tasks",
        "domain": "Domain",
        "countries": "Countries",
        "languages": "Languages",
        # Model fields
        "familia": "Family",
        "available_sizes": "Sizes (B)",
        "hf_collection_url": "HF Collection",
        # Event fields
        "titulo": "Title",
        "ponente": "Speaker",
        "bio": "Bio",
        "tipo": "Type",
        "tema": "Topic",
        "nivel_tecnico": "Tech Level",
        "fecha": "Date",
        "youtube": "YouTube",
        # Shared task fields
        "conference_name": "Conference",
        "workshop_date": "Workshop Date",
        "registration_deadline": "Registration",
        "data_available_date": "Data Available",
        "submission_deadline": "Submission",
        "more_info_url": "More Info",
        # Initiative fields
        "type": "Type",
    }


def _single_line(value) -> str:
    """Render a cell as one line of text; missing or falsy values become ""."""
    if _is_missing(value) or not value:
        return ""
    # split()/join collapses line breaks and runs of whitespace in one pass.
    return " ".join(str(value).split())


def _escape_if_text(value):
    """HTML-escape string cells; leave every other value untouched."""
    return html.escape(value) if isinstance(value, str) else value


def _prepare_display_dataframe(
    df, url_columns=None, hide_columns=None, escape_text=False
):
    """
    Build the display copy shared by both public formatting functions.

    Steps, in order: drop hidden columns, flatten the first column to a single
    line, optionally HTML-escape string cells in non-URL columns, turn URL
    columns into links, and rename columns to their display names.
    """
    # Work on a copy to avoid modifying the caller's DataFrame.
    display_df = df.copy()

    # Hide specified columns
    if hide_columns:
        display_df = display_df.drop(
            columns=[col for col in hide_columns if col in display_df.columns]
        )

    link_columns = [col for col in (url_columns or []) if col in display_df.columns]

    # Keep the first (name/title) column on a single line. Checking the column
    # count rather than the label's truthiness also covers a label such as 0.
    if len(display_df.columns) > 0:
        first_col = display_df.columns[0]
        display_df[first_col] = display_df[first_col].apply(_single_line)

    # Escape plain-text cells before any link markup is generated.
    if escape_text:
        for col in list(display_df.columns):
            if col not in link_columns:
                display_df[col] = display_df[col].apply(_escape_if_text)

    # Format URL columns with clickable links; make_url_clickable already
    # returns "" for missing or blank values.
    for col in link_columns:
        display_df[col] = display_df[col].apply(make_url_clickable)

    # Rename columns to pretty names
    return display_df.rename(columns=get_column_display_names())


def format_dataframe_for_display(df, url_columns=None, hide_columns=None):
    """
    Format a DataFrame for better display in Gradio tables with clickable URLs.

    Args:
        df: Pandas DataFrame
        url_columns: List of column names that contain URLs
        hide_columns: List of column names to hide

    Returns:
        Formatted DataFrame (a copy). An empty input is returned unchanged.
    """
    if df.empty:
        return df
    return _prepare_display_dataframe(df, url_columns, hide_columns)


def format_dataframe_for_html_display(df, url_columns=None, hide_columns=None):
    """
    Format a DataFrame for HTML display with clickable links.

    String cells outside the URL columns are HTML-escaped, because the table
    itself is rendered with escaping turned off so that links stay intact.

    Args:
        df: Pandas DataFrame
        url_columns: List of column names that contain URLs
        hide_columns: List of column names to hide

    Returns:
        HTML string representation of the DataFrame
    """
    if df.empty:
        return "<p>No data available</p>"

    display_df = _prepare_display_dataframe(
        df, url_columns, hide_columns, escape_text=True
    )

    # to_html() shortens cells longer than display.max_colwidth, which would
    # cut generated anchors in half; lift the limit for this call only.
    with pd.option_context("display.max_colwidth", None):
        table_html = display_df.to_html(
            escape=False,  # Allow HTML in cells
            index=False,  # Don't show row indices
            classes="dataframe-table",
            table_id="resources-table",
        )

    # NOTE(review): no CSS is visible in this wrapper; styling presumably comes
    # from the "dataframe-table" class / "resources-table" id - confirm.
    styled_html = f"""
    {table_html}
    """
    return styled_html