Spaces:
Sleeping
Sleeping
| """ | |
| Shared constants for the NLP resources application. | |
| Contains lists of countries, languages, domains, tasks, and utility functions. | |
| """ | |
import html
import re
from urllib.parse import urlparse

import pandas as pd
# Country options. Most entries are Spanish-speaking countries and territories;
# the list also contains Portugal, Brazil and a catch-all "International" entry.
COUNTRIES = [
    "Spain",
    "Mexico",
    "Argentina",
    "Colombia",
    "Peru",
    "Venezuela",
    "Chile",
    "Ecuador",
    "Guatemala",
    "Cuba",
    "Bolivia",
    "Dominican Republic",
    "Honduras",
    "Paraguay",
    "El Salvador",
    "Nicaragua",
    "Costa Rica",
    "Panama",
    "Uruguay",
    "Puerto Rico",
    "Portugal",
    "Brazil",
    "International",
]
# Language options (lower-case). Besides language names, the list also holds
# the region-style values "spain" and "latam" and a catch-all "all".
# NOTE(review): "nauhatl" looks like a misspelling of "nahuatl" -- confirm
# whether stored records already use this value before correcting it.
LANGUAGES = [
    "spanish",
    "catalan",
    "basque",
    "galician",
    "guarani",
    "quechua",
    "aymara",
    "nauhatl",
    "mapudungun",
    "spain",
    "latam",
    "portuguese",
    "all",
]
# NLP task options (lower-case labels).
TASKS = [
    "text classification",
    "sentiment analysis",
    "named entity recognition",
    "part-of-speech tagging",
    "question answering",
    "text summarization",
    "machine translation",
    "language modeling",
    "text generation",
    "information extraction",
    "semantic similarity",
    "natural language inference",
]
# Subject-domain options for datasets (lower-case labels).
DOMAINS = [
    "clinical",
    "legal",
    "financial",
    "scientific",
    "news",
    "social media",
    "literature",
    "general",
    "academic",
]
# Dataset type options, named after the training stage a dataset is meant for
# (plus "benchmark").
DATASET_TYPES = ["pretraining", "benchmark", "supervised fine-tuning", "alignment"]
# Event type options. Note the mixed casing: "AMA" is upper-case, the rest
# are lower-case.
EVENT_TYPES = ["workshop", "talk", "AMA", "round table"]
# Initiative type options.
# NOTE(review): "OS company" presumably means "open-source company" -- confirm.
INITIATIVE_TYPES = [
    "project",
    "event",
    "research group",
    "community",
    "research institute",
    "non-profit",
    "OS company",
]
# Technical levels (for events), stored as the strings "1" through "5".
# NOTE(review): the direction of the scale (which end is "most technical")
# is not visible here -- confirm against the UI that consumes it.
TECHNICAL_LEVELS = ["1", "2", "3", "4", "5"]
def format_url_for_display(url: str) -> str:
    """
    Format URL for display in tables - show only the meaningful part.

    The scheme (``http://`` / ``https://``) is dropped first, then a few
    well-known hosts get a compact label:

    * Hugging Face -> the path after ``/datasets/``, ``/models/`` or
      ``/collections/`` (otherwise everything after ``huggingface.co/``)
    * GitHub       -> ``owner/repo``
    * Zenodo       -> ``zenodo:<record id>`` (``/record/`` and ``/records/``)
    * arXiv        -> ``arXiv:<id>`` (``/abs/`` and ``/pdf/`` links)
    * YouTube      -> ``YouTube:<first 8 chars of the video id>...``

    Any other URL is reduced to ``domain/first-path-segment``.

    Args:
        url: Full URL string
    Returns:
        Shortened, readable version of the URL; ``""`` for empty/blank input.
    """
    if not url or not url.strip():
        return ""
    url = url.strip()

    # Remove protocol
    if url.startswith(("http://", "https://")):
        url = url.split("://", 1)[1]

    if "huggingface.co" in url:
        # Markers are tried in priority order; the for/else falls back to
        # "everything after huggingface.co/" when none of them is present.
        for marker in ("/datasets/", "/models/", "/collections/"):
            if marker in url:
                tail = url.split(marker)[-1]
                break
        else:
            parts = url.split("huggingface.co/")
            tail = parts[-1] if len(parts) > 1 else url
        # A URL with nothing after the marker (e.g. "huggingface.co/") would
        # otherwise yield an empty, invisible label.
        return tail or url.rstrip("/")

    if "github.com" in url:
        # Extract repo name (owner/repo)
        parts = url.split("github.com/")
        if len(parts) > 1:
            # Drop empty segments so "owner/" (trailing slash) is not
            # mistaken for an owner/repo pair.
            segments = [segment for segment in parts[-1].split("/") if segment]
            if len(segments) >= 2:
                return f"{segments[0]}/{segments[1]}"
        return url

    if "zenodo.org" in url:
        # Extract record ID; Zenodo serves both "/record/<id>" and
        # "/records/<id>" URLs.
        for marker in ("/records/", "/record/"):
            if marker in url:
                return f"zenodo:{url.split(marker)[-1].split('/')[0]}"
        return url

    if "arxiv.org" in url:
        # Extract arXiv ID
        if "/abs/" in url:
            return f"arXiv:{url.split('/abs/')[-1]}"
        if "/pdf/" in url:
            return f"arXiv:{url.split('/pdf/')[-1].replace('.pdf', '')}"
        return url

    if "youtube.com" in url or "youtu.be" in url:
        # Extract the video ID and show only its first 8 characters
        if "watch?v=" in url:
            video_id = url.split("watch?v=")[-1].split("&")[0]
            return f"YouTube:{video_id[:8]}..."
        if "youtu.be/" in url:
            video_id = url.split("youtu.be/")[-1].split("?")[0]
            return f"YouTube:{video_id[:8]}..."
        return url

    # For other URLs, show domain + first path segment.
    try:
        parsed = urlparse(
            url if url.startswith(("http://", "https://")) else f"https://{url}"
        )
    except ValueError:
        # urlparse rejects malformed hosts (e.g. unbalanced "[" in an IPv6
        # literal). Fallback: limit length.
        return url[:30] + "..." if len(url) > 30 else url

    domain = parsed.netloc
    path = parsed.path.strip("/")
    if path:
        return f"{domain}/{path.split('/')[0]}"
    return domain
def make_url_clickable(url: str, display_text: str = None) -> str:
    """
    Convert URL to clickable HTML link.

    Missing values (``None``, float NaN, blank strings and the literal
    strings "nan"/"none" in any casing) produce an empty string. A URL
    without an ``http://`` or ``https://`` prefix gets ``https://`` prepended.

    Both the URL and the label are HTML-escaped before being placed in the
    markup, so quotes or angle brackets in the data cannot break out of the
    ``href`` attribute or inject tags.

    Args:
        url: Full URL
        display_text: Text to display for the link (optional); when omitted
            or empty, the label comes from ``format_url_for_display(url)``.
    Returns:
        HTML link string
    """
    # Handle non-string types (like float NaN values)
    if url is None or (isinstance(url, float) and pd.isna(url)):
        return ""
    url = str(url).strip()
    if not url or url.lower() in ("nan", "none"):
        return ""

    # Ensure URL has protocol
    if not url.startswith(("http://", "https://")):
        url = f"https://{url}"

    # Use provided display text or format the URL
    text = display_text if display_text else format_url_for_display(url)

    # Escape at the point of interpolation: quote=True for the attribute
    # value, plain text escaping for the element content.
    safe_href = html.escape(url, quote=True)
    safe_text = html.escape(str(text), quote=False)
    return f'<a target="_blank" href="{safe_href}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;">{safe_text}</a>'
def get_column_display_names():
    """
    Build the lookup from raw column names to human-readable headers.

    The mapping is assembled from one small dict per resource kind and
    merged in a fixed order; a new dict is created on every call.

    Returns:
        Dictionary mapping column names to display names
    """
    common_fields = {
        "name": "Name",
        "submitted_by": "Submitted By",
        "date_submitted": "Date Submitted",
    }
    dataset_fields = {
        "github_url": "GitHub",
        "huggingface_url": "HF Dataset",
        "zenodo_url": "Zenodo",
        "paper_url": "Paper",
        "website_url": "Website",
        "dataset_type": "Type",
        "task": "Tasks",
        "domain": "Domain",
        "countries": "Countries",
        "languages": "Languages",
    }
    model_fields = {
        "familia": "Family",
        "available_sizes": "Sizes (B)",
        "hf_collection_url": "HF Collection",
    }
    event_fields = {
        "titulo": "Title",
        "ponente": "Speaker",
        "bio": "Bio",
        "tipo": "Type",
        "tema": "Topic",
        "nivel_tecnico": "Tech Level",
        "fecha": "Date",
        "youtube": "YouTube",
    }
    shared_task_fields = {
        "conference_name": "Conference",
        "workshop_date": "Workshop Date",
        "registration_deadline": "Registration",
        "data_available_date": "Data Available",
        "submission_deadline": "Submission",
        "more_info_url": "More Info",
    }
    initiative_fields = {
        "type": "Type",
    }

    display_names = {}
    for group in (
        common_fields,
        dataset_fields,
        model_fields,
        event_fields,
        shared_task_fields,
        initiative_fields,
    ):
        display_names.update(group)
    return display_names
def _to_single_line(value):
    """Render a cell as one line of text; missing values become ""."""
    if value is None or (isinstance(value, float) and pd.isna(value)):
        return ""
    return str(value).replace("\n", " ").replace("\r", " ").strip()


def format_dataframe_for_display(df, url_columns=None, hide_columns=None):
    """
    Format a DataFrame for better display in Gradio tables with clickable URLs.

    Works on a copy; the input frame is never modified. Steps, in order:

    1. drop the columns named in ``hide_columns`` (names not present are
       ignored);
    2. turn every non-empty cell of each ``url_columns`` column into an HTML
       link via ``make_url_clickable``; missing or blank cells become "";
    3. collapse line breaks in the first remaining column so it renders on a
       single line (``None``/NaN become "", other values are stringified);
    4. rename columns using ``get_column_display_names()``.

    Empty frames go through the same steps, so their headers are hidden and
    renamed consistently with populated frames.

    Args:
        df: Pandas DataFrame
        url_columns: List of column names that contain URLs
        hide_columns: List of column names to hide
    Returns:
        Formatted DataFrame
    """
    # Make a copy to avoid modifying original
    display_df = df.copy()

    # Hide specified columns
    if hide_columns:
        display_df = display_df.drop(
            columns=[col for col in hide_columns if col in display_df.columns]
        )

    # Format URL columns with clickable links
    if url_columns:
        for col in url_columns:
            if col in display_df.columns:
                display_df[col] = display_df[col].apply(
                    lambda x: (
                        make_url_clickable(x) if pd.notna(x) and str(x).strip() else ""
                    )
                )

    # Ensure first column content doesn't wrap (for name/title columns).
    # The check is on the number of columns, not the truthiness of the label,
    # so a first column named 0 or "" is still processed.
    if len(display_df.columns) > 0:
        first_col = display_df.columns[0]
        display_df[first_col] = display_df[first_col].apply(_to_single_line)

    # Rename columns to pretty names
    return display_df.rename(columns=get_column_display_names())
def format_dataframe_for_html_display(df, url_columns=None, hide_columns=None):
    """
    Format a DataFrame for HTML display with clickable links.

    Column hiding, link generation, first-column clean-up and header renaming
    are delegated to ``format_dataframe_for_display`` so both renderers stay
    in sync. The table is emitted with ``escape=False`` so the generated
    ``<a>`` tags survive; to keep that safe, string cells in every non-URL
    column are HTML-escaped first. Non-string cells are passed through
    unchanged.

    Args:
        df: Pandas DataFrame
        url_columns: List of column names that contain URLs
        hide_columns: List of column names to hide
    Returns:
        HTML string representation of the DataFrame, preceded by a
        ``<style>`` block; ``"<p>No data available</p>"`` for an empty frame.
    """
    if df.empty:
        return "<p>No data available</p>"

    # Make a copy to avoid modifying original
    safe_df = df.copy()

    # URL columns are turned into markup later; hidden columns are dropped.
    # Everything else is plain text and must not be interpreted as HTML.
    untouched = set(url_columns or ()) | set(hide_columns or ())
    for col in safe_df.columns:
        if col not in untouched:
            safe_df[col] = safe_df[col].apply(
                lambda cell: html.escape(cell, quote=False)
                if isinstance(cell, str)
                else cell
            )

    display_df = format_dataframe_for_display(
        safe_df, url_columns=url_columns, hide_columns=hide_columns
    )

    # Convert to HTML with custom styling
    table_html = display_df.to_html(
        escape=False,  # Allow HTML in cells
        index=False,  # Don't show row indices
        classes="dataframe-table",
        table_id="resources-table",
    )

    # Add custom CSS styling
    styled_html = f"""
    <style>
    .dataframe-table {{
        border-collapse: collapse;
        margin: 25px 0;
        font-size: 0.9em;
        font-family: sans-serif;
        min-width: 400px;
        box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
        width: 100%;
    }}
    .dataframe-table thead tr {{
        background-color: #009879;
        color: #ffffff;
        text-align: left;
    }}
    .dataframe-table th,
    .dataframe-table td {{
        padding: 12px 15px;
        border: 1px solid #dddddd;
    }}
    .dataframe-table tbody tr {{
        border-bottom: 1px solid #dddddd;
    }}
    .dataframe-table tbody tr:nth-of-type(even) {{
        background-color: #f3f3f3;
    }}
    .dataframe-table tbody tr:hover {{
        background-color: #f5f5f5;
    }}
    .dataframe-table a {{
        color: #009879;
        text-decoration: none;
    }}
    .dataframe-table a:hover {{
        text-decoration: underline;
    }}
    </style>
    {table_html}
    """
    return styled_html