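"""Gradio app for the TabArena leaderboard website.

The app loads pre-computed leaderboard tables (CSV) and figures (PNG, shipped as ZIP)
from the local ``data/`` directory and renders them as filterable leaderboards, one per
combination of imputation mode, split protocol, task type, and dataset subset.
"""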
from __future__ import annotations

import os
import re
import zipfile
from copy import deepcopy
from dataclasses import dataclass
from functools import partial
from pathlib import Path

import gradio as gr
import numpy as np
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns

import website_texts
from constants import model_type_emoji
from website_texts import (
    ABOUT_TEXT,
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    INTRODUCTION_TEXT,
    TITLE,
    VERSION_HISTORY_BUTTON_TEXT,
)
@dataclass
class LBContainer:
    """Holds the table and figures for one leaderboard subset."""

    name: str
    base_path_to_results: str
    blurb: str | None = None
    n_datasets: int | None = None

    def __post_init__(self):
        # The results directory contains a marker entry `n_datasets_<N>`
        # encoding how many datasets this leaderboard covers.
        for fname in os.listdir(self._base_path):
            match = re.match(r"n_datasets_(.+)", fname)
            if match:
                self.n_datasets = int(match.group(1))
                break

    @property
    def _base_path(self) -> Path:
        return Path(__file__).parent / "data" / self.base_path_to_results

    def load_df_leaderboard(self) -> pd.DataFrame:
        df = pd.read_csv(self._base_path.resolve() / "website_leaderboard.csv")
        df = df.rename(columns={"1#": "#"})
        return df

    def _handle_img_zip(self, img_name: str) -> str:
        """Return the path to a figure, extracting its ZIP archive if needed."""
        _base_path = self._base_path / img_name
        zip_path = _base_path.with_suffix(".png.zip")
        img_path = _base_path.with_suffix(".png")

        if img_path.exists():
            return str(img_path)

        with zipfile.ZipFile(zip_path, "r") as zipf:
            zipf.extractall(img_path.parent)
        return str(img_path)

    def get_path_to_tuning_impact_elo(self) -> str:
        return self._handle_img_zip("tuning-impact-elo")

    def get_path_to_pareto_front_improvability_vs_time_infer(self) -> str:
        return self._handle_img_zip("pareto_front_improvability_vs_time_infer")

    def get_path_to_pareto_n_configs_imp(self) -> str:
        return self._handle_img_zip("pareto_n_configs_imp")

    def get_path_to_winrate_matrix(self) -> str:
        return self._handle_img_zip("winrate_matrix")
def make_overview_images(lb: LBContainer, subset_name):
    # Main Figure
    gr.Image(
        lb.get_path_to_tuning_impact_elo(),
        label=f"Leaderboard Overview [{subset_name}]",
        show_label=True,
        height=500,
        show_share_button=True,
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Image(
                value=lb.get_path_to_pareto_front_improvability_vs_time_infer(),
                label=f"Inference Time Pareto Front [{subset_name}]",
                height=400,
                show_label=True,
                show_share_button=True,
            )
        with gr.Column(scale=1):
            gr.Image(
                value=lb.get_path_to_pareto_n_configs_imp(),
                label=f"Tuning Trajectories [{subset_name}]",
                height=400,
                show_label=True,
                show_share_button=True,
            )
def make_overview_leaderboard(lbs: list[LBContainer]):
    # Create one rank column per leaderboard.
    all_models = {
        m.split("[")[0].strip()
        for lb in lbs
        for m in lb.df_leaderboard[
            ~lb.df_leaderboard["TypeName"].isin(["Reference Pipeline"])
        ]["Model"]
        .unique()
        .tolist()
    }

    full_df = None
    for lb in lbs:
        df = lb.df_leaderboard.copy()
        df = df[~df["TypeName"].isin(["Reference Pipeline"])]
        df[lb.name] = df["Elo [⬆️]"].rank(ascending=False, method="first").astype(int)
        df = df.sort_values(by=lb.name, ascending=True)
        df = df[["Type", "Model", lb.name]]

        # Remove imputed message.
        df["Model"] = (
            df["Model"].apply(lambda x: x.split("[")[0].strip()).astype("string")
        )

        if full_df is None:
            # TODO: add support in case a model did not run on the full LB.
            assert all_models.difference(set(df["Model"].unique())) == set()
            full_df = df
        else:
            df = df[["Model", lb.name]]
            df_models = set(df["Model"].unique())
            missing_models = all_models.difference(df_models)
            if missing_models:
                missing_models_df = pd.DataFrame(
                    [[mm, "--"] for mm in missing_models],
                    columns=["Model", lb.name],
                )
                df = pd.concat([df, missing_models_df], ignore_index=True)
                df["Model"] = df["Model"].astype("string")

            # Merge
            full_df = full_df.merge(df, how="left", on="Model", validate="1:1")

    medal_colors = ["#998A00", "#808080", "#8C5520"]

    # Highlight function
    def highlight_top3(col):
        styles = [""] * len(col)
        for index_i in range(len(col)):
            if (not isinstance(col.iloc[index_i], str)) and col.iloc[index_i] <= 3:
                styles[index_i] = (
                    f"background-color: {medal_colors[col.iloc[index_i] - 1]};"
                )
        return styles

    styler = full_df.style.apply(highlight_top3, axis=0, subset=[lb.name for lb in lbs])

    return gr.DataFrame(
        styler,
        pinned_columns=2,
        interactive=False,
        show_search="search",
        label="The ranking of all models (with imputation) across various leaderboards.",
    )
def make_leaderboard(lb: LBContainer) -> Leaderboard:
    df_leaderboard = lb.load_df_leaderboard()

    # -- Add filters
    df_leaderboard["TypeFiler"] = df_leaderboard["TypeName"].apply(
        lambda m: f"{m} {model_type_emoji[m]}"
    )
    df_leaderboard["Only Default"] = df_leaderboard["Model"].str.contains(
        "(default)", regex=False
    )
    df_leaderboard["Only Tuned"] = df_leaderboard["Model"].str.contains(
        "(tuned)", regex=False
    )
    df_leaderboard["Only Tuned + Ensembled"] = df_leaderboard["Model"].str.contains(
        r"(tuned + ensembled)", regex=False
    ) | df_leaderboard["Model"].str.contains(r"AutoGluon", regex=False)

    filter_columns = [
        ColumnFilter("TypeFiler", type="checkboxgroup", label="🤖 Model Types"),
        ColumnFilter("Only Default", type="boolean", default=False),
        ColumnFilter("Only Tuned", type="boolean", default=False),
        ColumnFilter("Only Tuned + Ensembled", type="boolean", default=False),
    ]

    # Map each column to a gradio_leaderboard display datatype.
    datatypes = []
    for dt in df_leaderboard.dtypes:
        if dt == bool:
            datatypes.append("bool")
        elif np.issubdtype(dt, np.number):
            datatypes.append("number")
        else:
            datatypes.append("markdown")

    # Add Imputed count postfix
    if any(df_leaderboard["Imputed"]):
        df_leaderboard["Imputed"] = df_leaderboard["Imputed"].replace(
            {
                True: "Imputed",
                False: "Not Imputed",
            }
        )
        datatypes.append("bool")
        filter_columns.append(
            ColumnFilter(
                "Imputed",
                type="checkboxgroup",
                label="(Not) Imputed Models",
                info="We impute the performance for models that cannot run on all"
                " datasets due to task or dataset size constraints. We impute with"
                " the performance of a default RandomForest."
                " We add a postfix [X% IMPUTED] to the model if any results were"
                " imputed. The X% shows the percentage of"
                " datasets that were imputed. In general, imputation under-represents"
                " the model's performance, penalizing the model for not"
                " being able to run on all datasets.",
            )
        )
    else:
        df_leaderboard = df_leaderboard.drop(columns=["Imputed (%) [⬇️]"])

    return Leaderboard(
        # label=f"Full Leaderboard [{lb.name}]",
        elem_id=f"lb_for_{lb.name}",
        value=df_leaderboard,
        datatype=datatypes,
        select_columns=SelectColumns(
            default_selection=list(df_leaderboard.columns),
            cant_deselect=["Type", "Model"],
            label="Select Columns to Display:",
        ),
        hide_columns=[
            "TypeName",
            "TypeFiler",
            "RefModel",
            "Only Default",
            "Only Tuned",
            "Only Tuned + Ensembled",
            "Imputed",
        ],
        search_columns=["Model", "TypeName"],
        filter_columns=filter_columns,
        bool_checkboxgroup_label="Custom Views (exclusive, only toggle one at a time):",
        height=800,
    )
@dataclass
class LBMatrixElement:
    """One cell of the leaderboard matrix, identifying a results subdirectory."""

    imputation: str
    splits: str
    tasks: str
    datasets: str

    def get_path_to_results(self) -> str:
        return (
            f"imputation_{self.imputation}/"
            f"splits_{self.splits}/"
            f"tasks_{self.tasks}/"
            f"datasets_{self.datasets}/"
        )
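# Assumed on-disk layout, inferred from LBContainer._base_path,
# LBMatrixElement.get_path_to_results, and the file names used above,
# e.g. for the default leaderboard:
#
#   data/imputation_no/splits_all/tasks_all/datasets_all/
#       n_datasets_<N>              # marker entry encoding the dataset count
#       website_leaderboard.csv     # leaderboard table
#       tuning-impact-elo.png.zip   # figures, shipped zipped and extracted on demand
#       ...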
class LBMatrix:
    """The matrix of available leaderboard subsets."""

    imputation = ["no", "yes"]
    splits = ["all", "lite"]
    tasks = ["all", "classification", "regression"]
    datasets = ["all", "small", "medium", "tabpfn"]

    @staticmethod
    def get_name_for_lb(lb_key, lb_value):
        if lb_key == "imputation":
            return (
                "Models (w/o imputation)"
                if lb_value == "no"
                else "🔹 Models (with imputation)"
            )
        if lb_key == "splits":
            return "All Repeats" if lb_value == "all" else "Lite"
        if lb_key == "tasks":
            match lb_value:
                case "all":
                    return "All Tasks"
                case "classification":
                    return "Classification"
                case "regression":
                    return "Regression"
                case _:
                    raise ValueError()
        if lb_key == "datasets":
            match lb_value:
                case "all":
                    return "All Datasets"
                case "small":
                    return "Small"
                case "medium":
                    return "Medium"
                case "tabpfn":
                    return "🔸 TabPFNv2-data"
                case _:
                    raise ValueError()
        raise ValueError()

    def element_to_blurb(self, element: LBMatrixElement, n_datasets: int) -> str:
        datasets_name = (
            element.datasets if element.datasets != "tabpfn" else "TabPFNv2-compatible"
        )
        blurb = f"Leaderboard for {n_datasets} datasets ({datasets_name} datasets, {element.tasks} tasks) "
        if element.splits == "lite":
            blurb += "for one split (1st fold, 1st repeat) "
        blurb += "including all "
        if element.imputation == "yes":
            blurb += "(imputed) "
        blurb += "models."

        if datasets_name == "small":
            blurb += "<br>Small datasets contain between 500 and 10,000 samples."
        elif datasets_name == "medium":
            blurb += "<br>Medium datasets contain between 10,000 and 250,000 samples."
        elif datasets_name == "TabPFNv2-compatible":
            blurb += "<br>TabPFNv2-compatible datasets contain at most 10,000 samples, 500 features, and 10 classes."
        return blurb
def render_details(imputation, splits, tasks, datasets, lb_matrix):
    """
    Renders the heavy content (images, dataframes).
    """
    impute_t_name = lb_matrix.get_name_for_lb("imputation", imputation)
    splits_t = lb_matrix.get_name_for_lb("splits", splits)
    tasks_t_name = lb_matrix.get_name_for_lb("tasks", tasks)
    datasets_t_name = lb_matrix.get_name_for_lb("datasets", datasets)

    lb_element = LBMatrixElement(
        imputation=imputation,
        splits=splits,
        tasks=tasks,
        datasets=datasets,
    )
    lb = LBContainer(
        name=f"{impute_t_name} | {splits_t} | {tasks_t_name} | {datasets_t_name}",
        base_path_to_results=lb_element.get_path_to_results(),
    )
    lb.blurb = lb_matrix.element_to_blurb(
        lb_element,
        n_datasets=lb.n_datasets,
    )

    gr.Markdown(
        lb.blurb,
        elem_classes="markdown-text",
    )
    make_overview_images(lb, subset_name=lb.name)

    # Render Leaderboard safely
    with gr.Group():
        gr.Markdown(
            "## ⭐ Full Leaderboard Table",
            elem_classes="markdown-text",
        )
        make_leaderboard(lb)

    gr.Image(
        lb.get_path_to_winrate_matrix(),
        label=f"Win Rate Matrix [{lb.name}]",
        show_label=True,
        height=800,
        show_share_button=True,
    )
def render_func(evt: gr.SelectData):
    print(f"Tab Selected: {evt.value} (Index: {evt.index})")
def main():
    css = """
    .markdown-text-box {
        padding: 4px;
        border-radius: 2px;
    }
    .tab-buttons {
        margin-top: -14px !important;
        margin-bottom: -14px !important;
    }
    """

    # Force dark mode via the `__theme` query parameter.
    js_func = """
    function refresh() {
        const url = new URL(window.location);
        if (url.searchParams.get('__theme') !== 'dark') {
            url.searchParams.set('__theme', 'dark');
            window.location.href = url.href;
        }
    }
    """

    with gr.Blocks(css=css, js=js_func, title="TabArena") as website:
        gr.HTML(TITLE)

        # -- Introduction
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
        with gr.Row():
            with gr.Column(), gr.Accordion("📊 Datasets", open=False):
                gr.Markdown(
                    website_texts.OVERVIEW_DATASETS, elem_classes="markdown-text-box"
                )
            with gr.Column(), gr.Accordion("🤖 Models", open=False):
                gr.Markdown(
                    website_texts.OVERVIEW_MODELS, elem_classes="markdown-text-box"
                )
        with gr.Row():
            with gr.Column(), gr.Accordion(
                "📏 Metrics, Imputation, Repeats", open=False
            ):
                gr.Markdown(
                    website_texts.OVERVIEW_METRICS, elem_classes="markdown-text-box"
                )
            with gr.Column(), gr.Accordion("🔗 Reference Pipelines", open=False):
                gr.Markdown(
                    website_texts.OVERVIEW_REF_PIPE, elem_classes="markdown-text-box"
                )
        with gr.Row(), gr.Accordion("📖 About", open=False):
            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text-box")
        with gr.Row(), gr.Accordion("📜 Citation", open=False):
            gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=7,
                elem_id="citation-button",
                show_copy_button=True,
            )

        gr.Markdown("## 🏆 TabArena Leaderboards")
        gr.Markdown(
            "Change the filters below to compare models with or without imputation"
            " across repeats, tasks, and dataset subsets."
        )
        gr.Markdown("")

        lb_matrix = LBMatrix()
        impute_state = gr.State(lb_matrix.imputation[0])
        splits_state = gr.State(lb_matrix.splits[0])
        tasks_state = gr.State(lb_matrix.tasks[0])
        datasets_state = gr.State(lb_matrix.datasets[0])

        # Impute
        with gr.Tabs(elem_classes="tab-buttons") as impute_tabs:
            for impute_t in lb_matrix.imputation:
                with gr.TabItem(
                    lb_matrix.get_name_for_lb("imputation", impute_t),
                    id=impute_t,
                ) as t_impute:
                    t_impute.select(lambda x=impute_t: x, outputs=impute_state)

                    # Splits
                    with gr.Tabs(elem_classes="tab-buttons") as split_tabs:
                        for splits_t in lb_matrix.splits:
                            with gr.TabItem(
                                lb_matrix.get_name_for_lb("splits", splits_t),
                                id=f"{impute_t}_{splits_t}",
                            ) as t_splits:
                                t_splits.select(
                                    lambda x=splits_t: x, outputs=splits_state
                                )

                                # Tasks
                                with gr.Tabs(elem_classes="tab-buttons") as task_tabs:
                                    for tasks_t in lb_matrix.tasks:
                                        with gr.TabItem(
                                            lb_matrix.get_name_for_lb("tasks", tasks_t),
                                            id=f"{impute_t}_{splits_t}_{tasks_t}",
                                        ) as t_tasks:
                                            t_tasks.select(
                                                lambda x=tasks_t: x,
                                                outputs=tasks_state,
                                            )

                                            # Datasets
                                            with gr.Tabs(
                                                elem_classes="tab-buttons"
                                            ) as dataset_tabs:
                                                for datasets_t in lb_matrix.datasets:
                                                    with gr.TabItem(
                                                        lb_matrix.get_name_for_lb(
                                                            "datasets", datasets_t
                                                        ),
                                                        id=f"{impute_t}_{splits_t}_{tasks_t}_{datasets_t}",
                                                    ) as t_dataset:
                                                        t_dataset.select(
                                                            lambda x=datasets_t: x,
                                                            outputs=datasets_state,
                                                        )

        with gr.Column():
            # Re-render the detail view (figures + leaderboard table) whenever
            # one of the selection states changes.
            @gr.render(
                inputs=[impute_state, splits_state, tasks_state, datasets_state]
            )
            def reactive_render(sel_i, sel_s, sel_t, sel_d):
                render_details(
                    imputation=sel_i,
                    splits=sel_s,
                    tasks=sel_t,
                    datasets=sel_d,
                    lb_matrix=lb_matrix,
                )

        with gr.Row(), gr.Accordion("📅 Version History", open=False):
            gr.Markdown(VERSION_HISTORY_BUTTON_TEXT, elem_classes="markdown-text")

    website.launch(show_error=True, ssr_mode=False, debug=True)


if __name__ == "__main__":
    main()