Commit e6fcd81
Parent(s): 3a4d296

fix filters

- main.py +16 -4
- website_texts.py +25 -18
main.py CHANGED

@@ -224,6 +224,8 @@ def make_leaderboard(lb: LBContainer) -> Leaderboard:
                 " being able to run on all datasets.",
             )
         )
+    else:
+        df_leaderboard = df_leaderboard.drop(columns=["Imputed (%) [⬇️]"])

     return Leaderboard(
         # label=f"Full Leaderboard [{lb.name}]",

@@ -280,7 +282,7 @@ class LBMatrix:
             return (
                 "Models (w/o imputation)"
                 if lb_value == "no"
-                else "Models (with imputation)"
+                else "🔹 Models (with imputation)"
             )
         if lb_key == "splits":
             return "All Repeats" if lb_value == "all" else "Lite"

@@ -303,7 +305,7 @@ class LBMatrix:
             case "medium":
                 return "Medium"
             case "tabpfn":
-                return "TabPFNv2-data"
+                return "🔸 TabPFNv2-data"
             case _:
                 raise ValueError()
         raise ValueError()

@@ -321,6 +323,14 @@ class LBMatrix:
         if element.imputation == "yes":
             blurb += "(imputed) "
         blurb += f"models."
+
+        if datasets_name == "small":
+            blurb += "<br>Small datasets contain between 500 and 10,000 samples."
+        elif datasets_name == "medium":
+            blurb += "<br>Medium datasets contain between 10,000 and 250,000 samples."
+        elif datasets_name == "TabPFNv2-compatible":
+            blurb += "<br>TabPFNv2-compatible datasets contain at most 10,000 samples, 500 features, and 10 classes."
+
         return blurb


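The first hunk above hides the imputation column whenever the leaderboard view shows only non-imputed results. A minimal sketch of that behaviour on a toy frame; the column label comes from the diff, while the sample rows and the `show_imputed` flag are made up for illustration:

```python
import pandas as pd

# Toy stand-in for the df_leaderboard built in make_leaderboard (values are made up).
df_leaderboard = pd.DataFrame({
    "Model": ["RandomForest (default)", "TabPFNv2"],
    "Elo": [1000, 1150],
    "Imputed (%) [⬇️]": [0.0, 12.5],
})

show_imputed = False  # hypothetical flag for the "w/o imputation" leaderboard view
if not show_imputed:
    # The imputation share only makes sense when imputed results are displayed.
    df_leaderboard = df_leaderboard.drop(columns=["Imputed (%) [⬇️]"])

print(df_leaderboard.columns.tolist())  # ['Model', 'Elo']
```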
@@ -414,7 +424,7 @@ def main():
                     website_texts.OVERVIEW_MODELS, elem_classes="markdown-text-box"
                 )
         with gr.Row():
-            with gr.Column(), gr.Accordion("📈 Metrics", open=False):
+            with gr.Column(), gr.Accordion("📈 Metrics, Imputation, Repeats", open=False):
                 gr.Markdown(
                     website_texts.OVERVIEW_METRICS, elem_classes="markdown-text-box"
                 )

@@ -422,7 +432,7 @@ def main():
                 gr.Markdown(
                     website_texts.OVERVIEW_REF_PIPE, elem_classes="markdown-text-box"
                 )
-        with gr.Row(), gr.Accordion("📝
+        with gr.Row(), gr.Accordion("📝 About", open=False):
             gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text-box")
         with gr.Row(), gr.Accordion("📙 Citation", open=False):
             gr.Textbox(

@@ -434,6 +444,8 @@ def main():
             )

         gr.Markdown("## 🏆 TabArena Leaderboards")
+        gr.Markdown("Change the filters below to compare models with or without imputation across repeats, tasks, and dataset subsets.")
+        gr.Markdown("")
         lb_matrix = LBMatrix()

         impute_state = gr.State(lb_matrix.imputation[0])
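The UI hunks above nest `gr.Accordion` blocks inside rows and columns. A minimal, self-contained sketch of that Gradio layout pattern; the markdown strings are placeholders, since the real app pulls its texts from website_texts.py:

```python
import gradio as gr

# Sketch of the row/column/accordion layout used in main.py (placeholder texts).
with gr.Blocks() as demo:
    gr.Markdown("## 🏆 TabArena Leaderboards")
    with gr.Row():
        with gr.Column(), gr.Accordion("📈 Metrics, Imputation, Repeats", open=False):
            gr.Markdown("Explanation of the metrics goes here.", elem_classes="markdown-text-box")
    with gr.Row(), gr.Accordion("📝 About", open=False):
        gr.Markdown("About text goes here.", elem_classes="markdown-text-box")

if __name__ == "__main__":
    demo.launch()
```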
website_texts.py CHANGED

@@ -5,6 +5,9 @@ INTRODUCTION_TEXT = """
 TabArena is a living benchmark system for predictive machine learning on tabular data.
 The goal of TabArena and its leaderboard is to assess the peak performance of
 model-specific pipelines.
+
+Expand the boxes below to learn more about the datasets, models, metrics, and reference pipelines.
+You can find more details and links to additional resources in the `About` section below.
 """

 OVERVIEW_DATASETS = """

@@ -32,7 +35,7 @@ the maintainers of TabArena.
 """
 OVERVIEW_METRICS = """
 The leaderboards are ranked based on Elo. We present several additional
-metrics. See `
+metrics. See `About` for more information on the metrics.

 **Imputation:** We also present results with imputation. The `Imputed` tab presents all results where we impute the
 performance for models that cannot run on all datasets due to task or dataset size constraints. In general, imputation

@@ -55,6 +58,7 @@ types and thus provides a reference for model-specific pipelines.
 ABOUT_TEXT = r"""
 ### Extended Overview of TabArena (References / Papers)
 We introduce TabArena and provide an overview of TabArena-v0.1.1 in our paper: https://tabarena.ai/paper-tabular-ml-iid-study.
+
 Moreover, you can find a presentation of TabArena-v0.1.1 here: https://www.youtube.com/watch?v=mcPRMcJHW2Y

 ### Using TabArena for Benchmarking

@@ -77,14 +81,20 @@ The leaderboard is ranked by Elo and includes several other metrics. Here is a short
 description for these metrics:

 #### Elo
-We evaluate models using the Elo rating system
+We evaluate models using the Elo rating system. Elo is a
 pairwise comparison-based rating system where each model's rating predicts its expected
 win probability against others, with a 400-point Elo gap corresponding to a 10 to 1
 (91\%) expected win rate. We calibrate 1000 Elo to the performance of our default
-random forest configuration across all figures, and perform
+random forest configuration across all figures, and perform bootstrapping
 to obtain 95\% confidence intervals. Elo scores are computed using ROC AUC for binary
 classification, log-loss for multiclass classification, and RMSE for regression.

+#### Improvability
+We introduce improvability as a metric that measures how many percent lower the error
+of the best method is than the current method on a dataset. This is then averaged over
+datasets. Formally, for a single dataset improvability is (err_i - besterr_i)/err_i * 100\%.
+Improvability is always between 0\% and 100\%.
+
 #### Score
 Following TabRepo, we compute a normalized score to provide an additional relative
 comparison. We linearly rescale the error such that the best method has a normalized
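The Elo description added in the hunk above states that a 400-point gap corresponds to a 10-to-1 (91%) expected win rate. A quick check of that claim with the standard Elo win-probability formula; this is plain arithmetic, not code from the repository:

```python
def elo_win_probability(rating_a: float, rating_b: float) -> float:
    """Expected probability that model A beats model B under the standard Elo model."""
    return 1.0 / (1.0 + 10 ** ((rating_b - rating_a) / 400.0))

# A 400-point gap gives 10:1 odds, i.e. roughly a 91% expected win rate,
# matching the text above (1400 vs. the 1000-Elo random-forest reference).
print(elo_win_probability(1400, 1000))  # ~0.909
```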
@@ -101,12 +111,6 @@ low ranks on some datasets. It therefore favors methods that are sometimes very
 and sometimes very bad over methods that are always mediocre, as the former are more
 likely to be useful in conjunction with other methods.

-#### Improvability
-We introduce improvability as a metric that measures how many percent lower the error
-of the best method is than the current method on a dataset. This is then averaged over
-datasets. Formally, for a single dataset improvability is (err_i - besterr_i)/err_i * 100\%.
-Improvability is always between 0\% and 100\%.
-
 ---

 ### Contact

@@ -127,21 +131,24 @@ The current core maintainers of TabArena are:
 CITATION_BUTTON_LABEL = (
     "If you use TabArena or the leaderboard in your research please cite the following:"
 )
-CITATION_BUTTON_TEXT = r"""@
-title={TabArena: A Living Benchmark for Machine Learning on Tabular Data},
-author={Nick
-
-
-url={https://arxiv.org/abs/2506.16791}
+CITATION_BUTTON_TEXT = r"""@inproceedings{erickson2025tabarena,
+  title = {TabArena: A Living Benchmark for Machine Learning on Tabular Data},
+  author = {Erickson, Nick and Purucker, Lennart and Tschalzev, Andrej and Holzm{\"u}ller, David and Desai, Prateek Mutalik and Salinas, David and Hutter, Frank},
+  booktitle = {Proceedings of the 39th Conference on Neural Information Processing Systems (NeurIPS)},
+  year = {2025},
+  url = {https://arxiv.org/abs/2506.16791}
 }
 """

 VERSION_HISTORY_BUTTON_TEXT = """
-**Current Version: TabArena-v0.1.2**
+**Current Version: TabArena-v0.1.2.1**

 The following details updates to the leaderboard (date format is YYYY/MM/DD):

-* 2025/12/
+* 2025/12/18-v0.1.2.1:
+  * Make tuning trajectories start from the default configuration.
+  * UI improvements and more user-friendly explanations.
+* 2025/11/22-v0.1.2: Add newest version of TabArena LB for NeurIPS 2025
 * New UI and new leaderboard subsets for different dataset sizes, tasks, and imputation + general polish.
 * Some metrics have been refactored and made more stable (see GitHub for details).
 * Updated Reference Pipeline to include AutoGluon v1.4 with the extreme preset.

@@ -152,6 +159,6 @@ The following details updates to the leaderboard (date format is YYYY/MM/DD):
   new overview; add Figures to LBs.
 * 2025/05-v0.1.0: Initialization of the TabArena-v0.1 leaderboard.

-Old Leaderboards can be found at:
+Old Leaderboards (with major changes) can be found at:
 * Tabarena-v0.1 and TabArena-v0.1.1: https://huggingface.co/spaces/TabArena-Legacy/TabArena-v0.1.1
 """
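The improvability metric moved into ABOUT_TEXT above is defined per dataset as (err_i - besterr_i)/err_i * 100%, averaged over datasets. A small sketch of that computation; the function name and the example error values are illustrative and not taken from the TabArena codebase:

```python
def improvability(errors: dict[str, float], best_errors: dict[str, float]) -> float:
    """Average over datasets of how many percent lower the best method's error is."""
    per_dataset = [
        (err - best_errors[name]) / err * 100.0  # between 0% and 100% when err >= best
        for name, err in errors.items()
    ]
    return sum(per_dataset) / len(per_dataset)

# Illustrative errors: 20% worse than the best on one dataset, equal to the best on the other.
print(improvability({"dataset_a": 0.25, "dataset_b": 0.10},
                    {"dataset_a": 0.20, "dataset_b": 0.10}))  # 10.0
```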