Commit e6fcd81
Parent(s): 3a4d296

fix filters

- main.py +16 -4
- website_texts.py +25 -18
main.py CHANGED

@@ -224,6 +224,8 @@ def make_leaderboard(lb: LBContainer) -> Leaderboard:
                 " being able to run on all datasets.",
             )
         )
+    else:
+        df_leaderboard = df_leaderboard.drop(columns=["Imputed (%) [⬇️]"])

     return Leaderboard(
         # label=f"Full Leaderboard [{lb.name}]",

@@ -280,7 +282,7 @@ class LBMatrix:
             return (
                 "Models (w/o imputation)"
                 if lb_value == "no"
-                else "Models (with imputation)"
+                else "🔹 Models (with imputation)"
             )
         if lb_key == "splits":
             return "All Repeats" if lb_value == "all" else "Lite"

@@ -303,7 +305,7 @@ class LBMatrix:
             case "medium":
                 return "Medium"
             case "tabpfn":
-                return "TabPFNv2-data"
+                return "🔸 TabPFNv2-data"
             case _:
                 raise ValueError()
         raise ValueError()

@@ -321,6 +323,14 @@ class LBMatrix:
         if element.imputation == "yes":
             blurb += "(imputed) "
         blurb += f"models."
+
+        if datasets_name == "small":
+            blurb += "<br>Small datasets contain between 500 and 10,000 samples."
+        elif datasets_name == "medium":
+            blurb += "<br>Medium datasets contain between 10,000 and 250,000 samples."
+        elif datasets_name == "TabPFNv2-compatible":
+            blurb += "<br>TabPFNv2-compatible datasets contain at most 10,000 samples, 500 features, and 10 classes."
+
         return blurb


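The first hunk above hides the imputation column whenever the leaderboard view shows only non-imputed results. A minimal sketch of that behaviour on a toy frame; the column label comes from the diff, while the sample rows and the `show_imputed` flag are made up for illustration:

```python
import pandas as pd

# Toy stand-in for the df_leaderboard built in make_leaderboard (values are made up).
df_leaderboard = pd.DataFrame({
    "Model": ["RandomForest (default)", "TabPFNv2"],
    "Elo": [1000, 1150],
    "Imputed (%) [⬇️]": [0.0, 12.5],
})

show_imputed = False  # hypothetical flag for the "w/o imputation" leaderboard view
if not show_imputed:
    # The imputation share only makes sense when imputed results are displayed.
    df_leaderboard = df_leaderboard.drop(columns=["Imputed (%) [⬇️]"])

print(df_leaderboard.columns.tolist())  # ['Model', 'Elo']
```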
@@ -414,7 +424,7 @@ def main():
                     website_texts.OVERVIEW_MODELS, elem_classes="markdown-text-box"
                 )
         with gr.Row():
-            with gr.Column(), gr.Accordion("📈 Metrics", open=False):
+            with gr.Column(), gr.Accordion("📈 Metrics, Imputation, Repeats", open=False):
                 gr.Markdown(
                     website_texts.OVERVIEW_METRICS, elem_classes="markdown-text-box"
                 )

@@ -422,7 +432,7 @@ def main():
                 gr.Markdown(
                     website_texts.OVERVIEW_REF_PIPE, elem_classes="markdown-text-box"
                 )
-        with gr.Row(), gr.Accordion("📝
+        with gr.Row(), gr.Accordion("📝 About", open=False):
             gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text-box")
         with gr.Row(), gr.Accordion("📙 Citation", open=False):
             gr.Textbox(

@@ -434,6 +444,8 @@ def main():
             )

         gr.Markdown("## 🏆 TabArena Leaderboards")
+        gr.Markdown("Change the filters below to compare models with or without imputation across repeats, tasks, and dataset subsets.")
+        gr.Markdown("")
         lb_matrix = LBMatrix()

         impute_state = gr.State(lb_matrix.imputation[0])
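The UI hunks above nest `gr.Accordion` blocks inside rows and columns. A minimal, self-contained sketch of that Gradio layout pattern; the markdown strings are placeholders, since the real app pulls its texts from website_texts.py:

```python
import gradio as gr

# Sketch of the row/column/accordion layout used in main.py (placeholder texts).
with gr.Blocks() as demo:
    gr.Markdown("## 🏆 TabArena Leaderboards")
    with gr.Row():
        with gr.Column(), gr.Accordion("📈 Metrics, Imputation, Repeats", open=False):
            gr.Markdown("Explanation of the metrics goes here.", elem_classes="markdown-text-box")
    with gr.Row(), gr.Accordion("📝 About", open=False):
        gr.Markdown("About text goes here.", elem_classes="markdown-text-box")

if __name__ == "__main__":
    demo.launch()
```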
website_texts.py CHANGED

@@ -5,6 +5,9 @@ INTRODUCTION_TEXT = """
 TabArena is a living benchmark system for predictive machine learning on tabular data.
 The goal of TabArena and its leaderboard is to assess the peak performance of
 model-specific pipelines.
+
+Expand the boxes below to learn more about the datasets, models, metrics, and reference pipelines.
+You can find more details and links to additional resources in the `About` section below.
 """

 OVERVIEW_DATASETS = """

@@ -32,7 +35,7 @@ the maintainers of TabArena.
 """
 OVERVIEW_METRICS = """
 The leaderboards are ranked based on Elo. We present several additional
-metrics. See `
+metrics. See `About` for more information on the metrics.

 **Imputation:** We also present results with imputation. The `Imputed` tab presents all results where we impute the
 performance for models that cannot run on all datasets due to task or dataset size constraints. In general, imputation

@@ -55,6 +58,7 @@ types and thus provides a reference for model-specific pipelines.
 ABOUT_TEXT = r"""
 ### Extended Overview of TabArena (References / Papers)
 We introduce TabArena and provide an overview of TabArena-v0.1.1 in our paper: https://tabarena.ai/paper-tabular-ml-iid-study.
+
 Moreover, you can find a presentation of TabArena-v0.1.1 here: https://www.youtube.com/watch?v=mcPRMcJHW2Y

 ### Using TabArena for Benchmarking

@@ -77,14 +81,20 @@ The leaderboard is ranked by Elo and includes several other metrics. Here is a short
 description for these metrics:

 #### Elo
-We evaluate models using the Elo rating system
+We evaluate models using the Elo rating system. Elo is a
 pairwise comparison-based rating system where each model's rating predicts its expected
 win probability against others, with a 400-point Elo gap corresponding to a 10 to 1
 (91\%) expected win rate. We calibrate 1000 Elo to the performance of our default
-random forest configuration across all figures, and perform
+random forest configuration across all figures, and perform bootstrapping
 to obtain 95\% confidence intervals. Elo scores are computed using ROC AUC for binary
 classification, log-loss for multiclass classification, and RMSE for regression.

+#### Improvability
+We introduce improvability as a metric that measures how many percent lower the error
+of the best method is than the current method on a dataset. This is then averaged over
+datasets. Formally, for a single dataset improvability is (err_i - besterr_i)/err_i * 100\%.
+Improvability is always between 0\% and 100\%.
+
 #### Score
 Following TabRepo, we compute a normalized score to provide an additional relative
 comparison. We linearly rescale the error such that the best method has a normalized
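The Elo description added in the hunk above states that a 400-point gap corresponds to a 10-to-1 (91%) expected win rate. A quick check of that claim with the standard Elo win-probability formula; this is plain arithmetic, not code from the repository:

```python
def elo_win_probability(rating_a: float, rating_b: float) -> float:
    """Expected probability that model A beats model B under the standard Elo model."""
    return 1.0 / (1.0 + 10 ** ((rating_b - rating_a) / 400.0))

# A 400-point gap gives 10:1 odds, i.e. roughly a 91% expected win rate,
# matching the text above (1400 vs. the 1000-Elo random-forest reference).
print(elo_win_probability(1400, 1000))  # ~0.909
```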
@@ -101,12 +111,6 @@ low ranks on some datasets. It therefore favors methods that are sometimes very
 and sometimes very bad over methods that are always mediocre, as the former are more
 likely to be useful in conjunction with other methods.

-#### Improvability
-We introduce improvability as a metric that measures how many percent lower the error
-of the best method is than the current method on a dataset. This is then averaged over
-datasets. Formally, for a single dataset improvability is (err_i - besterr_i)/err_i * 100\%.
-Improvability is always between 0\% and 100\%.
-
 ---

 ### Contact

@@ -127,21 +131,24 @@ The current core maintainers of TabArena are:
 CITATION_BUTTON_LABEL = (
     "If you use TabArena or the leaderboard in your research please cite the following:"
 )
-CITATION_BUTTON_TEXT = r"""@
-title={TabArena: A Living Benchmark for Machine Learning on Tabular Data},
-author={Nick
-
-
-url={https://arxiv.org/abs/2506.16791}
+CITATION_BUTTON_TEXT = r"""@inproceedings{erickson2025tabarena,
+  title = {TabArena: A Living Benchmark for Machine Learning on Tabular Data},
+  author = {Erickson, Nick and Purucker, Lennart and Tschalzev, Andrej and Holzm{\"u}ller, David and Desai, Prateek Mutalik and Salinas, David and Hutter, Frank},
+  booktitle = {Proceedings of the 39th Conference on Neural Information Processing Systems (NeurIPS)},
+  year = {2025},
+  url = {https://arxiv.org/abs/2506.16791}
 }
 """

 VERSION_HISTORY_BUTTON_TEXT = """
-**Current Version: TabArena-v0.1.2**
+**Current Version: TabArena-v0.1.2.1**

 The following details updates to the leaderboard (date format is YYYY/MM/DD):

-* 2025/12/
+* 2025/12/18-v0.1.2.1:
+  * Make tuning trajectories start from the default configuration.
+  * UI improvements and more user-friendly explanations.
+* 2025/11/22-v0.1.2: Add newest version of TabArena LB for NeurIPS 2025
 * New UI and new leaderboard subsets for different dataset sizes, tasks, and imputation + general polish.
 * Some metrics have been refactored and made more stable (see GitHub for details).
 * Updated Reference Pipeline to include AutoGluon v1.4 with the extreme preset.

@@ -152,6 +159,6 @@ The following details updates to the leaderboard (date format is YYYY/MM/DD):
   new overview; add Figures to LBs.
 * 2025/05-v0.1.0: Initialization of the TabArena-v0.1 leaderboard.

-Old Leaderboards can be found at:
+Old Leaderboards (with major changes) can be found at:
 * Tabarena-v0.1 and TabArena-v0.1.1: https://huggingface.co/spaces/TabArena-Legacy/TabArena-v0.1.1
 """
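The improvability metric moved into ABOUT_TEXT above is defined per dataset as (err_i - besterr_i)/err_i * 100%, averaged over datasets. A small sketch of that computation; the function name and the example error values are illustrative and not taken from the TabArena codebase:

```python
def improvability(errors: dict[str, float], best_errors: dict[str, float]) -> float:
    """Average over datasets of how many percent lower the best method's error is."""
    per_dataset = [
        (err - best_errors[name]) / err * 100.0  # between 0% and 100% when err >= best
        for name, err in errors.items()
    ]
    return sum(per_dataset) / len(per_dataset)

# Illustrative errors: 20% worse than the best on one dataset, equal to the best on the other.
print(improvability({"dataset_a": 0.25, "dataset_b": 0.10},
                    {"dataset_a": 0.20, "dataset_b": 0.10}))  # 10.0
```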