LennartPurucker committed on
Commit e6fcd81 · 1 Parent(s): 3a4d296

fix filters

Files changed (2):
  1. main.py +16 -4
  2. website_texts.py +25 -18
main.py CHANGED

@@ -224,6 +224,8 @@ def make_leaderboard(lb: LBContainer) -> Leaderboard:
                 " being able to run on all datasets.",
             )
         )
+    else:
+        df_leaderboard = df_leaderboard.drop(columns=["Imputed (%) [⬇️]"])
 
     return Leaderboard(
         # label=f"Full Leaderboard [{lb.name}]",
@@ -280,7 +282,7 @@ class LBMatrix:
             return (
                 "Models (w/o imputation)"
                 if lb_value == "no"
-                else "Models (with imputation)"
+                else "🔹 Models (with imputation)"
             )
         if lb_key == "splits":
             return "All Repeats" if lb_value == "all" else "Lite"
@@ -303,7 +305,7 @@ class LBMatrix:
             case "medium":
                 return "Medium"
             case "tabpfn":
-                return "TabPFNv2-data"
+                return "🔸 TabPFNv2-data"
             case _:
                 raise ValueError()
         raise ValueError()
@@ -321,6 +323,14 @@ class LBMatrix:
         if element.imputation == "yes":
             blurb += "(imputed) "
         blurb += f"models."
+
+        if datasets_name == "small":
+            blurb += "<br>Small datasets contain between 500 and 10,000 samples."
+        elif datasets_name == "medium":
+            blurb += "<br>Medium datasets contain between 10,000 and 250,000 samples."
+        elif datasets_name == "TabPFNv2-compatible":
+            blurb += "<br>TabPFNv2-compatible datasets contain at most 10,000 samples, 500 features, and 10 classes."
+
         return blurb
 
 
@@ -414,7 +424,7 @@ def main():
                 website_texts.OVERVIEW_MODELS, elem_classes="markdown-text-box"
             )
         with gr.Row():
-            with gr.Column(), gr.Accordion("📈 Metrics", open=False):
+            with gr.Column(), gr.Accordion("📈 Metrics, Imputation, Repeats", open=False):
                 gr.Markdown(
                     website_texts.OVERVIEW_METRICS, elem_classes="markdown-text-box"
                 )
@@ -422,7 +432,7 @@ def main():
             gr.Markdown(
                 website_texts.OVERVIEW_REF_PIPE, elem_classes="markdown-text-box"
             )
-        with gr.Row(), gr.Accordion("📝 More Details", open=False):
+        with gr.Row(), gr.Accordion("📝 About", open=False):
             gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text-box")
         with gr.Row(), gr.Accordion("📙 Citation", open=False):
             gr.Textbox(
@@ -434,6 +444,8 @@ def main():
         )
 
         gr.Markdown("## 🏆 TabArena Leaderboards")
+        gr.Markdown("Change the filters below to compare models with or without imputation across repeats, tasks, and dataset subsets.")
+        gr.Markdown("")
        lb_matrix = LBMatrix()
 
        impute_state = gr.State(lb_matrix.imputation[0])
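The core of the main.py change is the pattern visible above: a filter value held in `gr.State` decides how the leaderboard DataFrame is rendered, and the `Imputed (%) [⬇️]` column is dropped whenever the no-imputation view is active. Below is a minimal, runnable sketch of that pattern with toy data; the names (`DF`, `render`, `demo`) and values are illustrative stand-ins, not the actual TabArena app code.

```python
import gradio as gr
import pandas as pd

# Toy stand-in for the real leaderboard DataFrame (hypothetical values).
DF = pd.DataFrame({
    "Model": ["ModelA", "ModelB"],
    "Elo": [1100, 1000],
    "Imputed (%) [⬇️]": [0.0, 12.5],
})

def render(imputation: str) -> pd.DataFrame:
    # Mirrors the fix above: hide the imputation column when the
    # "no imputation" filter is selected.
    if imputation == "no":
        return DF.drop(columns=["Imputed (%) [⬇️]"])
    return DF

with gr.Blocks() as demo:
    choice = gr.Radio(["no", "yes"], value="no", label="Imputation")
    table = gr.Dataframe(value=render("no"))
    choice.change(render, inputs=choice, outputs=table)

if __name__ == "__main__":
    demo.launch()
```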
website_texts.py CHANGED

@@ -5,6 +5,9 @@ INTRODUCTION_TEXT = """
 TabArena is a living benchmark system for predictive machine learning on tabular data.
 The goal of TabArena and its leaderboard is to assess the peak performance of
 model-specific pipelines.
+
+Expand the boxes below to learn more about the datasets, models, metrics, and reference pipelines.
+You can find more details and links to additional resources in the `About` section below.
 """
 
 OVERVIEW_DATASETS = """
@@ -32,7 +35,7 @@ the maintainers of TabArena.
 """
 OVERVIEW_METRICS = """
 The leaderboards are ranked based on Elo. We present several additional
-metrics. See `More Details` for more information on the metrics.
+metrics. See `About` for more information on the metrics.
 
 **Imputation:** We also present results with imputation. The `Imputed` tab presents all results where we impute the
 performance for models that cannot run on all datasets due to task or dataset size constraints. In general, imputation
@@ -55,6 +58,7 @@ types and thus provides a reference for model-specific pipelines.
 ABOUT_TEXT = r"""
 ### Extended Overview of TabArena (References / Papers)
 We introduce TabArena and provide an overview of TabArena-v0.1.1 in our paper: https://tabarena.ai/paper-tabular-ml-iid-study.
+
 Moreover, you can find a presentation of TabArena-v0.1.1 here: https://www.youtube.com/watch?v=mcPRMcJHW2Y
 
 ### Using TabArena for Benchmarking
@@ -77,14 +81,20 @@ The leaderboard is ranked by Elo and includes several other metrics. Here is a short
 description for these metrics:
 
 #### Elo
-We evaluate models using the Elo rating system, following Chatbot Arena. Elo is a
+We evaluate models using the Elo rating system. Elo is a
 pairwise comparison-based rating system where each model's rating predicts its expected
 win probability against others, with a 400-point Elo gap corresponding to a 10 to 1
 (91\%) expected win rate. We calibrate 1000 Elo to the performance of our default
-random forest configuration across all figures, and perform 100 rounds of bootstrapping
+random forest configuration across all figures, and perform bootstrapping
 to obtain 95\% confidence intervals. Elo scores are computed using ROC AUC for binary
 classification, log-loss for multiclass classification, and RMSE for regression.
 
+#### Improvability
+We introduce improvability as a metric that measures how many percent lower the error
+of the best method is than the current method on a dataset. This is then averaged over
+datasets. Formally, for a single dataset improvability is (err_i - besterr_i)/err_i * 100\%.
+Improvability is always between 0\% and 100\%.
+
 #### Score
 Following TabRepo, we compute a normalized score to provide an additional relative
 comparison. We linearly rescale the error such that the best method has a normalized
@@ -101,12 +111,6 @@ low ranks on some datasets. It therefore favors methods that are sometimes very good
 and sometimes very bad over methods that are always mediocre, as the former are more
 likely to be useful in conjunction with other methods.
 
-#### Improvability
-We introduce improvability as a metric that measures how many percent lower the error
-of the best method is than the current method on a dataset. This is then averaged over
-datasets. Formally, for a single dataset improvability is (err_i - besterr_i)/err_i * 100\%.
-Improvability is always between 0\% and 100\%.
-
 ---
 
 ### Contact
@@ -127,21 +131,24 @@ The current core maintainers of TabArena are:
 CITATION_BUTTON_LABEL = (
     "If you use TabArena or the leaderboard in your research please cite the following:"
 )
-CITATION_BUTTON_TEXT = r"""@article{erickson2025tabarena,
-  title={TabArena: A Living Benchmark for Machine Learning on Tabular Data},
-  author={Nick Erickson and Lennart Purucker and Andrej Tschalzev and David Holzmüller and Prateek Mutalik Desai and David Salinas and Frank Hutter},
-  year={2025},
-  journal={arXiv preprint arXiv:2506.16791},
-  url={https://arxiv.org/abs/2506.16791},
+CITATION_BUTTON_TEXT = r"""@inproceedings{erickson2025tabarena,
+  title = {TabArena: A Living Benchmark for Machine Learning on Tabular Data},
+  author = {Erickson, Nick and Purucker, Lennart and Tschalzev, Andrej and Holzm{\"u}ller, David and Desai, Prateek Mutalik and Salinas, David and Hutter, Frank},
+  booktitle = {Proceedings of the 39th Conference on Neural Information Processing Systems (NeurIPS)},
+  year = {2025},
+  url = {https://arxiv.org/abs/2506.16791}
 }
 """
 
 VERSION_HISTORY_BUTTON_TEXT = """
-**Current Version: TabArena-v0.1.2**
+**Current Version: TabArena-v0.1.2.1**
 
 The following details updates to the leaderboard (date format is YYYY/MM/DD):
 
-* 2025/12/01-v0.1.2: Add newest version of TabArena LB for NeurIPS 2025
+* 2025/12/18-v0.1.2.1:
+    * Make tuning trajectories start from the default configuration.
+    * UI improvements and more user-friendly explanations.
+* 2025/11/22-v0.1.2: Add newest version of TabArena LB for NeurIPS 2025
     * New UI and new leaderboard subsets for different dataset sizes, tasks, and imputation + general polish.
     * Some metrics have been refactored and made more stable (see GitHub for details).
     * Updated Reference Pipeline to include AutoGluon v1.4 with the extreme preset.
@@ -152,6 +159,6 @@ The following details updates to the leaderboard (date format is YYYY/MM/DD):
   new overview; add Figures to LBs.
 * 2025/05-v0.1.0: Initialization of the TabArena-v0.1 leaderboard.
 
-Old Leaderboards can be found at:
+Old Leaderboards (with major changes) can be found at:
 * Tabarena-v0.1 and TabArena-v0.1.1: https://huggingface.co/spaces/TabArena-Legacy/TabArena-v0.1.1
 """