Commit · 5f70754
1 Parent(s): 734648f

feat: Update app with log rank scores

app.py CHANGED
@@ -43,11 +43,14 @@ The generative models are evaluated using in-context learning with few-shot prom
 The few-shot examples are sampled randomly from the training split, and we benchmark
 the models 10 times with bootstrapped test sets and different few-shot examples in each
 iteration. This allows us to better measure the uncertainty of the results. We use the
-uncertainty in the radial plot when we compute the
-
-
-
-
+uncertainty in the radial plot when we compute the rank scores for the models. Namely,
+we compute the rank score by first computing the rank of the model on each task, where
+two models are considered to have the same rank if there is not a statistically
+significant difference between their scores (one-tailed t-test with p < 0.05). We next
+apply a logarithmic transformation to the ranks, to downplay the importance of the
+poorly performing models. Lastly, we invert and normalise the logarithmic ranks to the
+range [0, 1], resulting in the best performing models having large rank scores and the
+worst performing models having small rank scores.
 
 ## The Benchmark Datasets
 
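
The added paragraph above is the substance of this commit, so a small, self-contained
sketch of the described procedure may help. This is not the code from app.py (the
app's own incremental loop appears in the later hunks); the function name
rank_scores_for, the bootstrap_scores input format and the use of
scipy.stats.ttest_ind are assumptions made purely for illustration.

import numpy as np
from scipy import stats


def rank_scores_for(bootstrap_scores: dict[str, list[float]]) -> dict[str, float]:
    """Compute log rank scores for a single (task, language) pair.

    Args:
        bootstrap_scores:
            Mapping from model ID to its bootstrapped scores (higher is better).

    Returns:
        Mapping from model ID to a rank score in [0, 1], where 1 is best.
    """
    # Sort the models by mean score, best first
    model_ids = sorted(bootstrap_scores, key=lambda m: -np.mean(bootstrap_scores[m]))

    # Assign ranks, keeping a model on the current rank when a one-tailed t-test
    # cannot separate its scores from the current rank's leading scores (p < 0.05)
    ranks: list[int] = []
    rank = 1
    best_scores = bootstrap_scores[model_ids[0]]
    for model_id in model_ids:
        scores = bootstrap_scores[model_id]
        p_value = stats.ttest_ind(best_scores, scores, alternative="greater").pvalue
        if p_value < 0.05:
            rank += 1
            best_scores = scores
        ranks.append(rank)

    # Log-transform the ranks, normalise them to [0, 1] and invert them, so the
    # best models end up with rank scores close to 1
    log_ranks = np.log(ranks)
    normalised = log_ranks / log_ranks.max() if log_ranks.max() > 0 else log_ranks
    return {m: 1 - s for m, s in zip(model_ids, normalised)}

The logarithm is what downplays the poorly performing models: dropping from rank 1 to
rank 2 costs far more rank score than dropping from rank 9 to rank 10.
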
@@ -276,8 +279,9 @@ def main() -> None:
                 scale=2,
             )
             with gr.Row():
-
-                label="Compare models with
+                use_rank_score_checkbox = gr.Checkbox(
+                    label="Compare models with rank scores (as opposed to raw "
+                    "scores)",
                     value=True,
                     interactive=True,
                     scale=1,
@@ -316,7 +320,7 @@ def main() -> None:
             value=produce_radial_plot(
                 model_ids_dropdown.value,
                 language_names=language_names_dropdown.value,
-
+                use_rank_score=use_rank_score_checkbox.value,
                 show_scale=show_scale_checkbox.value,
                 plot_width=plot_width_slider.value,
                 plot_height=plot_height_slider.value,
@@ -346,7 +350,7 @@ def main() -> None:
         inputs=[
             model_ids_dropdown,
             language_names_dropdown,
-
+            use_rank_score_checkbox,
             show_scale_checkbox,
             plot_width_slider,
             plot_height_slider,
@@ -355,7 +359,7 @@ def main() -> None:
         )
         language_names_dropdown.change(**update_plot_kwargs)
         model_ids_dropdown.change(**update_plot_kwargs)
-
+        use_rank_score_checkbox.change(**update_plot_kwargs)
         show_scale_checkbox.change(**update_plot_kwargs)
         plot_width_slider.change(**update_plot_kwargs)
         plot_height_slider.change(**update_plot_kwargs)
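
For readers who do not know Gradio, the hunks above simply add one more checkbox and
register it with the same update_plot_kwargs event plumbing as the existing controls.
A stripped-down sketch of that pattern follows, assuming a toy make_plot function and
a plain bar chart instead of the app's radial plot; none of it is taken verbatim from
app.py.

import gradio as gr
import plotly.graph_objects as go


def make_plot(use_rank_score: bool) -> go.Figure:
    """Return a dummy bar chart, switching between two made-up score sets."""
    values = [1.0, 0.57, 0.14] if use_rank_score else [71.2, 65.4, 48.9]
    return go.Figure(go.Bar(x=["model-a", "model-b", "model-c"], y=values))


with gr.Blocks() as demo:
    use_rank_score_checkbox = gr.Checkbox(
        label="Compare models with rank scores (as opposed to raw scores)",
        value=True,
        interactive=True,
    )
    plot = gr.Plot(value=make_plot(use_rank_score_checkbox.value))

    # Re-render the plot whenever the checkbox is toggled, mirroring the
    # use_rank_score_checkbox.change(**update_plot_kwargs) call in the diff
    use_rank_score_checkbox.change(
        fn=make_plot, inputs=[use_rank_score_checkbox], outputs=[plot]
    )

if __name__ == "__main__":
    demo.launch()
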
@@ -453,7 +457,7 @@ def update_model_ids_dropdown(
 def produce_radial_plot(
     model_ids: list[str],
     language_names: list[str],
-
+    use_rank_score: bool,
     show_scale: bool,
     plot_width: int,
     plot_height: int,
@@ -466,8 +470,8 @@ def produce_radial_plot(
             The ids of the models to include in the plot.
         language_names:
             The names of the languages to include in the plot.
-
-            Whether to use
+        use_rank_score:
+            Whether to use rank scores (as opposed to raw scores).
         show_scale:
             Whether to show the scale on the plot.
         plot_width:
@@ -515,8 +519,8 @@ def produce_radial_plot(
     ]
 
 
-    logger.info("Computing
-
+    logger.info("Computing rank scores...")
+    all_rank_scores: dict[Task, dict[Language, dict[str, float]]] = {
         task: {
             language: dict()
             for language in languages
@@ -546,10 +550,11 @@ def produce_radial_plot(
                 best_scores = scores
             ranks.append(rank)
 
-
-
-
-
+            log_ranks = np.log(ranks)
+            scores = log_ranks / log_ranks.max()
+            for model_id, score in zip(model_ids_sorted, scores):
+                all_rank_scores[task][language][model_id] = 1 - score
+    logger.info("Successfully computed rank scores.")
 
     # Add all the evaluation results for each model
     results: list[list[float]] = list()
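
As a quick sanity check of the transformation added above, here is a worked example
with made-up ranks for five models on a single task/language pair; the model labels
are purely illustrative.

import numpy as np

ranks = [1, 2, 2, 4, 5]                # the 2nd and 3rd models tie on rank 2
log_ranks = np.log(ranks)              # [0.000, 0.693, 0.693, 1.386, 1.609]
scores = log_ranks / log_ranks.max()   # [0.000, 0.431, 0.431, 0.861, 1.000]
rank_scores = 1 - scores               # [1.000, 0.569, 0.569, 0.139, 0.000]
print({f"model-{i}": round(float(s), 3) for i, s in enumerate(rank_scores, start=1)})

Tied models share a rank score, the best model always ends up at 1 and the worst at 0,
exactly as described in the prose added at the top of the file.
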
@@ -557,7 +562,7 @@ def produce_radial_plot(
         result_list = list()
         for task in tasks:
 
-
+            rank_scores = list()
             scores = list()
             for language in languages:
                 if model_id not in results_dfs_filtered[language].index:
@@ -565,15 +570,15 @@ def produce_radial_plot(
 
                 score_list = results_dfs_filtered[language].loc[model_id][task]
 
-
-
+                rank_score = 100 * all_rank_scores[task][language][model_id]
+                rank_scores.append(rank_score)
 
                 if np.mean(score_list) < 1:
                     score_list = [100 * score for score in score_list]
 
                 scores.append(np.mean(score_list))
-            if
-            result_list.append(np.mean(
+            if use_rank_score:
+                result_list.append(np.mean(rank_scores))
             else:
                 result_list.append(np.mean(scores))
         results.append(result_list)
@@ -616,10 +621,10 @@ def produce_radial_plot(
         languages_str += " and "
     languages_str += languages[-1].name
 
-    if
-    title = f'
+    if use_rank_score:
+        title = f'Rank Score on {languages_str} Language Tasks'
     else:
-    title = f'
+        title = f'Raw Score on {languages_str} Language Tasks'
 
     # Builds the radial plot from the results
     fig.update_layout(