Commit
·
c97530c
1
Parent(s):
6bdb37f
feat: Use actual ranks on scale
Browse files
app.py
CHANGED
|
@@ -126,7 +126,7 @@ paper](https://aclanthology.org/2023.nodalida-1.20):
|
|
| 126 |
UPDATE_FREQUENCY_MINUTES = 5
|
| 127 |
MIN_COLOUR_DISTANCE_BETWEEN_MODELS = 200
|
| 128 |
DEFAULT_LANGUAGES = ["Danish"]
|
| 129 |
-
DEFAULT_MODELS = ["gpt-4-0613", "
|
| 130 |
|
| 131 |
|
| 132 |
class Task(BaseModel):
|
|
@@ -633,6 +633,7 @@ def produce_radial_plot(
|
|
| 633 |
for task in tasks:
|
| 634 |
for language in languages:
|
| 635 |
df = results_dfs_filtered[language][task].dropna()
|
|
|
|
| 636 |
model_ids_sorted: list[str] = (
|
| 637 |
df.map(np.mean).sort_values(ascending=False).index.tolist()
|
| 638 |
)
|
|
@@ -649,14 +650,15 @@ def produce_radial_plot(
|
|
| 649 |
a=best_scores, b=scores, alternative="greater"
|
| 650 |
).pvalue < 0.05
|
| 651 |
if worse_than_previous_models:
|
| 652 |
-
|
|
|
|
|
|
|
| 653 |
best_scores = scores
|
| 654 |
ranks.append(rank)
|
| 655 |
|
| 656 |
ranks = np.asarray(ranks)
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
all_rank_scores[task][language][model_id] = score
|
| 660 |
logger.info("Successfully computed rank scores.")
|
| 661 |
|
| 662 |
# Add all the evaluation results for each model
|
|
@@ -671,7 +673,7 @@ def produce_radial_plot(
|
|
| 671 |
if model_id not in results_dfs_filtered[language].index:
|
| 672 |
continue
|
| 673 |
|
| 674 |
-
rank_score =
|
| 675 |
rank_scores.append(rank_score)
|
| 676 |
|
| 677 |
score_arr = np.array(results_dfs_filtered[language].loc[model_id][task])
|
|
@@ -699,7 +701,9 @@ def produce_radial_plot(
|
|
| 699 |
# Sort the models (and their results) such that the model who beats most other
|
| 700 |
# models comes first. This will result in the "smaller areas" being on top of the "larger
|
| 701 |
# areas", which is more aesthetically pleasing.
|
| 702 |
-
sorted_idxs = num_models_beaten.sum(axis=1).argsort()
|
|
|
|
|
|
|
| 703 |
model_ids = np.asarray(model_ids)[sorted_idxs].tolist()
|
| 704 |
results = result_matrix[sorted_idxs].tolist()
|
| 705 |
|
|
@@ -734,7 +738,11 @@ def produce_radial_plot(
|
|
| 734 |
|
| 735 |
# Builds the radial plot from the results
|
| 736 |
fig.update_layout(
|
| 737 |
-
polar=dict(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 738 |
showlegend=True,
|
| 739 |
title=title,
|
| 740 |
width=plot_width,
|
|
|
|
| 126 |
UPDATE_FREQUENCY_MINUTES = 5
|
| 127 |
MIN_COLOUR_DISTANCE_BETWEEN_MODELS = 200
|
| 128 |
DEFAULT_LANGUAGES = ["Danish"]
|
| 129 |
+
DEFAULT_MODELS = ["gpt-4-0613", "google/gemma-3-12b-it"]
|
| 130 |
|
| 131 |
|
| 132 |
class Task(BaseModel):
|
|
|
|
| 633 |
for task in tasks:
|
| 634 |
for language in languages:
|
| 635 |
df = results_dfs_filtered[language][task].dropna()
|
| 636 |
+
stddev = df.map(np.mean).std()
|
| 637 |
model_ids_sorted: list[str] = (
|
| 638 |
df.map(np.mean).sort_values(ascending=False).index.tolist()
|
| 639 |
)
|
|
|
|
| 650 |
a=best_scores, b=scores, alternative="greater"
|
| 651 |
).pvalue < 0.05
|
| 652 |
if worse_than_previous_models:
|
| 653 |
+
difference = np.mean(best_scores) - np.mean(scores)
|
| 654 |
+
normalised_difference = difference / stddev
|
| 655 |
+
rank += normalised_difference
|
| 656 |
best_scores = scores
|
| 657 |
ranks.append(rank)
|
| 658 |
|
| 659 |
ranks = np.asarray(ranks)
|
| 660 |
+
for model_id, rank in zip(model_ids_sorted, ranks):
|
| 661 |
+
all_rank_scores[task][language][model_id] = rank
|
|
|
|
| 662 |
logger.info("Successfully computed rank scores.")
|
| 663 |
|
| 664 |
# Add all the evaluation results for each model
|
|
|
|
| 673 |
if model_id not in results_dfs_filtered[language].index:
|
| 674 |
continue
|
| 675 |
|
| 676 |
+
rank_score = all_rank_scores[task][language][model_id]
|
| 677 |
rank_scores.append(rank_score)
|
| 678 |
|
| 679 |
score_arr = np.array(results_dfs_filtered[language].loc[model_id][task])
|
|
|
|
| 701 |
# Sort the models (and their results) such that the model who beats most other
|
| 702 |
# models comes first. This will result in the "smaller areas" being on top of the "larger
|
| 703 |
# areas", which is more aesthetically pleasing.
|
| 704 |
+
sorted_idxs = num_models_beaten.sum(axis=1).argsort()
|
| 705 |
+
if not use_rank_score:
|
| 706 |
+
sorted_idxs = sorted_idxs[::-1]
|
| 707 |
model_ids = np.asarray(model_ids)[sorted_idxs].tolist()
|
| 708 |
results = result_matrix[sorted_idxs].tolist()
|
| 709 |
|
|
|
|
| 738 |
|
| 739 |
# Builds the radial plot from the results
|
| 740 |
fig.update_layout(
|
| 741 |
+
polar=dict(
|
| 742 |
+
radialaxis=dict(
|
| 743 |
+
visible=show_scale, range=[5, 1] if use_rank_score else [0, 100]
|
| 744 |
+
),
|
| 745 |
+
),
|
| 746 |
showlegend=True,
|
| 747 |
title=title,
|
| 748 |
width=plot_width,
|