Commit
·
637c71d
1
Parent(s):
4a4e1c1
fix: Use all datasets from a task, use ranks instead of log_ranks
Browse files
app.py
CHANGED
|
@@ -647,8 +647,8 @@ def produce_radial_plot(
|
|
| 647 |
best_scores = scores
|
| 648 |
ranks.append(rank)
|
| 649 |
|
| 650 |
-
|
| 651 |
-
scores = 1 - (
|
| 652 |
for model_id, score in zip(model_ids_sorted, scores):
|
| 653 |
all_rank_scores[task][language][model_id] = score
|
| 654 |
logger.info("Successfully computed rank scores.")
|
|
@@ -786,15 +786,22 @@ def fetch_results() -> dict[Language, pd.DataFrame]:
|
|
| 786 |
for test_score_dict in raw_results
|
| 787 |
]
|
| 788 |
if dataset.task in data_dict[model_name]:
|
| 789 |
-
data_dict[model_name][dataset.task]
|
| 790 |
else:
|
| 791 |
-
data_dict[model_name][dataset.task] =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 792 |
results_df = pd.DataFrame(data_dict).T.map(
|
| 793 |
lambda lists_or_nan:
|
| 794 |
-
list(it.chain(lists_or_nan))
|
| 795 |
-
if lists_or_nan
|
| 796 |
else lists_or_nan
|
| 797 |
-
).dropna()
|
| 798 |
results_dfs[language] = results_df
|
| 799 |
|
| 800 |
logger.info("Successfully fetched results from ScandEval benchmark.")
|
|
|
|
| 647 |
best_scores = scores
|
| 648 |
ranks.append(rank)
|
| 649 |
|
| 650 |
+
ranks = np.asarray(ranks)
|
| 651 |
+
scores = 1 - (ranks / ranks.max())
|
| 652 |
for model_id, score in zip(model_ids_sorted, scores):
|
| 653 |
all_rank_scores[task][language][model_id] = score
|
| 654 |
logger.info("Successfully computed rank scores.")
|
|
|
|
| 786 |
for test_score_dict in raw_results
|
| 787 |
]
|
| 788 |
if dataset.task in data_dict[model_name]:
|
| 789 |
+
data_dict[model_name][dataset.task][dataset] = scores
|
| 790 |
else:
|
| 791 |
+
data_dict[model_name][dataset.task] = {dataset: scores}
|
| 792 |
+
|
| 793 |
+
# Compute each task's scores as the element-wise mean across that task's datasets
|
| 794 |
+
for model_name, task_dict in data_dict.items():
|
| 795 |
+
for task, dataset_dict in task_dict.items():
|
| 796 |
+
values = np.asarray(list(dataset_dict.values())).mean(axis=0)
|
| 797 |
+
data_dict[model_name][task] = values
|
| 798 |
+
|
| 799 |
results_df = pd.DataFrame(data_dict).T.map(
|
| 800 |
lambda lists_or_nan:
|
| 801 |
+
list(it.chain(*lists_or_nan))
|
| 802 |
+
if isinstance(lists_or_nan, list)
|
| 803 |
else lists_or_nan
|
| 804 |
+
).dropna()
|
| 805 |
results_dfs[language] = results_df
|
| 806 |
|
| 807 |
logger.info("Successfully fetched results from ScandEval benchmark.")
|