Commit · 9fa29df
1 Parent(s): e5acaa3
style: Rename ScandEval to EuroEval
app.py CHANGED
@@ -26,18 +26,18 @@ INTRO_MARKDOWN = """
 
 This demo allows you to generate a radial plot comparing the performance of different
 language models on different tasks. It is based on the generative results from the
-[ScandEval benchmark](https://scandeval.com).
+[EuroEval benchmark](https://euroeval.com).
 """
 
 
 ABOUT_MARKDOWN = """
-## About the ScandEval Benchmark
+## About the EuroEval Benchmark
 
-The [ScandEval benchmark](https://scandeval.com) is used to compare pretrained language
-models on tasks in Danish,
-
+The [EuroEval benchmark](https://euroeval.com) is used to compare pretrained language
+models on tasks in Danish, Dutch, English, Faroese, French, German, Icelandic, Italian,
+Norwegian and Swedish. The benchmark supports both encoder models (such as
 BERT) and generative models (such as GPT), and leaderboards for both kinds [are
-available](https://scandeval.com).
+available](https://euroeval.com).
 
 The generative models are evaluated using in-context learning with few-shot prompts.
 The few-shot examples are sampled randomly from the training split, and we benchmark
@@ -54,10 +54,8 @@ the worst performing models having rank scores close to 0.
 
 ## The Benchmark Datasets
 
-
-
-consists of 7 different tasks, each of which consists of 1-2 datasets. The tasks are
-the following:
+For each language, the benchmark consists of 7 different tasks, each of which consists
+of 1-2 datasets. The tasks are the following:
 
 ### Text Classification
 Given a piece of text, classify it into a number of classes. For this task we extract
@@ -110,7 +108,7 @@ Correlation Coefficient (MCC) as the evaluation metric.
 
 ## Citation
 
-If you use the ScandEval benchmark in your work, please cite [the
+If you use the EuroEval benchmark in your work, please cite [the
 paper](https://aclanthology.org/2023.nodalida-1.20):
 
 ```
@@ -741,16 +739,16 @@ def produce_radial_plot(
 
 
 def fetch_results() -> dict[Language, pd.DataFrame]:
-    """Fetch the results from the ScandEval benchmark.
+    """Fetch the results from the EuroEval benchmark.
 
     Returns:
         A dictionary of languages -> results-dataframes, whose indices are the
         models and columns are the tasks.
     """
-    logger.info("Fetching results from ScandEval benchmark...")
+    logger.info("Fetching results from EuroEval benchmark...")
 
     response = requests.get(
-        "https://raw.githubusercontent.com/ScandEval/leaderboards/refs/heads/main/results/results.jsonl"
+        "https://raw.githubusercontent.com/EuroEval/leaderboards/refs/heads/main/results/results.jsonl"
     )
     response.raise_for_status()
     records = [
@@ -804,7 +802,7 @@ def fetch_results() -> dict[Language, pd.DataFrame]:
         ).dropna()
         results_dfs[language] = results_df
 
-    logger.info("Successfully fetched results from ScandEval benchmark.")
+    logger.info("Successfully fetched results from EuroEval benchmark.")
 
     return results_dfs
 
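The endpoint introduced here serves the leaderboard results as JSON Lines, which `fetch_results` downloads with `requests` before assembling the per-language dataframes. Below is a minimal sketch of consuming that endpoint outside the app, assuming one JSON object per line; the record fields are not visible in this diff, so the resulting dataframe columns are simply whatever the file provides:

```python
import json

import pandas as pd
import requests

# URL added by this commit (the only change is the ScandEval -> EuroEval org rename).
RESULTS_URL = (
    "https://raw.githubusercontent.com/EuroEval/leaderboards/"
    "refs/heads/main/results/results.jsonl"
)


def fetch_records() -> list[dict]:
    """Download the leaderboard results and parse one JSON object per line."""
    response = requests.get(RESULTS_URL)
    response.raise_for_status()
    return [json.loads(line) for line in response.text.splitlines() if line.strip()]


if __name__ == "__main__":
    records = fetch_records()
    # Column names depend entirely on the fields present in the JSONL file.
    df = pd.DataFrame.from_records(records)
    print(df.shape)
```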
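For context, the `produce_radial_plot` function named in the hunk header draws the radial comparison described in the intro: one axis per task, one trace per model, fed from the results-dataframes returned by `fetch_results`. Nothing in this diff shows the app's actual plotting stack, so the following is only an illustrative sketch using Plotly's `Scatterpolar`, with made-up model names and scores:

```python
import plotly.graph_objects as go

# Hypothetical scores purely for illustration; the app computes these from the
# fetched results-dataframes (models as rows, tasks as columns).
tasks = ["Task A", "Task B", "Task C", "Task D"]
scores = {
    "model-a": [0.8, 0.7, 0.6, 0.9],
    "model-b": [0.6, 0.9, 0.7, 0.5],
}

fig = go.Figure()
for model, values in scores.items():
    fig.add_trace(
        go.Scatterpolar(
            r=values + values[:1],  # repeat the first point to close the polygon
            theta=tasks + tasks[:1],
            name=model,
            fill="toself",
        )
    )
fig.update_layout(polar=dict(radialaxis=dict(range=[0, 1])))
fig.show()
```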