AudioBench-Leaderboard-Extend

Running

binwang committed on Dec 23, 2024

Commit

fa6ba7b

verified ·

1 Parent(s): 81a8ab0

Upload folder using huggingface_hub

Files changed (3) hide show

app/content.py CHANGED Viewed

@@ -68,6 +68,10 @@ cnasr_datasets = {
     'Aishell-ASR-ZH-Test': 'ASR test dataset for Mandarin Chinese, based on the Aishell dataset.'
 }
 metrics = {
     'wer': 'Word Error Rate (WER), a common metric for ASR evaluation. (The lower, the better)',
     'llama3_70b_judge_binary': 'Binary evaluation using the LLAMA3-70B model, for tasks requiring a binary outcome. (0-100 based on score 0-1)',
@@ -84,6 +88,7 @@ metrics_info = {
     'bleu': 'BLEU Score. The higher, the better.',
 }
 dataname_column_rename_in_table = {
     'librispeech_test_clean'    : 'LibriSpeech-Clean',
     'librispeech_test_other'    : 'LibriSpeech-Other',
@@ -126,5 +131,7 @@ dataname_column_rename_in_table = {
     'imda_part5_30s_asr_test'   : 'IMDA-Part5-30s-ASR',
     'imda_part6_30s_asr_test'   : 'IMDA-Part6-30s-ASR',
 }

     'Aishell-ASR-ZH-Test': 'ASR test dataset for Mandarin Chinese, based on the Aishell dataset.'
 }
+MUSIC_MCQ_DATASETS = {
+    'MuChoMusic-Test': 'Test dataset for music understanding, from paper: MuChoMusic: Evaluating Music Understanding in Multimodal Audio-Language Models.'
+}
 metrics = {
     'wer': 'Word Error Rate (WER), a common metric for ASR evaluation. (The lower, the better)',
     'llama3_70b_judge_binary': 'Binary evaluation using the LLAMA3-70B model, for tasks requiring a binary outcome. (0-100 based on score 0-1)',
     'bleu': 'BLEU Score. The higher, the better.',
 }
 dataname_column_rename_in_table = {
     'librispeech_test_clean'    : 'LibriSpeech-Clean',
     'librispeech_test_other'    : 'LibriSpeech-Other',
     'imda_part5_30s_asr_test'   : 'IMDA-Part5-30s-ASR',
     'imda_part6_30s_asr_test'   : 'IMDA-Part6-30s-ASR',
+    'muchomusic_test'          : 'MuChoMusic'
 }

app/pages.py CHANGED Viewed

@@ -373,8 +373,37 @@ def spt():
     if filter_1:
         if filter_1 in sum:
-            sum_table_mulit_metrix('ST', ['bleu'])
         else:
             dataset_contents(spt_datasets[filter_1], metrics['bleu'])
             draw('su', 'ST', filter_1, 'bleu')

     if filter_1:
         if filter_1 in sum:
+            sum_table_mulit_metrix('st', ['bleu'])
         else:
             dataset_contents(spt_datasets[filter_1], metrics['bleu'])
             draw('su', 'ST', filter_1, 'bleu')
+def music_mcq():
+    st.title("Task: Music Understanding - MCQ Questions")
+    sum = ['Overall']
+    dataset_lists =  ['MuChoMusic-Test',
+                      ]
+    filters_levelone = sum + dataset_lists
+    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
+    with left:
+        filter_1 = st.selectbox('Dataset', filters_levelone)
+    if filter_1:
+        if filter_1 in sum:
+            sum_table_mulit_metrix('music_mcq', ['llama3_70b_judge_binary'])
+        else:
+            dataset_contents(MUSIC_MCQ_DATASETS[filter_1], metrics['llama3_70b_judge_binary'])
+            draw('vu', 'music_mcq', filter_1, 'llama3_70b_judge_binary')

app/summarization.py CHANGED Viewed

@@ -21,7 +21,7 @@ def sum_table_mulit_metrix(task_name, metrics_lists: List[str]):
     # combine chart data from multiple sources
     chart_data = pd.DataFrame()
     for metrics in metrics_lists:
-        folder = f"./results/{metrics}/"
         data_path = f'{folder}/{task_name.lower()}.csv'
         one_chart_data = pd.read_csv(data_path).round(3)
         if len(chart_data) == 0:

     # combine chart data from multiple sources
     chart_data = pd.DataFrame()
     for metrics in metrics_lists:
+        folder = f"./results/{metrics}"
         data_path = f'{folder}/{task_name.lower()}.csv'
         one_chart_data = pd.read_csv(data_path).round(3)
         if len(chart_data) == 0: