Upload folder using huggingface_hub

Changed files:
- app/content.py      +48 -34
- app/draw_diagram.py +16 -12
- app/pages.py        +30 -3
app/content.py
CHANGED

```diff
@@ -7,8 +7,15 @@ asr_datsets = {'LibriSpeech-Test-Clean': 'A clean, high-quality testset of the L
     'Earnings22-Test'        : 'Similar to Earnings21, but covering earnings calls from 2022.',
     'Tedlium3-Test'          : 'A test set derived from TED talks, covering diverse speakers and topics.',
     'Tedlium3-Long-form-Test': 'A longer version of the TED-LIUM dataset, containing extended audio samples. This poses challenges to existing fusion methods in handling long audios. However, it provides benchmark for future development.',
+    }
+
+singlish_asr_datasets = {
     'IMDA-Part1-ASR-Test'    : 'Speech recognition test data from the IMDA NSC project, Part 1.',
-    'IMDA-Part2-ASR-Test'    : 'Speech recognition test data from the IMDA NSC project, Part 2.',
+    'IMDA-Part2-ASR-Test'    : 'Speech recognition test data from the IMDA NSC project, Part 2.',
+    'IMDA-Part3-30s-ASR-Test': 'Speech recognition test data from the IMDA NSC project, Part 3.',
+    'IMDA-Part4-30s-ASR-Test': 'Speech recognition test data from the IMDA NSC project, Part 4.',
+    'IMDA-Part5-30s-ASR-Test': 'Speech recognition test data from the IMDA NSC project, Part 5.',
+    'IMDA-Part6-30s-ASR-Test': 'Speech recognition test data from the IMDA NSC project, Part 6.'
     }
 
 sqa_datasets = {'CN-College-Listen-MCQ-Test': 'Chinese College English Listening Test, with multiple-choice questions.',
@@ -78,39 +85,46 @@ metrics_info = {
     }
 
 dataname_column_rename_in_table = {
-    'librispeech_test_clean': 'LibriSpeech-Clean',
-    'librispeech_test_other': 'LibriSpeech-Other',
-    'common_lvoice_15_en_test': 'CommonVoice-15-EN',
-    'peoples_speech_test': 'Peoples-Speech',
-    'gigaspeech_test': 'GigaSpeech-1',
-    'earnings21_test': 'Earnings-21',
-    'earnings22_test': 'Earnings-22',
-    'tedlium3_test': 'TED-LIUM-3',
-    'tedlium3_long_form_test': 'TED-LIUM-3-Long',
-    'aishell_asr_zh_test': 'Aishell-ASR-ZH',
-    'covost2_en_id_test': 'Covost2-EN-ID',
-    'covost2_en_zh_test': 'Covost2-EN-ZH',
-    'covost2_en_ta_test': 'Covost2-EN-TA',
-    'covost2_id_en_test': 'Covost2-ID-EN',
-    'covost2_zh_en_test': 'Covost2-ZH-EN',
-    'covost2_ta_en_test': 'Covost2-TA-EN',
+    'librispeech_test_clean'    : 'LibriSpeech-Clean',
+    'librispeech_test_other'    : 'LibriSpeech-Other',
+    'common_lvoice_15_en_test'  : 'CommonVoice-15-EN',
+    'peoples_speech_test'       : 'Peoples-Speech',
+    'gigaspeech_test'           : 'GigaSpeech-1',
+    'earnings21_test'           : 'Earnings-21',
+    'earnings22_test'           : 'Earnings-22',
+    'tedlium3_test'             : 'TED-LIUM-3',
+    'tedlium3_long_form_test'   : 'TED-LIUM-3-Long',
+    'aishell_asr_zh_test'       : 'Aishell-ASR-ZH',
+    'covost2_en_id_test'        : 'Covost2-EN-ID',
+    'covost2_en_zh_test'        : 'Covost2-EN-ZH',
+    'covost2_en_ta_test'        : 'Covost2-EN-TA',
+    'covost2_id_en_test'        : 'Covost2-ID-EN',
+    'covost2_zh_en_test'        : 'Covost2-ZH-EN',
+    'covost2_ta_en_test'        : 'Covost2-TA-EN',
     'cn_college_listen_mcq_test': 'CN-College-Listen-MCQ',
-    'dream_tts_mcq_test': 'DREAM-TTS-MCQ',
-    'slue_p2_sqa5_test': 'SLUE-P2-SQA5',
-    'public_sg_speech_qa_test': 'Public-SG-Speech-QA',
-    'spoken_squad_test': 'Spoken-SQuAD',
-    'openhermes_audio_test': 'OpenHermes-Audio',
-    'alpaca_audio_test': 'ALPACA-Audio',
-    'wavcaps_test': 'WavCaps',
-    'audiocaps_test': 'AudioCaps',
-    'clotho_aqa_test': 'Clotho-AQA',
-    'wavcaps_qa_test': 'WavCaps-QA',
-    'audiocaps_qa_test': 'AudioCaps-QA',
-    'voxceleb_accent_test': 'VoxCeleb-Accent',
-    'voxceleb_gender_test': 'VoxCeleb-Gender',
-    'iemocap_gender_test': 'IEMOCAP-Gender',
-    'iemocap_emotion_test': 'IEMOCAP-Emotion',
-    'meld_sentiment_test': 'MELD-Sentiment',
-    'meld_emotion_test': 'MELD-Emotion',
+    'dream_tts_mcq_test'        : 'DREAM-TTS-MCQ',
+    'slue_p2_sqa5_test'         : 'SLUE-P2-SQA5',
+    'public_sg_speech_qa_test'  : 'Public-SG-Speech-QA',
+    'spoken_squad_test'         : 'Spoken-SQuAD',
+    'openhermes_audio_test'     : 'OpenHermes-Audio',
+    'alpaca_audio_test'         : 'ALPACA-Audio',
+    'wavcaps_test'              : 'WavCaps',
+    'audiocaps_test'            : 'AudioCaps',
+    'clotho_aqa_test'           : 'Clotho-AQA',
+    'wavcaps_qa_test'           : 'WavCaps-QA',
+    'audiocaps_qa_test'         : 'AudioCaps-QA',
+    'voxceleb_accent_test'      : 'VoxCeleb-Accent',
+    'voxceleb_gender_test'      : 'VoxCeleb-Gender',
+    'iemocap_gender_test'       : 'IEMOCAP-Gender',
+    'iemocap_emotion_test'      : 'IEMOCAP-Emotion',
+    'meld_sentiment_test'       : 'MELD-Sentiment',
+    'meld_emotion_test'         : 'MELD-Emotion',
+    'imda_part1_asr_test'       : 'IMDA-Part1-ASR',
+    'imda_part2_asr_test'       : 'IMDA-Part2-ASR',
+    'imda_part3_30s_asr_test'   : 'IMDA-Part3-30s-ASR',
+    'imda_part4_30s_asr_test'   : 'IMDA-Part4-30s-ASR',
+    'imda_part5_30s_asr_test'   : 'IMDA-Part5-30s-ASR',
+    'imda_part6_30s_asr_test'   : 'IMDA-Part6-30s-ASR',
+
 
     }
```
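For context (not part of this commit): `dataname_column_rename_in_table` maps the snake_case result keys to the display names shown in the leaderboard. A minimal sketch of how such a map is typically applied with pandas; the DataFrame, the scores, and the two-entry subset are illustrative, not taken from the repo:

```python
import pandas as pd

# Illustrative two-entry subset of the rename map above (hypothetical data).
dataname_column_rename_in_table = {
    'imda_part1_asr_test': 'IMDA-Part1-ASR',
    'imda_part2_asr_test': 'IMDA-Part2-ASR',
}

# Raw results keyed by the snake_case dataset names.
raw = pd.DataFrame({
    'model': ['baseline'],
    'imda_part1_asr_test': [0.12],
    'imda_part2_asr_test': [0.15],
})

# DataFrame.rename ignores mapping keys that are absent, so a partial map is safe.
table = raw.rename(columns=dataname_column_rename_in_table)
print(table.columns.tolist())  # ['model', 'IMDA-Part1-ASR', 'IMDA-Part2-ASR']
```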
app/draw_diagram.py
CHANGED

```diff
@@ -90,18 +90,22 @@ def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
         return df_style
 
     if cur_dataset_name in [
-        (12 quoted dataset names, truncated in the rendered diff)
+        'LibriSpeech-Clean',
+        'LibriSpeech-Other',
+        'CommonVoice-15-EN',
+        'Peoples-Speech',
+        'GigaSpeech-1',
+        'Earnings-21',
+        'Earnings-22',
+        'TED-LIUM-3',
+        'TED-LIUM-3-Long',
+        'Aishell-ASR-ZH',
+        'IMDA-Part1-ASR',
+        'IMDA-Part2-ASR',
+        'IMDA-Part3-30s-ASR',
+        'IMDA-Part4-30s-ASR',
+        'IMDA-Part5-30s-ASR',
+        'IMDA-Part6-30s-ASR',
         ]:
 
         chart_data_table = chart_data_table.sort_values(
```
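The `sort_values` call is cut off by the diff view, so its arguments are not visible. A hedged sketch of the surrounding pattern, assuming this branch ranks ASR tables ascending because WER is a lower-is-better metric; the column names and data are hypothetical:

```python
import pandas as pd

# Hypothetical leaderboard slice; 'wer' stands in for the metric column.
chart_data_table = pd.DataFrame({'Model': ['a', 'b'], 'wer': [12.3, 9.8]})

cur_dataset_name = 'IMDA-Part1-ASR'
if cur_dataset_name in ['LibriSpeech-Clean', 'IMDA-Part1-ASR']:  # subset of the list above
    # Lower WER is better, so rank ascending (assumption; the real call is truncated).
    chart_data_table = chart_data_table.sort_values(by='wer', ascending=True)
print(chart_data_table)
```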
app/pages.py
CHANGED

```diff
@@ -40,8 +40,8 @@ def dashboard():
     audio_url = "https://arxiv.org/abs/2406.16020"
 
     st.markdown("#### News")
-    st.markdown("**Dec, 2024**: Update layout and support comparison between models with similar model sizes. Layout reorganized for better user experience. Add performance summary for each task.")
-    st.markdown("**…  (rest of line truncated in the rendered diff)
+    st.markdown("**Dec 11, 2024**: Update layout and support comparison between models with similar model sizes. Layout reorganized for better user experience. Add performance summary for each task.")
+    st.markdown("**Aug, 2024**: Initial leaderboard online.")
 
     st.divider()
 
@@ -56,7 +56,6 @@ def dashboard():
     with center_co:
         st.image("./style/audio_overview.png",
                  caption="Overview of the datasets in AudioBench.",
-                 # use_container_width = True
                  )
 
     st.markdown('''
@@ -116,6 +115,34 @@ def asr():
             draw('su', 'ASR', filter_1, 'wer', cus_sort=True)
 
 
+def singlish_asr():
+    st.title("Task: Automatic Speech Recognition - Singlish")
+
+    sum = ['Overall']
+    dataset_lists = [
+        'IMDA-Part1-ASR-Test',
+        'IMDA-Part2-ASR-Test',
+        'IMDA-Part3-30s-ASR-Test',
+        'IMDA-Part4-30s-ASR-Test',
+        'IMDA-Part5-30s-ASR-Test',
+        'IMDA-Part6-30s-ASR-Test',
+        ]
+
+    filters_levelone = sum + dataset_lists
+
+    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
+
+    with left:
+        filter_1 = st.selectbox('Dataset', filters_levelone)
+
+    if filter_1:
+        if filter_1 in sum:
+            sum_table_mulit_metrix('singlish_asr', ['wer'])
+        else:
+            dataset_contents(singlish_asr_datasets[filter_1], metrics['wer'])
+            draw('su', 'singlish_asr', filter_1, 'wer')
+
+
 def cnasr():
     st.title("Task: Automatic Speech Recognition - Mandarin")
 
```
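The commit adds `singlish_asr()` but does not show where it is invoked. A hypothetical wiring sketch, assuming the common Streamlit pattern of a sidebar selectbox dispatching to the page functions in `app/pages.py`; the labels and the entry-point file are assumptions, not part of this commit:

```python
import streamlit as st

# Page functions from app/pages.py; singlish_asr is the one added in this commit.
from app.pages import asr, cnasr, singlish_asr

PAGES = {
    'ASR - English': asr,
    'ASR - Singlish': singlish_asr,
    'ASR - Mandarin': cnasr,
}

choice = st.sidebar.selectbox('Task', list(PAGES))
PAGES[choice]()  # render the selected page
```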