j-tobias committed · Commit 234fe59 · Parent(s): 8cfce12

added new dataset + time measurement

Files changed:
- app.py +1 -1
- processing.py +55 -50
app.py CHANGED

```diff
@@ -26,7 +26,7 @@ login(hf_token)
 
 # GENERAL OPTIONS FOR MODELS AND DATASETS
 MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h","openai/whisper-large-v2"]
-DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "OWN Recoding/Sample"]
+DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "Librispeech ASR other", "OWN Recoding/Sample"]
 
 # HELPER FUNCTIONS
 def get_card(selected_model:str)->str:
```
processing.py CHANGED

```diff
@@ -9,6 +9,7 @@ import librosa
 import torch
 import numpy as np
 import pandas as pd
+import time
 
 N_SAMPLES = 30
 
```
```diff
@@ -25,13 +26,13 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
 
     if data_subset == "Common Voice":
         dataset, text_column = load_Common_Voice()
-    elif data_subset == "VoxPopuli":
-        dataset, text_column = load_Vox_Populi()
     elif data_subset == "Librispeech ASR clean":
         dataset, text_column = load_Librispeech_ASR_clean()
+    elif data_subset == "Librispeech ASR other":
+        dataset, text_column = load_Librispeech_ASR_other()
     elif data_subset == "OWN Recoding/Sample":
         sr, audio = own_audio
-        audio = audio.astype(np.float32)
+        audio = audio.astype(np.float32)
         print("AUDIO: ", type(audio), audio)
         audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
     else:
```
```diff
@@ -47,12 +48,24 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
 
     if data_subset == "OWN Recoding/Sample":
         sample = {"audio":{"array":audio,"sampling_rate":16000}}
+        inference_times1 = []
+        inference_times2 = []
+
+        time_start = time.time()
         transcription1 = model_compute(model1, processor1, sample, model_1)
+        time_stop = time.time()
+        duration = time_stop - time_start
+        inference_times1.append(duration)
+
+        time_start = time.time()
         transcription2 = model_compute(model2, processor2, sample, model_2)
+        time_stop = time.time()
+        duration = time_stop - time_start
+        inference_times2.append(duration)
 
         transcriptions1 = [transcription1]
         transcriptions2 = [transcription2]
-        references = [own_transcription]
+        references = [own_transcription.lower()]
 
         wer1 = round(N_SAMPLES * compute_wer(references, transcriptions1), 2)
         wer2 = round(N_SAMPLES * compute_wer(references, transcriptions2), 2)
```
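The instrumentation above is plain wall-clock timing: `time.time()` before and after each `model_compute` call, with the difference pushed onto a per-model list. A minimal standalone sketch of the same pattern — `timed` is a hypothetical helper, not part of this commit — shows how the repeated start/stop bookkeeping could be factored into a context manager:

```python
import time
from contextlib import contextmanager

@contextmanager
def timed(store: list):
    """Append the elapsed wall-clock seconds of the with-block to `store`."""
    start = time.time()
    try:
        yield
    finally:
        store.append(time.time() - start)

inference_times1 = []
with timed(inference_times1):
    _ = sum(range(1_000_000))  # stand-in for model_compute(model1, processor1, sample, model_1)

print(f"Avg. Inference Duration: {round(sum(inference_times1) / len(inference_times1), 4)}s")
```

For interval measurement, `time.perf_counter()` is the more precise monotonic clock; `time.time()` is adequate at the second scale reported here.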
```diff
@@ -60,9 +73,11 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
         results_md = f"""
 #### {model_1}
 - WER Score: {wer1}
+- Avg. Inference Duration: {round(sum(inference_times1)/len(inference_times1), 4)}s
 
 #### {model_2}
-- WER Score: {wer2}"""
+- WER Score: {wer2}
+- Avg. Inference Duration: {round(sum(inference_times2)/len(inference_times2), 4)}s"""
 
         # Create the bar plot
         fig = go.Figure(
```
```diff
@@ -89,6 +104,8 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
         transcriptions2 = []
         WER1s = []
         WER2s = []
+        inference_times1 = []
+        inference_times2 = []
 
 
         counter = 0
```
```diff
@@ -99,34 +116,51 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
             references.append(sample[text_column])
 
             if model_1 == model_2:
+                time_start = time.time()
                 transcription = model_compute(model1, processor1, sample, model_1)
-
+                time_stop = time.time()
+                duration = time_stop - time_start
+                inference_times1.append(duration)
+                inference_times2.append(duration)
                 transcriptions1.append(transcription)
                 transcriptions2.append(transcription)
             else:
+                time_start = time.time()
                 transcription1 = model_compute(model1, processor1, sample, model_1)
-                transcription2 = model_compute(model2, processor2, sample, model_2)
+                time_stop = time.time()
+                duration = time_stop - time_start
+                inference_times1.append(duration)
                 transcriptions1.append(transcription1)
+
+                time_start = time.time()
+                transcription2 = model_compute(model2, processor2, sample, model_2)
+                time_stop = time.time()
+                duration = time_stop - time_start
+                inference_times2.append(duration)
                 transcriptions2.append(transcription2)
 
-            WER1s.append(compute_wer([sample[text_column]], [transcription1]))
-            WER2s.append(compute_wer([sample[text_column]], [transcription2]))
+            WER1s.append(round(compute_wer([sample[text_column]], [transcription1]),4))
+            WER2s.append(round(compute_wer([sample[text_column]], [transcription2]),4))
+            wer1 = round(sum(WER1s)/len(WER1s), 4)
+            wer2 = round(sum(WER2s)/len(WER2s), 4)
 
 
             results_md = f"""
 {i}/{len(dataset)}-{'#'*i}{'_'*(N_SAMPLES-i)}
 
 #### {model_1}
-- WER Score: {
+- WER Score: {wer1}
+- Avg. Inference Duration: {round(sum(inference_times1)/len(inference_times1), 4)}s
 
 #### {model_2}
-- WER Score: {
+- WER Score: {wer2}
+- Avg. Inference Duration: {round(sum(inference_times2)/len(inference_times2), 4)}s"""
 
             # Create the bar plot
             fig = go.Figure(
                 data=[
-                    go.Bar(x=[f"{model_1}"], y=[
+                    go.Bar(x=[f"{model_1}"], y=[wer1], showlegend=False),
-                    go.Bar(x=[f"{model_2}"], y=[
+                    go.Bar(x=[f"{model_2}"], y=[wer2], showlegend=False),
                 ]
             )
 
```
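The loop now rounds each per-sample WER to four decimals and recomputes the running averages `wer1`/`wer2` on every iteration, so the yielded progress report always reflects the mean over the samples processed so far. A self-contained illustration of that aggregation, assuming a jiwer-style metric in place of the app's own `compute_wer` (which is defined elsewhere in processing.py):

```python
# Assumption: jiwer's wer() stands in for the app's compute_wer helper;
# both map (references, hypotheses) to a float error rate.
from jiwer import wer as compute_wer

WER1s = []
samples = [("the cat sat", "the cat sat"), ("hello world", "hello word")]
for reference, transcription1 in samples:
    WER1s.append(round(compute_wer([reference], [transcription1]), 4))
    wer1 = round(sum(WER1s) / len(WER1s), 4)  # running mean, recomputed per sample
print(WER1s, wer1)  # [0.0, 0.5] 0.25
```

Averaging per-utterance scores weights every sample equally regardless of length, which can differ slightly from a corpus-level WER pooled over all reference words.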
```diff
@@ -138,7 +172,7 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
                 barmode="group",
             )
 
-            df = pd.DataFrame({"references":references, "
+            df = pd.DataFrame({"references":references, f"{model_1}":transcriptions1,"WER 1":WER1s,f"{model_2}":transcriptions2,"WER 2":WER2s})
 
             yield results_md, fig, df
 
```
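The results table now interleaves each model's transcriptions with its per-sample WER column, using the model names themselves as column labels. A hypothetical single-row example of the table's shape (values are made up; the real columns are filled from the loop above):

```python
import pandas as pd

# Illustrative values only, mirroring the column layout built in the diff.
df = pd.DataFrame({
    "references": ["the cat sat"],
    "openai/whisper-tiny.en": ["the cat sat"],       # transcriptions1
    "WER 1": [0.0],
    "facebook/wav2vec2-base-960h": ["the cat sad"],  # transcriptions2
    "WER 2": [0.3333],
})
print(df)
```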
```diff
@@ -156,32 +190,19 @@ def load_Common_Voice():
         sample["text"] = sample["text"].lower()
     return dataset, text_column
 
-def load_Vox_Populi():
-
-    dataset = load_dataset("facebook/voxpopuli", "en", split="test", streaming=True, trust_remote_code=True)
-
-    # Optionally, preview the first item to understand the structure (can be removed in production)
+def load_Librispeech_ASR_clean():
+    dataset = load_dataset("librispeech_asr", "clean", split="test", streaming=True, token=True, trust_remote_code=True)
     print(next(iter(dataset)))
-
-    # Take the first 120 examples to work with
-    dataset = dataset.take(N_SAMPLES+20)
-    text_column = "normalized_text"
-
-    # Filter out samples with empty or unwanted 'normalized_text' values and invalid audio
-    dataset = dataset.filter(lambda x: is_valid_sample(x[text_column], x['audio']))
-
-    # Take the first 100 examples after filtering
+    text_column = "text"
     dataset = dataset.take(N_SAMPLES)
-
-    # Cast the 'audio' column to the desired sampling rate
     dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
-
-    # Convert to list and return
     dataset = list(dataset)
+    for sample in dataset:
+        sample["text"] = sample["text"].lower()
     return dataset, text_column
 
-def load_Librispeech_ASR_clean():
-    dataset = load_dataset("librispeech_asr", "
+def load_Librispeech_ASR_other():
+    dataset = load_dataset("librispeech_asr", "other", split="test", streaming=True, token=True, trust_remote_code=True)
     print(next(iter(dataset)))
     text_column = "text"
     dataset = dataset.take(N_SAMPLES)
```
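The two LibriSpeech loaders differ only in the configuration name (`"clean"` vs. `"other"`); both stream the test split, keep the first `N_SAMPLES` examples, resample audio to 16 kHz, and lowercase the references. A condensed sketch of that shared recipe — `load_librispeech` is a hypothetical generalization, not part of the commit, and the commit's `token=True` / `trust_remote_code=True` arguments are omitted here:

```python
from datasets import load_dataset, Audio

N_SAMPLES = 30

def load_librispeech(config: str):
    # Streaming avoids downloading the full split; only N_SAMPLES examples are pulled.
    dataset = load_dataset("librispeech_asr", config, split="test", streaming=True)
    text_column = "text"
    dataset = dataset.take(N_SAMPLES)
    # Decode audio at the 16 kHz sampling rate the ASR models expect.
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
    dataset = list(dataset)  # materialize the streamed examples
    for sample in dataset:
        sample["text"] = sample["text"].lower()
    return dataset, text_column

dataset, text_column = load_librispeech("other")  # or "clean"
```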
```diff
@@ -191,22 +212,6 @@ def load_Librispeech_ASR_clean():
         sample["text"] = sample["text"].lower()
     return dataset, text_column
 
-def is_valid_sample(text, audio):
-    # Check if 'normalized_text' is valid
-    text = text.strip()
-    if text == "" or text == "ignore time segment in scoring":
-        return False
-
-    # Check if the 'audio' array is valid (not empty and meets length criteria)
-    if len(audio['array']) == 0: # Audio is empty
-        return False
-
-    # Optionally, check if the audio duration is within a certain range
-    duration = audio['array'].size / audio['sampling_rate']
-    if duration < 1.0 or duration > 60.0: # Example: Filter out audio shorter than 1 second or longer than 60 seconds
-        return False
-
-    return True
 
 
 # MODEL LOADERS
```