j-tobias committed · Commit 234fe59 · Parent(s): 8cfce12

added new dataset + time measurement

Files changed:
- app.py +1 -1
- processing.py +55 -50
app.py CHANGED

```diff
@@ -26,7 +26,7 @@ login(hf_token)
 
 # GENERAL OPTIONS FOR MODELS AND DATASETS
 MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h","openai/whisper-large-v2"]
-DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "OWN Recoding/Sample"]
+DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "Librispeech ASR other", "OWN Recoding/Sample"]
 
 # HELPER FUNCTIONS
 def get_card(selected_model:str)->str:
```
processing.py CHANGED

```diff
@@ -9,6 +9,7 @@ import librosa
 import torch
 import numpy as np
 import pandas as pd
+import time
 
 N_SAMPLES = 30
 
```
```diff
@@ -25,13 +26,13 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
 
     if data_subset == "Common Voice":
         dataset, text_column = load_Common_Voice()
-    elif data_subset == "VoxPopuli":
-        dataset, text_column = load_Vox_Populi()
     elif data_subset == "Librispeech ASR clean":
         dataset, text_column = load_Librispeech_ASR_clean()
+    elif data_subset == "Librispeech ASR other":
+        dataset, text_column = load_Librispeech_ASR_other()
     elif data_subset == "OWN Recoding/Sample":
         sr, audio = own_audio
-        audio = audio.astype(np.float32)
+        audio = audio.astype(np.float32)
         print("AUDIO: ", type(audio), audio)
         audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
     else:
```
```diff
@@ -47,12 +48,24 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
 
     if data_subset == "OWN Recoding/Sample":
         sample = {"audio":{"array":audio,"sampling_rate":16000}}
+        inference_times1 = []
+        inference_times2 = []
+
+        time_start = time.time()
         transcription1 = model_compute(model1, processor1, sample, model_1)
+        time_stop = time.time()
+        duration = time_stop - time_start
+        inference_times1.append(duration)
+
+        time_start = time.time()
         transcription2 = model_compute(model2, processor2, sample, model_2)
+        time_stop = time.time()
+        duration = time_stop - time_start
+        inference_times2.append(duration)
 
         transcriptions1 = [transcription1]
         transcriptions2 = [transcription2]
-        references = [own_transcription]
+        references = [own_transcription.lower()]
 
         wer1 = round(N_SAMPLES * compute_wer(references, transcriptions1), 2)
         wer2 = round(N_SAMPLES * compute_wer(references, transcriptions2), 2)
```
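The instrumentation above is plain wall-clock timing: `time.time()` before and after each `model_compute` call, with the difference pushed onto a per-model list. A minimal standalone sketch of the same pattern — `timed` is a hypothetical helper, not part of this commit — shows how the repeated start/stop bookkeeping could be factored into a context manager:

```python
import time
from contextlib import contextmanager

@contextmanager
def timed(store: list):
    """Append the elapsed wall-clock seconds of the with-block to `store`."""
    start = time.time()
    try:
        yield
    finally:
        store.append(time.time() - start)

inference_times1 = []
with timed(inference_times1):
    _ = sum(range(1_000_000))  # stand-in for model_compute(model1, processor1, sample, model_1)

print(f"Avg. Inference Duration: {round(sum(inference_times1) / len(inference_times1), 4)}s")
```

For interval measurement, `time.perf_counter()` is the more precise monotonic clock; `time.time()` is adequate at the second scale reported here.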
```diff
@@ -60,9 +73,11 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
         results_md = f"""
 #### {model_1}
 - WER Score: {wer1}
+- Avg. Inference Duration: {round(sum(inference_times1)/len(inference_times1), 4)}s
 
 #### {model_2}
-- WER Score: {wer2}"""
+- WER Score: {wer2}
+- Avg. Inference Duration: {round(sum(inference_times2)/len(inference_times2), 4)}s"""
 
         # Create the bar plot
         fig = go.Figure(
```
```diff
@@ -89,6 +104,8 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
         transcriptions2 = []
         WER1s = []
         WER2s = []
+        inference_times1 = []
+        inference_times2 = []
 
 
         counter = 0
```
```diff
@@ -99,34 +116,51 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
             references.append(sample[text_column])
 
             if model_1 == model_2:
+                time_start = time.time()
                 transcription = model_compute(model1, processor1, sample, model_1)
-
+                time_stop = time.time()
+                duration = time_stop - time_start
+                inference_times1.append(duration)
+                inference_times2.append(duration)
                 transcriptions1.append(transcription)
                 transcriptions2.append(transcription)
             else:
+                time_start = time.time()
                 transcription1 = model_compute(model1, processor1, sample, model_1)
-                transcription2 = model_compute(model2, processor2, sample, model_2)
+                time_stop = time.time()
+                duration = time_stop - time_start
+                inference_times1.append(duration)
                 transcriptions1.append(transcription1)
+
+                time_start = time.time()
+                transcription2 = model_compute(model2, processor2, sample, model_2)
+                time_stop = time.time()
+                duration = time_stop - time_start
+                inference_times2.append(duration)
                 transcriptions2.append(transcription2)
 
-            WER1s.append(compute_wer([sample[text_column]], [transcription1]))
-            WER2s.append(compute_wer([sample[text_column]], [transcription2]))
+            WER1s.append(round(compute_wer([sample[text_column]], [transcription1]),4))
+            WER2s.append(round(compute_wer([sample[text_column]], [transcription2]),4))
+            wer1 = round(sum(WER1s)/len(WER1s), 4)
+            wer2 = round(sum(WER2s)/len(WER2s), 4)
 
 
             results_md = f"""
 {i}/{len(dataset)}-{'#'*i}{'_'*(N_SAMPLES-i)}
 
 #### {model_1}
-- WER Score: {
+- WER Score: {wer1}
+- Avg. Inference Duration: {round(sum(inference_times1)/len(inference_times1), 4)}s
 
 #### {model_2}
-- WER Score: {
+- WER Score: {wer2}
+- Avg. Inference Duration: {round(sum(inference_times2)/len(inference_times2), 4)}s"""
 
             # Create the bar plot
             fig = go.Figure(
                 data=[
-                    go.Bar(x=[f"{model_1}"], y=[
+                    go.Bar(x=[f"{model_1}"], y=[wer1], showlegend=False),
-                    go.Bar(x=[f"{model_2}"], y=[
+                    go.Bar(x=[f"{model_2}"], y=[wer2], showlegend=False),
                 ]
             )
 
```
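The loop now rounds each per-sample WER to four decimals and recomputes the running averages `wer1`/`wer2` on every iteration, so the yielded progress report always reflects the mean over the samples processed so far. A self-contained illustration of that aggregation, assuming a jiwer-style metric in place of the app's own `compute_wer` (which is defined elsewhere in processing.py):

```python
# Assumption: jiwer's wer() stands in for the app's compute_wer helper;
# both map (references, hypotheses) to a float error rate.
from jiwer import wer as compute_wer

WER1s = []
samples = [("the cat sat", "the cat sat"), ("hello world", "hello word")]
for reference, transcription1 in samples:
    WER1s.append(round(compute_wer([reference], [transcription1]), 4))
    wer1 = round(sum(WER1s) / len(WER1s), 4)  # running mean, recomputed per sample
print(WER1s, wer1)  # [0.0, 0.5] 0.25
```

Averaging per-utterance scores weights every sample equally regardless of length, which can differ slightly from a corpus-level WER pooled over all reference words.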
```diff
@@ -138,7 +172,7 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
                 barmode="group",
             )
 
-            df = pd.DataFrame({"references":references, "
+            df = pd.DataFrame({"references":references, f"{model_1}":transcriptions1,"WER 1":WER1s,f"{model_2}":transcriptions2,"WER 2":WER2s})
 
             yield results_md, fig, df
 
```
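The results table now interleaves each model's transcriptions with its per-sample WER column, using the model names themselves as column labels. A hypothetical single-row example of the table's shape (values are made up; the real columns are filled from the loop above):

```python
import pandas as pd

# Illustrative values only, mirroring the column layout built in the diff.
df = pd.DataFrame({
    "references": ["the cat sat"],
    "openai/whisper-tiny.en": ["the cat sat"],       # transcriptions1
    "WER 1": [0.0],
    "facebook/wav2vec2-base-960h": ["the cat sad"],  # transcriptions2
    "WER 2": [0.3333],
})
print(df)
```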
```diff
@@ -156,32 +190,19 @@ def load_Common_Voice():
         sample["text"] = sample["text"].lower()
     return dataset, text_column
 
-def load_Vox_Populi():
-
-    dataset = load_dataset("facebook/voxpopuli", "en", split="test", streaming=True, trust_remote_code=True)
-
-    # Optionally, preview the first item to understand the structure (can be removed in production)
+def load_Librispeech_ASR_clean():
+    dataset = load_dataset("librispeech_asr", "clean", split="test", streaming=True, token=True, trust_remote_code=True)
     print(next(iter(dataset)))
-
-    # Take the first 120 examples to work with
-    dataset = dataset.take(N_SAMPLES+20)
-    text_column = "normalized_text"
-
-    # Filter out samples with empty or unwanted 'normalized_text' values and invalid audio
-    dataset = dataset.filter(lambda x: is_valid_sample(x[text_column], x['audio']))
-
-    # Take the first 100 examples after filtering
+    text_column = "text"
     dataset = dataset.take(N_SAMPLES)
-
-    # Cast the 'audio' column to the desired sampling rate
     dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
-
-    # Convert to list and return
     dataset = list(dataset)
+    for sample in dataset:
+        sample["text"] = sample["text"].lower()
     return dataset, text_column
 
-def load_Librispeech_ASR_clean():
-    dataset = load_dataset("librispeech_asr", "
+def load_Librispeech_ASR_other():
+    dataset = load_dataset("librispeech_asr", "other", split="test", streaming=True, token=True, trust_remote_code=True)
     print(next(iter(dataset)))
     text_column = "text"
     dataset = dataset.take(N_SAMPLES)
```
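The two LibriSpeech loaders differ only in the configuration name (`"clean"` vs. `"other"`); both stream the test split, keep the first `N_SAMPLES` examples, resample audio to 16 kHz, and lowercase the references. A condensed sketch of that shared recipe — `load_librispeech` is a hypothetical generalization, not part of the commit, and the commit's `token=True` / `trust_remote_code=True` arguments are omitted here:

```python
from datasets import load_dataset, Audio

N_SAMPLES = 30

def load_librispeech(config: str):
    # Streaming avoids downloading the full split; only N_SAMPLES examples are pulled.
    dataset = load_dataset("librispeech_asr", config, split="test", streaming=True)
    text_column = "text"
    dataset = dataset.take(N_SAMPLES)
    # Decode audio at the 16 kHz sampling rate the ASR models expect.
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
    dataset = list(dataset)  # materialize the streamed examples
    for sample in dataset:
        sample["text"] = sample["text"].lower()
    return dataset, text_column

dataset, text_column = load_librispeech("other")  # or "clean"
```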
```diff
@@ -191,22 +212,6 @@ def load_Librispeech_ASR_clean():
         sample["text"] = sample["text"].lower()
     return dataset, text_column
 
-def is_valid_sample(text, audio):
-    # Check if 'normalized_text' is valid
-    text = text.strip()
-    if text == "" or text == "ignore time segment in scoring":
-        return False
-
-    # Check if the 'audio' array is valid (not empty and meets length criteria)
-    if len(audio['array']) == 0: # Audio is empty
-        return False
-
-    # Optionally, check if the audio duration is within a certain range
-    duration = audio['array'].size / audio['sampling_rate']
-    if duration < 1.0 or duration > 60.0: # Example: Filter out audio shorter than 1 second or longer than 60 seconds
-        return False
-
-    return True
 
 
 # MODEL LOADERS
```