j-tobias
committed on
Commit
·
db6e0bb
1
Parent(s):
15f66cd
added new model
Browse files
- app.py +2 -2
- cards.txt +11 -0
- processing.py +11 -0
app.py
CHANGED
|
@@ -26,7 +26,7 @@ login(hf_token)
|
|
| 26 |
|
| 27 |
|
| 28 |
# GENERAL OPTIONS FOR MODELS AND DATASETS
|
| 29 |
-
MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h","openai/whisper-large-v2"]
|
| 30 |
DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "Librispeech ASR other", "OWN Recording/Sample"]
|
| 31 |
|
| 32 |
# HELPER FUNCTIONS
|
|
@@ -43,7 +43,7 @@ def get_card(selected_model:str)->str:
|
|
| 43 |
if "ID: "+selected_model in card:
|
| 44 |
return card
|
| 45 |
|
| 46 |
-
return "Unknown Model"
|
| 47 |
|
| 48 |
def is_own(selected_option):
|
| 49 |
"""
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
# GENERAL OPTIONS FOR MODELS AND DATASETS
|
| 29 |
+
MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h","openai/whisper-large-v2","facebook/hf-seamless-m4t-medium"]
|
| 30 |
DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "Librispeech ASR other", "OWN Recording/Sample"]
|
| 31 |
|
| 32 |
# HELPER FUNCTIONS
|
|
|
|
| 43 |
if "ID: "+selected_model in card:
|
| 44 |
return card
|
| 45 |
|
| 46 |
+
return "## Unknown Model"
|
| 47 |
|
| 48 |
def is_own(selected_option):
|
| 49 |
"""
|
cards.txt
CHANGED
|
@@ -34,4 +34,15 @@
|
|
| 34 |
- Model Paper: [Robust Speech Recognition via Large-Scale Weak Supervision](https://arxiv.org/abs/2212.04356)
|
| 35 |
- Training Data: The models are trained on 680,000 hours of audio and the corresponding transcripts collected from the internet. 65% of this data (or 438,000 hours) represents English-language audio and matched English transcripts, roughly 18% (or 126,000 hours) represents non-English audio and English transcripts, while the final 17% (or 117,000 hours) represents non-English audio and the corresponding transcript. This non-English data represents 98 different languages.
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
(evaluating this model might take a while due to its size)
|
|
|
|
| 34 |
- Model Paper: [Robust Speech Recognition via Large-Scale Weak Supervision](https://arxiv.org/abs/2212.04356)
|
| 35 |
- Training Data: The models are trained on 680,000 hours of audio and the corresponding transcripts collected from the internet. 65% of this data (or 438,000 hours) represents English-language audio and matched English transcripts, roughly 18% (or 126,000 hours) represents non-English audio and English transcripts, while the final 17% (or 117,000 hours) represents non-English audio and the corresponding transcript. This non-English data represents 98 different languages.
|
| 36 |
|
| 37 |
+
(evaluating this model might take a while due to its size)
|
| 38 |
+
@@
|
| 39 |
+
#### HF Seamless M4T Medium
|
| 40 |
+
- ID: facebook/hf-seamless-m4t-medium
|
| 41 |
+
- Hugging Face: [model](https://huggingface.co/facebook/hf-seamless-m4t-medium)
|
| 42 |
+
- Creator: facebook
|
| 43 |
+
- Finetuned: No
|
| 44 |
+
- Model Size: 1.2 B Parameters
|
| 45 |
+
- Model Paper: [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf)
|
| 46 |
+
- Training Data: ?
|
| 47 |
+
|
| 48 |
(evaluating this model might take a while due to its size)
|
processing.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
| 2 |
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
| 3 |
from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
|
| 4 |
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
|
|
|
|
| 5 |
|
| 6 |
# Import Libraries to access Datasets
|
| 7 |
from datasets import load_dataset
|
|
@@ -251,6 +252,9 @@ def load_model(model_id:str):
|
|
| 251 |
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
|
| 252 |
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
|
| 253 |
model.config.forced_decoder_ids = None
|
|
|
|
|
|
|
|
|
|
| 254 |
else: # In case no model has been selected the Whipser-Tiny.En is selected - just for completeness
|
| 255 |
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
|
| 256 |
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
|
|
@@ -291,6 +295,13 @@ def model_compute(model, processor, sample, model_id):
|
|
| 291 |
transcription = processor.tokenizer.normalize(transcription[0])
|
| 292 |
print("TRANSCRIPTION Whisper Large v2: ", transcription)
|
| 293 |
return transcription
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
else: # In case no model has been selected the Whipser-Tiny.En is selected - just for completeness
|
| 295 |
sample = sample["audio"]
|
| 296 |
input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
|
|
|
|
| 2 |
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
| 3 |
from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
|
| 4 |
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
|
| 5 |
+
from transformers import AutoProcessor, SeamlessM4TModel
|
| 6 |
|
| 7 |
# Import Libraries to access Datasets
|
| 8 |
from datasets import load_dataset
|
|
|
|
| 252 |
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
|
| 253 |
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
|
| 254 |
model.config.forced_decoder_ids = None
|
| 255 |
+
elif model_id == "facebook/hf-seamless-m4t-medium":
|
| 256 |
+
processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")
|
| 257 |
+
model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium")
|
| 258 |
else: # In case no model has been selected the Whipser-Tiny.En is selected - just for completeness
|
| 259 |
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
|
| 260 |
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
|
|
|
|
| 295 |
transcription = processor.tokenizer.normalize(transcription[0])
|
| 296 |
print("TRANSCRIPTION Whisper Large v2: ", transcription)
|
| 297 |
return transcription
|
| 298 |
+
elif model_id == "facebook/hf-seamless-m4t-medium":
|
| 299 |
+
sample = sample["audio"]
|
| 300 |
+
input_data = processor(audios=sample["array"], return_tensors="pt")
|
| 301 |
+
output_tokens = model.generate(**input_data, tgt_lang="eng", generate_speech=False)
|
| 302 |
+
print(output_tokens)
|
| 303 |
+
transcription = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
|
| 304 |
+
return transcription
|
| 305 |
else: # In case no model has been selected the Whipser-Tiny.En is selected - just for completeness
|
| 306 |
sample = sample["audio"]
|
| 307 |
input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
|