Spaces:

justus-tobias
/

ASR_Model_Comparison

Paused

App Files Files Community

j-tobias committed on Aug 18, 2024

Commit

db6e0bb

1 Parent(s): 15f66cd

added new model

Browse files

Files changed (3) hide show

app.py +2 -2
cards.txt +11 -0
processing.py +11 -0

app.py CHANGED Viewed

@@ -26,7 +26,7 @@ login(hf_token)
 # GENERAL OPTIONS FOR MODELS AND DATASETS
-MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h","openai/whisper-large-v2"]
 DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "Librispeech ASR other", "OWN Recording/Sample"]
 # HELPER FUNCTIONS
@@ -43,7 +43,7 @@ def get_card(selected_model:str)->str:
         if "ID: "+selected_model in card:
             return card
-    return "Unknown Model"
 def is_own(selected_option):
     """

 # GENERAL OPTIONS FOR MODELS AND DATASETS
+MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h","openai/whisper-large-v2","facebook/hf-seamless-m4t-medium"]
 DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "Librispeech ASR other", "OWN Recording/Sample"]
 # HELPER FUNCTIONS
         if "ID: "+selected_model in card:
             return card
+    return "## Unknown Model"
 def is_own(selected_option):
     """

cards.txt CHANGED Viewed

@@ -34,4 +34,15 @@
 - Model Paper: [Robust Speech Recognition via Large-Scale Weak Supervision](https://arxiv.org/abs/2212.04356)
 - Training Data: The models are trained on 680,000 hours of audio and the corresponding transcripts collected from the internet. 65% of this data (or 438,000 hours) represents English-language audio and matched English transcripts, roughly 18% (or 126,000 hours) represents non-English audio and English transcripts, while the final 17% (or 117,000 hours) represents non-English audio and the corresponding transcript. This non-English data represents 98 different languages.
 (evaluating this model might take a while due to its size)

 - Model Paper: [Robust Speech Recognition via Large-Scale Weak Supervision](https://arxiv.org/abs/2212.04356)
 - Training Data: The models are trained on 680,000 hours of audio and the corresponding transcripts collected from the internet. 65% of this data (or 438,000 hours) represents English-language audio and matched English transcripts, roughly 18% (or 126,000 hours) represents non-English audio and English transcripts, while the final 17% (or 117,000 hours) represents non-English audio and the corresponding transcript. This non-English data represents 98 different languages.
+(evaluating this model might take a while due to its size)
+@@
+#### HF Seamless M4T Medium
+- ID: facebook/hf-seamless-m4t-medium
+- Hugging Face: [model](https://huggingface.co/facebook/hf-seamless-m4t-medium)
+- Creator: facebook
+- Finetuned: No
+- Model Size: 1.2 B Parameters
+- Model Paper: [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf)
+- Training Data: ?
 (evaluating this model might take a while due to its size)

processing.py CHANGED Viewed

@@ -2,6 +2,7 @@
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
 from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
 # Import Libraries to access Datasets
 from datasets import load_dataset
@@ -251,6 +252,9 @@ def load_model(model_id:str):
         processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
         model.config.forced_decoder_ids = None
     else: # In case no model has been selected, Whisper-Tiny.En is used - just for completeness
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
         processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
@@ -291,6 +295,13 @@ def model_compute(model, processor, sample, model_id):
         transcription = processor.tokenizer.normalize(transcription[0])
         print("TRANSCRIPTION Whisper Large v2: ", transcription)
         return transcription
     else: # In case no model has been selected, Whisper-Tiny.En is used - just for completeness
         sample = sample["audio"]
         input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features

 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
 from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+from transformers import AutoProcessor, SeamlessM4TModel
 # Import Libraries to access Datasets
 from datasets import load_dataset
         processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
         model.config.forced_decoder_ids = None
+    elif model_id == "facebook/hf-seamless-m4t-medium":
+        processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")
+        model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium")
     else: # In case no model has been selected, Whisper-Tiny.En is used - just for completeness
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
         processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
         transcription = processor.tokenizer.normalize(transcription[0])
         print("TRANSCRIPTION Whisper Large v2: ", transcription)
         return transcription
+    elif model_id == "facebook/hf-seamless-m4t-medium":
+        sample = sample["audio"]
+        input_data = processor(audios=sample["array"], return_tensors="pt")
+        output_tokens = model.generate(**input_data, tgt_lang="eng", generate_speech=False)
+        print(output_tokens)
+        transcription = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
+        return transcription
     else: # In case no model has been selected, Whisper-Tiny.En is used - just for completeness
         sample = sample["audio"]
         input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features