Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -1,8 +1,7 @@
-import
+import gradio as gr
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 from diffusers import DiffusionPipeline
 import torch
-import accelerate
 
 # Load the models and tokenizers
 translation_model_name = "google/madlad400-3b-mt"
@@ -15,7 +14,7 @@ diffusion_model_name = "stabilityai/stable-diffusion-xl-base-1.0"
 diffusion_pipeline = DiffusionPipeline.from_pretrained(diffusion_model_name, torch_dtype=torch.float16)
 diffusion_pipeline = diffusion_pipeline.to("cuda")
 
-# Define the translation and transcription pipeline
+# Define the translation and transcription pipeline
 translation_pipeline = pipeline("translation", model=translation_model, tokenizer=translation_tokenizer, device_map="auto")
 transcription_pipeline = pipeline("automatic-speech-recognition", model=transcription_model, device_map="auto")
 
@@ -28,22 +27,28 @@ def transcribe_and_translate_audio_fon(audio_path, num_images=1):
     translation_result = translation_pipeline(transcription_fon, source_lang="fon", target_lang="fr")
     translation_fr = translation_result[0]["translation_text"]
 
+    # Generate images based on the French translation using the diffusion model
     images = diffusion_pipeline(translation_fr, num_images_per_prompt=num_images)["images"]
 
     return images
 
-# Create a
-
-
-
-audio_file = st.file_uploader("Upload an audio file", type=["wav"])
-
-# Transcribe, translate and generate images
-if audio_file:
-    images = transcribe_and_translate_audio_fon(audio_file)
-    st.image(images[0])
-
-
-# Use Accelerate to distribute the computation across available GPUs
-#images = accelerate.launch(transcribe_and_translate_and_generate, audio_file="Fongbe_Speech_Dataset/Fongbe_Speech_Dataset/fongbe_speech_audio_files/wav/64_fongbe_6b36d45b77344caeb1c8d773303c9dcb_for_validation_2022-03-11-23-50-13.wav", num_images=2)
+# Create a Gradio interface
+def process_audio(audio, num_images):
+    images = transcribe_and_translate_audio_fon(audio, num_images)
+    return images
 
+# Define Gradio interface components
+audio_input = gr.Audio(source="upload", type="filepath", label="Upload an audio file")
+image_output = gr.Gallery(label="Generated Images").style(grid=2)
+num_images_input = gr.Slider(minimum=1, maximum=5, step=1, value=1, label="Number of Images")
+
+# Launch Gradio interface
+interface = gr.Interface(
+    fn=process_audio,
+    inputs=[audio_input, num_images_input],
+    outputs=image_output,
+    title="Fon Audio to Image Translation",
+    description="Upload an audio file in Fon, and the app will transcribe, translate to French, and generate related images."
+)
+
+interface.launch()
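
A note on what this commit fixes: the removed first line was a bare import statement, which is a SyntaxError in Python and by itself would keep the Space from starting, and the old Streamlit-style calls (st.file_uploader, st.image) had no matching streamlit import. The commit also drops the unused accelerate import; as far as I know, accelerate does not expose a Python-level accelerate.launch() function (multi-GPU launching is done via the accelerate launch CLI or notebook_launcher), which would explain why the old call was left commented out.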
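
One caveat on the translation call that survives this commit: the transformers translation pipeline does not take source_lang/target_lang keyword arguments (the multilingual keywords it recognizes are src_lang/tgt_lang), and google/madlad400-3b-mt is a T5-style model that, per its model card, selects the target language with a <2xx> token prepended to the input text rather than through language kwargs. A minimal sketch of a direct generate() call, assuming translation_model and translation_tokenizer are the objects loaded earlier in app.py and with an arbitrary 256-token output cap:

# MADLAD-400 reads the target language from a <2xx> prefix token,
# so prepend <2fr> to translate the Fon transcription into French.
# (translation_model / translation_tokenizer come from app.py above;
# max_new_tokens=256 is an arbitrary choice for this sketch.)
inputs = translation_tokenizer("<2fr> " + transcription_fon, return_tensors="pt").to(translation_model.device)
outputs = translation_model.generate(**inputs, max_new_tokens=256)
translation_fr = translation_tokenizer.decode(outputs[0], skip_special_tokens=True)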
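
Separately, if the Space keeps showing the Runtime error badge after this commit, one plausible culprit (an assumption; the page shows no log) is the Gradio version: gr.Audio(source="upload") and gr.Gallery(...).style(grid=2) are Gradio 3.x idioms, and Gradio 4.x removed both the source argument (renamed to sources, now a list) and the .style() method (its options moved into the component constructors). A sketch of the same components against the Gradio 4.x API:

import gradio as gr

# Gradio 4.x: source="upload" became sources=["upload"]
audio_input = gr.Audio(sources=["upload"], type="filepath", label="Upload an audio file")

# Gradio 4.x: .style(grid=2) was removed; pass columns to the constructor instead
image_output = gr.Gallery(label="Generated Images", columns=2)

num_images_input = gr.Slider(minimum=1, maximum=5, step=1, value=1, label="Number of Images")

Pinning a 3.x release in the Space's requirements.txt (e.g. gradio==3.50.2) would be the alternative to porting the components.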