Spaces:
Running
on
Zero
Running
on
Zero
| import spaces | |
| import torch | |
| import gradio as gr | |
| import numpy as np | |
| from transformers import AutoProcessor, CsmForConditionalGeneration | |
# Hugging Face Hub checkpoint that this demo serves.
model_id = "Marvis-AI/marvis-tts-250m-v0.1-transformers"

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the processor and the model once, at import time, so that every
# request reuses them; the model is placed directly on `device`.
processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(
    model_id,
    device_map=device,
)
@spaces.GPU
def tts(text: str) -> tuple[int, np.ndarray]:
    """Synthesize speech for ``text`` with the module-level Marvis TTS model.

    The function is wrapped in ``spaces.GPU`` so that, on a ZeroGPU Space,
    a GPU is attached for the duration of the call.

    Args:
        text: Text to speak. The demo's UI tells users to prefix it with a
            speaker tag such as ``[0]`` or ``[1]``.

    Returns:
        A ``(sample_rate, samples)`` tuple suitable for a ``gr.Audio``
        output: the sample rate in Hz and the generated waveform as a
        NumPy array.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    # Reject blank input up front with a message Gradio shows to the user,
    # instead of handing an empty prompt to the processor and model.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    inputs = processor(
        text,
        add_special_tokens=True,
        return_tensors="pt",
    ).to(device)

    # Drop token_type_ids if the processor produced them; they are not
    # forwarded to generate().
    inputs.pop("token_type_ids", None)

    # generate audio
    audio = model.generate(**inputs, output_audio=True)

    # First (only) item of the batch. Detach and cast to float32 before the
    # NumPy conversion so a grad-tracking or half-precision tensor cannot
    # make `.numpy()` fail.
    audio_np = audio[0].detach().cpu().float().numpy()

    # NOTE(review): 24 kHz is presumably the model's output sample rate —
    # confirm against the model card.
    return (24_000, audio_np)
# Build the demo page: a text box and a button on the left, the generated
# audio on the right.
with gr.Blocks(title="Marvis TTS Demo") as demo:
    gr.Markdown(
        "## 🎙️ Marvis TTS Demo\n"
        "Try out Marvis TTS with different speakers using `[0]`, `[1]`, etc. before your text!"
    )

    with gr.Row():
        # Left column: input text and the trigger button.
        with gr.Column():
            text_input = gr.Textbox(
                label="Text Input",
                placeholder="Enter text here... (prefix with [0], [1], etc. to choose speaker)",
                lines=3,
                value="[0] Marvis TTS is a new text-to-speech model that provides fast streaming on edge devices.",
            )
            generate_btn = gr.Button("Generate Speech")

        # Right column: player for the synthesized audio.
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio")

    # Clicking the button sends the text box content to tts() and plays
    # the returned audio.
    generate_btn.click(fn=tts, inputs=text_input, outputs=audio_output)


# Start the Gradio server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()