Spaces:
Runtime error
Runtime error
File size: 3,881 Bytes
7e66c78 6137274 7e66c78 bbea18f 7e66c78 278ac29 7e66c78 278ac29 7e66c78 6137274 7e66c78 278ac29 7e66c78 96b0eae 7e66c78 96b0eae 7e66c78 0f7c3d6 7e66c78 bf7fc52 7e66c78 bf7fc52 7e66c78 bf7fc52 7e66c78 96b0eae 7e66c78 bbea18f 7e66c78 7a56e38 f0ac9ac 2f7bc9c 7e66c78 f0ac9ac 7e66c78 bbea18f 7e66c78 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
import gradio as gr
import torch
import numpy as np
from soprano import SopranoTTS
from scipy.io.wavfile import write as wav_write
import tempfile
import os
import spaces
# Device selection.
# NOTE(review): on Hugging Face "ZeroGPU" Spaces the CUDA device is only
# attached inside functions decorated with @spaces.GPU, so a hard
# `assert torch.cuda.is_available()` at import time crashes the app before
# any request is served.  The fallback expression below already handles the
# no-GPU case, which made the assert both redundant and harmful; it has
# been removed so the app degrades to CPU instead of failing to start.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

# Process-wide SopranoTTS singleton; created lazily by load_model() so the
# heavy model load happens on first request, not at import.
model = None
def load_model():
    """Return the process-wide SopranoTTS instance, building it on first use.

    Subsequent calls reuse the cached instance, so the expensive model
    construction only ever happens once per process.
    """
    global model
    if model is not None:
        return model
    model = SopranoTTS(
        backend="auto",
        device=DEVICE,
        cache_size_mb=100,
        decoder_batch_size=1,
    )
    return model
# Sample rate in Hz used when packaging audio for playback and WAV export;
# presumably the Soprano model's native output rate — TODO confirm upstream.
SAMPLE_RATE = 32000
@spaces.GPU
def tts_stream(text, temperature, top_p, repetition_penalty, state):
    """Synthesize `text` and yield a (sample_rate, waveform) pair for Gradio.

    Also yields the raw waveform as the new session state so other
    callbacks (e.g. a download handler) can reuse it without re-inference.
    Yields (None, state) unchanged when the input is blank.
    """
    tts = load_model()

    # Blank input: nothing to synthesize, leave the previous state intact.
    if not text.strip():
        yield None, state
        return

    result = tts.infer(
        text,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
    )
    waveform = result.cpu().numpy()
    yield (SAMPLE_RATE, waveform), waveform
def save_audio(state):
    """Write the cached waveform to a temporary .wav file and return its path.

    Returns None when no audio has been generated yet (state is None or
    empty).  The caller (Gradio's File component) is responsible for the
    temp file's lifetime.
    """
    if state is None or len(state) == 0:
        return None
    handle, wav_path = tempfile.mkstemp(suffix=".wav")
    # scipy reopens the file by path; we only needed mkstemp for a safe name.
    os.close(handle)
    wav_write(wav_path, SAMPLE_RATE, state)
    return wav_path
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    # Holds the most recent raw waveform so it can be reused (e.g. by the
    # currently-disabled download button) without re-running inference.
    state_audio = gr.State(None)
    with gr.Row():
        # Left column: description, text input, sampling controls, trigger.
        with gr.Column():
            gr.Markdown("# Soprano Demo\n\nSoprano is an ultra‑lightweight, open‑source text‑to‑speech (TTS) model designed for real‑time, high‑fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency** and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**.\n\nGithub: https://github.com/ekwek1/soprano\n\nModel Weights: https://huggingface.co/ekwek/Soprano-80M")
            text_in = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to synthesize...",
                value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
                lines=4,
            )
            # Sampling knobs are forwarded verbatim to model.infer() in
            # tts_stream; defaults mirror its conservative settings.
            with gr.Accordion("Advanced options", open=False):
                temperature = gr.Slider(
                    0.0, 1.0, value=0.3, step=0.05, label="Temperature"
                )
                top_p = gr.Slider(
                    0.0, 1.0, value=0.95, step=0.01, label="Top-p"
                )
                repetition_penalty = gr.Slider(
                    1.0, 2.0, value=1.2, step=0.05, label="Repetition penalty"
                )
            gen_btn = gr.Button("Generate")
        # Right column: audio player and usage notes.
        with gr.Column():
            audio_out = gr.Audio(
                label="Output Audio",
                autoplay=True,
                streaming=False,
            )
            # Download UI is disabled for now; save_audio() is kept above
            # so it can be re-enabled together with these components.
            #download_btn = gr.Button("Download")
            #file_out = gr.File(label="Download file")
            gr.Markdown(
                "Usage tips:\n\n"
                "- Soprano works best when each sentence is between 2 and 15 seconds long.\n"
                "- Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them. Best results can be achieved by converting these into their phonetic form. (1+1 -> one plus one, etc)\n"
                "- If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation. You may also change the sampling settings for more varied results.\n"
                "- Avoid improper grammar such as not using contractions, multiple spaces, etc."
            )
    # Generate button drives the (generator) TTS callback; it both plays the
    # audio and refreshes the cached waveform state.
    gen_btn.click(
        fn=tts_stream,
        inputs=[text_in, temperature, top_p, repetition_penalty, state_audio],
        outputs=[audio_out, state_audio],
    )
    #download_btn.click(
    #    fn=save_audio,
    #    inputs=[state_audio],
    #    outputs=[file_out],
    #)
# queue() is required for generator callbacks / @spaces.GPU scheduling.
demo.queue()
demo.launch()