# VoiceMM / app.py
from __future__ import annotations
import tempfile
from functools import lru_cache
from pathlib import Path
import gradio as gr
import numpy as np
import soundfile as sf
from kokoro import KPipeline
SPACE_TITLE = "VoiceMM TTS API"
SAMPLE_RATE = 24_000
MAX_CHARS = 450
VOICE_OPTIONS = {
"pf_dora": "Dora, feminina e clara",
"pm_alex": "Alex, masculina e neutra",
"pm_santa": "Santa, masculina e encorpada",
}
EXAMPLES = [
[
"Seu produto ficou pronto. Agora ele tem uma voz que passa confianca, ritmo e presenca.",
"pf_dora",
1.0,
],
[
"Apresente sua startup em vinte segundos: problema, promessa e chamada para acao.",
"pm_alex",
1.05,
],
[
"Bem-vindo ao VoiceMM. Transforme roteiro em audio com uma interface simples e bonita.",
"pm_santa",
0.95,
],
]
CSS = """
.gradio-container {
background:
radial-gradient(circle at top left, rgba(237, 180, 93, 0.18), transparent 30%),
radial-gradient(circle at top right, rgba(33, 181, 168, 0.12), transparent 28%),
#0f1518;
}
.voicelek-shell {
max-width: 1024px;
margin: 0 auto;
}
.voicelek-kicker {
letter-spacing: 0.18em;
text-transform: uppercase;
color: #efbf74;
font-size: 0.8rem;
}
"""
@lru_cache(maxsize=8)
def get_pipeline(lang_code: str) -> KPipeline:
    """Return a Kokoro pipeline for *lang_code*, caching up to 8 instances.

    Pipeline construction is expensive (model load), so repeated requests
    for the same language reuse the cached object.
    """
    return KPipeline(lang_code=lang_code)
def normalize_text(text: str) -> str:
    """Collapse all whitespace runs in *text* to single spaces and validate it.

    Raises:
        gr.Error: if the text is empty/whitespace-only, or if the collapsed
            text exceeds MAX_CHARS.
    """
    words = (text or "").split()
    if not words:
        raise gr.Error("Digite algum texto antes de gerar o audio.")
    collapsed = " ".join(words)
    if len(collapsed) > MAX_CHARS:
        raise gr.Error(
            f"Use no maximo {MAX_CHARS} caracteres por vez para manter a latencia boa no plano gratis."
        )
    return collapsed
def synthesize(text: str, voice: str, speed: float) -> tuple[str, str]:
    """Generate speech for *text* and return (wav file path, markdown summary).

    Raises:
        gr.Error: on empty/too-long input (via normalize_text) or when the
            model yields no audio chunks.
    """
    script = normalize_text(text)
    # The leading letter of the voice id doubles as the pipeline language
    # code (e.g. "pf_dora" -> "p") — see get_pipeline.
    pipeline = get_pipeline(voice[0])
    segments = [
        np.asarray(audio, dtype=np.float32)
        for _, _, audio in pipeline(script, voice=voice, speed=float(speed))
    ]
    if not segments:
        raise gr.Error("O modelo nao conseguiu gerar audio para esse texto.")
    waveform = np.concatenate(segments)
    # A fresh temp directory per request avoids filename collisions between
    # concurrent calls.
    wav_path = Path(tempfile.mkdtemp(prefix="voicelek_")) / "voicelek-output.wav"
    sf.write(wav_path, waveform, SAMPLE_RATE)
    seconds = len(waveform) / SAMPLE_RATE
    details = (
        f"**Voz:** {VOICE_OPTIONS[voice]} \n"
        f"**Velocidade:** {speed:.2f}x \n"
        f"**Entrada:** {len(script)} caracteres \n"
        f"**Duracao estimada:** {seconds:.1f}s"
    )
    return str(wav_path), details
# ---------------------------------------------------------------------------
# UI definition.
#
# FIX: `theme` and `css` were previously passed to demo.launch(), but in
# Gradio they are gr.Blocks() constructor parameters — launch() does not
# accept them, so launching raised TypeError and the styling never applied.
# They now live on the Blocks constructor. `footer_links` is not a Gradio
# parameter and was removed.
with gr.Blocks(
    title=SPACE_TITLE,
    theme=gr.themes.Soft(
        primary_hue="amber",
        secondary_hue="teal",
        neutral_hue="slate",
    ),
    css=CSS,
) as demo:
    with gr.Column(elem_classes="voicelek-shell"):
        gr.Markdown(
            """
<div class="voicelek-kicker">VoiceMM</div>

# API de TTS em portugues brasileiro

Esta Space foi pensada para ser o backend de um frontend estatico no GitHub Pages.
O endpoint publico principal e `"/synthesize"`.
""",
        )
        with gr.Row():
            with gr.Column(scale=3):
                text_input = gr.Textbox(
                    label="Texto",
                    lines=8,
                    max_lines=12,
                    placeholder="Cole aqui sua copy, roteiro, CTA ou locucao curta.",
                    value=EXAMPLES[0][0],
                )
            with gr.Column(scale=2):
                voice_input = gr.Dropdown(
                    choices=[(label, key) for key, label in VOICE_OPTIONS.items()],
                    value="pf_dora",
                    label="Voz",
                )
                speed_input = gr.Slider(
                    minimum=0.8,
                    maximum=1.25,
                    value=1.0,
                    step=0.05,
                    label="Velocidade",
                )
                generate_button = gr.Button("Gerar audio", variant="primary")
        audio_output = gr.Audio(
            label="Saida",
            type="filepath",
            format="wav",
        )
        details_output = gr.Markdown(
            value="Pronto para receber chamadas via navegador ou direto pela API do Gradio."
        )
        gr.Examples(
            examples=EXAMPLES,
            inputs=[text_input, voice_input, speed_input],
            label="Exemplos rapidos",
        )
    # api_name exposes this handler as the public "/synthesize" endpoint for
    # the static GitHub Pages frontend (via the Gradio client).
    generate_button.click(
        fn=synthesize,
        inputs=[text_input, voice_input, speed_input],
        outputs=[audio_output, details_output],
        api_name="synthesize",
    )

# Single worker + small queue keeps latency predictable on free-tier hardware.
demo.queue(default_concurrency_limit=1, max_size=16)

if __name__ == "__main__":
    demo.launch()