Kokoro-TTS / app.py
Pendrokar's picture
Lojban second in order
a26a44c verified
raw
history blame
12.8 kB
import spaces
from kokoro import KModel, KPipeline
import gradio as gr
import os
import random
import torch
import re
# Any Space whose SPACE_ID does not start with 'hexgrad/' is treated as a
# duplicate; this also covers running with no SPACE_ID set at all.
IS_DUPLICATE = not os.getenv('SPACE_ID', '').startswith('hexgrad/')
CUDA_AVAILABLE = torch.cuda.is_available()
if not IS_DUPLICATE:
    import kokoro
    import misaki
    print('DEBUG', kokoro.__version__, CUDA_AVAILABLE, misaki.__version__)

# Probe once at start-up whether phonemizer and its espeak backend are usable.
# The probe instance is not kept: the name below is left as None, exactly as
# the original code left it after a successful probe.
global_phonemizer_en = None
try:
    import phonemizer
    phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
    PHONEMIZER_AVAILABLE = True
except (ImportError, RuntimeError) as e:
    # ImportError: the phonemizer package is missing.
    # RuntimeError: phonemizer is importable but the espeak backend cannot be
    # initialised (e.g. espeak is not installed). Degrade to "unavailable"
    # instead of crashing the whole app at import time.
    print(f"Phonemizer unavailable: {e}")
    PHONEMIZER_AVAILABLE = False

# Duplicated Spaces get no character cap; otherwise input is cut at 5000 chars.
CHAR_LIMIT = None if IS_DUPLICATE else 5000

# One model per device, keyed by "is GPU": False -> CPU (always present),
# True -> CUDA (only when CUDA is available).
models = {gpu: KModel().to('cuda' if gpu else 'cpu').eval() for gpu in [False] + ([True] if CUDA_AVAILABLE else [])}

# One pipeline per language code; looked up elsewhere by the first letter of
# the voice id ('a' / 'b'). model=False: no model is attached to the pipeline
# here - inference goes through `models` instead.
pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in 'ab'}

# Pin the pronunciation of the word "kokoro" in each pipeline's lexicon.
pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
# Language mappings similar to StyleTTS
# Keys are the language codes accepted by text_to_ipa(): a code found here is
# passed as `language=` to phonemizer's EspeakBackend. The values are
# human-readable names; nothing in this file reads them.
LANG_NAMES = {
    'en-us': 'english',
    'cs': 'czech',
    'da': 'danish',
    'nl': 'dutch',
    'et': 'estonian',
    'fi': 'finnish',
    'fr': 'french',
    'de': 'german',
    'el': 'greek',
    'it': 'italian',
    'no': 'norwegian',
    'pl': 'polish',
    'pt': 'portuguese',
    'ru': 'russian',
    'sl': 'slovene',
    'es': 'spanish',
    'sv': 'swedish',
    'tr': 'turkish',
}
def text_to_ipa(text, lang='en-us'):
    """Convert text to IPA using phonemizer or return original text.

    Spans written as ``(text)[ipa]`` are cut out before phonemization and
    put back afterwards so that user-supplied IPA is not re-phonemized.

    * ``lang == 'jb'``: the text is transcribed with ``lojban.lojban2ipa``.
    * ``lang`` in ``LANG_NAMES``: the text is phonemized with an espeak
      backend created for that language.
    * any other ``lang``: the text is not phonemized.

    In the first two cases the result is wrapped as ``[text](/ipa/)``.

    Args:
        text: input text, optionally containing ``(text)[ipa]`` spans.
        lang: language code; ``'jb'`` or a key of ``LANG_NAMES``.

    Returns:
        The converted string. When phonemizer is unavailable, or when any
        exception is raised during conversion, the input is returned
        unmodified.
    """
    if not PHONEMIZER_AVAILABLE:
        return text

    # Keep the caller's text so the error path can return it untouched even
    # after the placeholder substitution below has rewritten `text`.
    original_text = text
    try:
        # Handle IPA sections within brackets: "(" text ")" "[" ipa "]".
        # The "[" must be escaped; unescaped it opens a character class and
        # the pattern never matches the documented syntax.
        regex = r"\([^)]*\)\[[^\]]*\]"
        ipa_sections = re.findall(regex, text)
        print(text)
        text = re.sub(regex, '()[]', text)
        print(text)

        if lang == 'jb':
            # Lojban language
            import lojban
            ps = f'[{text}](/' + lojban.lojban2ipa(text, 'vits') + '/)'
        elif lang in LANG_NAMES:
            local_phonemizer = phonemizer.backend.EspeakBackend(language=lang, preserve_punctuation=True, with_stress=True)
            ps = local_phonemizer.phonemize([text])
            ps = f'[{text}](/' + ps[0] + '/)'
        else:
            ps = text
        print(ps)

        # Add back IPA sections, one placeholder per section, in order.
        # NOTE(review): the original only looked for the spaced form
        # '( )[ ]' - presumably what the phonemizer emits for the inserted
        # '()[]'; confirm. The unspaced form is used as a fallback so a
        # section is never silently dropped.
        for ipa in ipa_sections:
            placeholder = '( )[ ]' if '( )[ ]' in ps else '()[]'
            ps = ps.replace(placeholder, ipa, 1)
        print(ps)
        return ps
    except Exception as e:
        # Deliberate best-effort: any failure falls back to the raw input.
        print(f"Phonemizer error: {e}")
        return original_text
@spaces.GPU(duration=30)
def forward_gpu(ps, ref_s, speed):
    """Run the CUDA-side model (``models[True]``) on one phoneme chunk.

    The arguments are forwarded unchanged to the model call and its result
    is returned as-is.
    NOTE(review): ``spaces.GPU(duration=30)`` presumably requests a ZeroGPU
    slot for up to 30 seconds - confirm against the `spaces` docs.
    """
    gpu_model = models[True]
    return gpu_model(ps, ref_s, speed)
def generate_first(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE, lang='en-us'):
    """Synthesize only the first chunk the pipeline yields for ``text``.

    Args:
        text: input text; cut to CHAR_LIMIT characters (after stripping)
            when a limit is configured.
        voice: voice id; its first character selects the pipeline.
        speed: forwarded to the pipeline and to the model.
        use_gpu: request the GPU model; ignored when CUDA is unavailable.
        lang: language code; anything other than 'en-us' is first passed
            through text_to_ipa().

    Returns:
        ``((24000, samples), ps)`` for the first chunk, where ``ps`` is the
        chunk's phoneme string, or ``(None, '')`` when the pipeline yields
        nothing.

    Raises:
        gr.Error: when inference on the CPU model raises a Gradio error.
    """
    if CHAR_LIMIT is not None:
        text = text.strip()[:CHAR_LIMIT]

    # Convert text to IPA if not English
    if lang != 'en-us':
        text = text_to_ipa(text, lang)

    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    on_gpu = use_gpu and CUDA_AVAILABLE

    for _, ps, _ in pipeline(text, voice, speed):
        # The voice pack is indexed by phoneme-string length minus one.
        ref_s = pack[len(ps)-1]
        try:
            audio = forward_gpu(ps, ref_s, speed) if on_gpu else models[False](ps, ref_s, speed)
        except gr.exceptions.Error as e:
            if not on_gpu:
                # Already on CPU: nothing left to fall back to.
                raise gr.Error(e)
            gr.Warning(str(e))
            gr.Info('Retrying with CPU. To avoid this error, change Hardware to CPU.')
            audio = models[False](ps, ref_s, speed)
        # Only the first chunk is ever returned.
        return (24000, audio.numpy()), ps

    return None, ''
# Arena API
def predict(text, voice='af_heart', speed=1, lang='en-us'):
    """ Convert the text into speech using Kokoro American English and British English voice models.

    Always runs on the CPU model and returns only the audio part of
    generate_first()'s result.

    Args:
        text: string; accepts IPA within ()[] brackets
        voice: Literal['af_heart', 'af_bella', 'af_nicole', 'af_aoede', 'af_kore', 'af_sarah', 'af_nova', 'af_sky', 'af_alloy', 'af_jessica', 'af_river', 'am_michael', 'am_fenrir', 'am_puck', 'am_echo', 'am_eric', 'am_liam', 'am_onyx', 'am_santa', 'am_adam', 'bf_emma', 'bf_isabella', 'bf_alice', 'bf_lily', 'bm_george', 'bm_fable', 'bm_lewis', 'bm_daniel']; voice model
        speed: talkback speed; 0.5-2
        lang: Literal['en-us', 'cs', 'da', 'nl', 'et', 'fi', 'fr', 'de', 'el', 'it', 'no', 'pl', 'pt', 'ru', 'sl', 'es', 'sv', 'tr', 'jb']; ISO 639-1 code for the text language; 'jb' is a valid code for Lojban; defaults to 'en-us'

    Returns: the output audio as a (24000, samples) tuple, or None when no audio was produced
    """
    # `lang` is last in the signature so existing positional callers
    # (text, voice, speed) keep working.
    return generate_first(text, voice, speed, use_gpu=False, lang=lang)[0]
def tokenize_first(text, voice='af_heart', lang='en-us'):
    """Return the phoneme string of the first chunk produced for ``text``.

    No audio is synthesized. The text is trimmed to CHAR_LIMIT (when set)
    and, for any language other than 'en-us', converted with text_to_ipa()
    first. Returns '' when the pipeline yields nothing.
    """
    if CHAR_LIMIT is not None:
        text = text.strip()[:CHAR_LIMIT]

    # Convert text to IPA if not English or if language is specified
    if lang != 'en-us':
        text = text_to_ipa(text, lang)

    pipeline = pipelines[voice[0]]
    first_chunk = next(iter(pipeline(text, voice)), None)
    if first_chunk is None:
        return ''
    _, ps, _ = first_chunk
    return ps
def generate_all(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE, lang='en-us'):
    """Stream audio for every chunk the pipeline yields for ``text``.

    Generator: yields one ``(24000, samples)`` tuple per chunk. Right after
    the first chunk an extra ``(24000, zeros(1))`` item is yielded.

    Args:
        text: input text; cut to CHAR_LIMIT characters (after stripping)
            when a limit is configured.
        voice: voice id; its first character selects the pipeline.
        speed: forwarded to the pipeline and to the model.
        use_gpu: request the GPU model; ignored when CUDA is unavailable.
        lang: language code; anything other than 'en-us' is first passed
            through text_to_ipa().

    Raises:
        gr.Error: when inference on the CPU model raises a Gradio error.
    """
    if CHAR_LIMIT is not None:
        text = text.strip()[:CHAR_LIMIT]

    # Convert text to IPA if not English or if language is specified
    if lang != 'en-us':
        text = text_to_ipa(text, lang)

    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    use_gpu = use_gpu and CUDA_AVAILABLE

    for chunk_index, (_, ps, _) in enumerate(pipeline(text, voice, speed)):
        # The voice pack is indexed by phoneme-string length minus one.
        ref_s = pack[len(ps)-1]
        try:
            audio = forward_gpu(ps, ref_s, speed) if use_gpu else models[False](ps, ref_s, speed)
        except gr.exceptions.Error as e:
            if not use_gpu:
                raise gr.Error(e)
            # Only this chunk falls back to CPU; the next chunk tries the
            # GPU again because use_gpu is left unchanged.
            gr.Warning(str(e))
            gr.Info('Switching to CPU')
            audio = models[False](ps, ref_s, speed)
        yield 24000, audio.numpy()
        if chunk_index == 0:
            # One-sample silent item emitted once, after the first chunk.
            yield 24000, torch.zeros(1).numpy()
# Load the pool of sample quotes once at import time: one quote per line of
# en.txt, with surrounding whitespace removed. The encoding is explicit so the
# result does not depend on the host's locale.
with open('en.txt', 'r', encoding='utf-8') as r:
    random_quotes = [line.strip() for line in r]
def get_random_quote():
    """Return one entry picked at random from the preloaded ``random_quotes``."""
    quote = random.choice(random_quotes)
    return quote
def get_gatsby():
    """Return the contents of ``gatsby5k.md`` with surrounding whitespace stripped.

    The file is read relative to the current working directory and decoded
    as UTF-8 explicitly, so the result does not depend on the host's locale.
    """
    with open('gatsby5k.md', 'r', encoding='utf-8') as r:
        return r.read().strip()
def get_frankenstein():
    """Return the contents of ``frankenstein5k.md`` with surrounding whitespace stripped.

    The file is read relative to the current working directory and decoded
    as UTF-8 explicitly, so the result does not depend on the host's locale.
    """
    with open('frankenstein5k.md', 'r', encoding='utf-8') as r:
        return r.read().strip()
# Voice dropdown entries: display label -> voice id.
# The first character of a voice id ('a' or 'b') is the key used to pick the
# pipeline in `pipelines`. Entry order is the order shown in the dropdown.
CHOICES = {
    '🇺🇸 🚺 Heart ❤️': 'af_heart',
    '🇺🇸 🚺 Bella 🔥': 'af_bella',
    '🇺🇸 🚺 Nicole 🎧': 'af_nicole',
    '🇺🇸 🚺 Aoede': 'af_aoede',
    '🇺🇸 🚺 Kore': 'af_kore',
    '🇺🇸 🚺 Sarah': 'af_sarah',
    '🇺🇸 🚺 Nova': 'af_nova',
    '🇺🇸 🚺 Sky': 'af_sky',
    '🇺🇸 🚺 Alloy': 'af_alloy',
    '🇺🇸 🚺 Jessica': 'af_jessica',
    '🇺🇸 🚺 River': 'af_river',
    '🇺🇸 🚹 Michael': 'am_michael',
    '🇺🇸 🚹 Fenrir': 'am_fenrir',
    '🇺🇸 🚹 Puck': 'am_puck',
    '🇺🇸 🚹 Echo': 'am_echo',
    '🇺🇸 🚹 Eric': 'am_eric',
    '🇺🇸 🚹 Liam': 'am_liam',
    '🇺🇸 🚹 Onyx': 'am_onyx',
    '🇺🇸 🚹 Santa': 'am_santa',
    '🇺🇸 🚹 Adam': 'am_adam',
    '🇬🇧 🚺 Emma': 'bf_emma',
    '🇬🇧 🚺 Isabella': 'bf_isabella',
    '🇬🇧 🚺 Alice': 'bf_alice',
    '🇬🇧 🚺 Lily': 'bf_lily',
    '🇬🇧 🚹 George': 'bm_george',
    '🇬🇧 🚹 Fable': 'bm_fable',
    '🇬🇧 🚹 Lewis': 'bm_lewis',
    '🇬🇧 🚹 Daniel': 'bm_daniel',
}
# Call load_voice() for every listed voice at import time, on the pipeline
# selected by the voice id's first letter.
# NOTE(review): presumably this warms a per-pipeline voice cache so the first
# request does not pay the load cost - confirm against KPipeline.load_voice.
for v in CHOICES.values():
    pipelines[v[0]].load_voice(v)
# Markdown help text rendered inside the "Output Tokens" accordion below.
TOKEN_NOTE = '''
💡 Customize pronunciation with Markdown link syntax and /slashes/ like `[Kokoro](/kˈOkəɹO/)`
💬 To adjust intonation, try punctuation `;:,.!?—…"()“”` or stress `ˈ` and `ˌ`
⬇️ Lower stress `[1 level](-1)` or `[2 levels](-2)`
⬆️ Raise stress 1 level `[or](+2)` 2 levels (only works on less stressed, usually short words)
'''
# "Generate" tab: a non-streaming audio output, the Generate button, and an
# accordion holding the phoneme output, the Tokenize button and the help text.
# The component variables are module-level so the event wiring in the main
# `app` Blocks can refer to them.
with gr.Blocks() as generate_tab:
    out_audio = gr.Audio(label='Output Audio', interactive=False, streaming=False, autoplay=True)
    generate_btn = gr.Button('Generate', variant='primary')
    with gr.Accordion('Output Tokens', open=True):
        out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 context length.')
        tokenize_btn = gr.Button('Tokenize', variant='secondary')
        gr.Markdown(TOKEN_NOTE)
        # Created with visible=False: exists only so predict() can be wired
        # to an event.
        predict_btn = gr.Button('Predict', variant='secondary', visible=False)
# Markdown note for the Stream tab: the Gradio caveat is always present; the
# two character-cap paragraphs are added only when a CHAR_LIMIT is set.
# Paragraphs are separated by a blank line.
STREAM_NOTE = '\n\n'.join(
    ['⚠️ There is an unknown Gradio bug that might yield no audio the first time you click `Stream`.']
    + (
        [
            f'✂️ Each stream is capped at {CHAR_LIMIT} characters.',
            '🚀 Want more characters? You can [use Kokoro directly](https://huggingface.co/hexgrad/Kokoro-82M#usage) or duplicate this space:',
        ]
        if CHAR_LIMIT is not None
        else []
    )
)
# "Stream" tab: a streaming audio output, Stream/Stop buttons side by side,
# and an accordion with STREAM_NOTE plus a duplicate-this-Space button.
# The component variables are module-level so the event wiring in the main
# `app` Blocks can refer to them.
with gr.Blocks() as stream_tab:
    out_stream = gr.Audio(label='Output Audio Stream', interactive=False, streaming=True, autoplay=True)
    with gr.Row():
        stream_btn = gr.Button('Stream', variant='primary')
        stop_btn = gr.Button('Stop', variant='stop')
    with gr.Accordion('Note', open=True):
        gr.Markdown(STREAM_NOTE)
        gr.DuplicateButton()
# Markdown banner rendered in the first row of the app.
BANNER_TEXT = '''
[***Kokoro*** **is an open-weight TTS model with 82 million parameters.**](https://huggingface.co/hexgrad/Kokoro-82M)
This demo uses native US English and British English speakers. But also supports multiple languages using G2P and phonemizer. Select your language below!
'''
# True everywhere except on the Space whose SPACE_ID is exactly
# 'hexgrad/Kokoro-TTS'. Used for queue(api_open=...), launch(show_api=...)
# and launch(mcp_server=...).
API_OPEN = os.getenv('SPACE_ID') != 'hexgrad/Kokoro-TTS'
# Passed as api_name= to every event listener below.
# NOTE(review): in Gradio, api_name=False presumably hides the endpoint and
# None keeps the default name - confirm against the Gradio version in use.
API_NAME = None if API_OPEN else False
# Language choices for the dropdown
# Each entry is [display label, language code]. The code is what the handlers
# receive as `lang`: 'en-us' skips text_to_ipa(), 'jb' takes its Lojban
# branch, and the remaining codes are keys of LANG_NAMES.
# List order is the order shown in the dropdown.
LANGUAGE_CHOICES = [
    ['English (US)', 'en-us'],
    ['Lojban', 'jb'],
    ['Czech (Non-native)', 'cs'],
    ['Danish (Non-native)', 'da'],
    ['Dutch (Non-native)', 'nl'],
    ['Estonian (Non-native)', 'et'],
    ['Finnish (Non-native)', 'fi'],
    ['French (Non-native)', 'fr'],
    ['German (Non-native)', 'de'],
    ['Greek (Non-native)', 'el'],
    ['Italian (Non-native)', 'it'],
    ['Norwegian (Non-native)', 'no'],
    ['Polish (Non-native)', 'pl'],
    ['Portuguese (Non-native)', 'pt'],
    ['Russian (Non-native)', 'ru'],
    ['Slovene (Non-native)', 'sl'],
    ['Spanish (Non-native)', 'es'],
    ['Swedish (Non-native)', 'sv'],
    ['Turkish (Non-native)', 'tr'],
]
# Main UI: banner row on top, then a two-column row - inputs on the left,
# the Generate/Stream tabs on the right - followed by the event wiring.
# NOTE(review): the nesting below was reconstructed from a source whose
# indentation was lost; in particular, whether `speed` sits inside the same
# Row as the language dropdown could not be determined - confirm.
with gr.Blocks() as app:
    with gr.Row():
        gr.Markdown(BANNER_TEXT, container=True)
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(label='Input Text', info=f"Up to ~500 characters per Generate, or {'∞' if CHAR_LIMIT is None else CHAR_LIMIT} characters per Stream. Supports IPA within (<text>)[<IPA>] parethesis and brackets.")
            with gr.Row():
                voice = gr.Dropdown(list(CHOICES.items()), value='af_heart', label='Voice', info='Quality and availability vary by language')
                # Defaults to GPU when CUDA is available; the dropdown is
                # read-only when it is not.
                use_gpu = gr.Dropdown(
                    [('ZeroGPU 🚀', True), ('CPU 🐌', False)],
                    value=CUDA_AVAILABLE,
                    label='Hardware',
                    info='GPU is usually faster, but has a usage quota',
                    interactive=CUDA_AVAILABLE
                )
            with gr.Row():
                lang = gr.Dropdown(
                    LANGUAGE_CHOICES,
                    value='en-us',
                    label="Language",
                    info="Select language for G2P processing"
                )
            speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='Speed')
            random_btn = gr.Button('🎲 Random Quote 💬', variant='secondary')
            with gr.Row():
                gatsby_btn = gr.Button('🥂 Gatsby 📕', variant='secondary')
                frankenstein_btn = gr.Button('💀 Frankenstein 📗', variant='secondary')
        with gr.Column():
            gr.TabbedInterface([generate_tab, stream_tab], ['Generate', 'Stream'])
    # Sample-text buttons overwrite the input textbox.
    random_btn.click(fn=get_random_quote, inputs=[], outputs=[text], api_name=API_NAME)
    gatsby_btn.click(fn=get_gatsby, inputs=[], outputs=[text], api_name=API_NAME)
    frankenstein_btn.click(fn=get_frankenstein, inputs=[], outputs=[text], api_name=API_NAME)
    # Generate fills both the audio player and the phoneme textbox;
    # Tokenize fills only the phoneme textbox.
    generate_btn.click(fn=generate_first, inputs=[text, voice, speed, use_gpu, lang], outputs=[out_audio, out_ps], api_name=API_NAME)
    tokenize_btn.click(fn=tokenize_first, inputs=[text, voice, lang], outputs=[out_ps], api_name=API_NAME)
    # The stream event is kept so the Stop button can cancel it.
    stream_event = stream_btn.click(fn=generate_all, inputs=[text, voice, speed, use_gpu, lang], outputs=[out_stream], api_name=API_NAME)
    stop_btn.click(fn=None, cancels=stream_event)
    # predict() is wired with text, voice and speed only - no language input.
    predict_btn.click(fn=predict, inputs=[text, voice, speed], outputs=[out_audio], api_name=API_NAME)
if __name__ == '__main__':
    # Enable the request queue first, then launch on whatever queue() returns
    # (the original chains the two calls). API docs and the MCP server follow
    # API_OPEN.
    queued_app = app.queue(api_open=API_OPEN)
    queued_app.launch(show_api=API_OPEN, ssr_mode=True, mcp_server=API_OPEN)