import spaces
from kokoro import KModel, KPipeline
import gradio as gr
import os
import random
import torch
import re
IS_DUPLICATE = not os.getenv('SPACE_ID', '').startswith('hexgrad/')
CUDA_AVAILABLE = torch.cuda.is_available()
if not IS_DUPLICATE:
    import kokoro
    import misaki
    print('DEBUG', kokoro.__version__, CUDA_AVAILABLE, misaki.__version__)
try:
    import phonemizer
    global_phonemizer_en = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
    PHONEMIZER_AVAILABLE = True
except ImportError:
    PHONEMIZER_AVAILABLE = False
    global_phonemizer_en = None
try:
    from pygoruut.pygoruut import Pygoruut, PygoruutLanguages
    pygoruut = Pygoruut()
    goruut_langs = PygoruutLanguages()
    PYGORUUT_AVAILABLE = True
except ImportError:
    PYGORUUT_AVAILABLE = False
# TODO: re-enable pygoruut; force-disabled for now
PYGORUUT_AVAILABLE = False
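# Demo limits and model setup: CHAR_LIMIT caps hosted inputs, one KModel per device
# (CPU always, GPU when available), and one KPipeline per lang code ('a' American, 'b' British).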
CHAR_LIMIT = None if IS_DUPLICATE else 5000
models = {gpu: KModel().to('cuda' if gpu else 'cpu').eval() for gpu in [False] + ([True] if CUDA_AVAILABLE else [])}
pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in 'ab'}
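# Pin the pronunciation of 'kokoro' in both the American ('a') and British ('b') lexicons.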
pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
def text_to_ipa(text, lang='en-us'):
"""Convert text to IPA using phonemizer or return original text"""
if not PHONEMIZER_AVAILABLE:
return text
try:
# Handle IPA sections within brackets
regex = r"\([^\]]*\)[[^\]]*\]"
ipa_sections = re.findall(regex, text)
print(text)
text = re.sub(regex, '()[]', text)
print(text)
if lang == 'jb':
# Lojban language
import lojban
ps = f'[{text}](/'+ lojban.lojban2ipa(text, 'vits') +'/)'
elif lang in LANG_NAMES:
local_phonemizer = phonemizer.backend.EspeakBackend(language=lang, preserve_punctuation=True, with_stress=True)
ps = local_phonemizer.phonemize([text])
ps = f'[{text}](/'+ ps[0] +'/)'
else:
ps = text
print(ps)
# Add back IPA sections
for ipa in ipa_sections:
ps = ps.replace('( )[ ]', ipa, 1)
print(ps)
return ps
except Exception as e:
print(f"Phonemizer error: {e}")
return text
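# ZeroGPU-decorated forward pass; generate_first/generate_all fall back to the CPU model
# when this raises a gr.Error (e.g. the GPU quota is exhausted).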
@spaces.GPU(duration=30)
def forward_gpu(ps, ref_s, speed):
    return models[True](ps, ref_s, speed)
def generate_first(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE, lang='en-us'):
    text = text if CHAR_LIMIT is None else text.strip()[:CHAR_LIMIT]
    # Convert text to IPA if not English
    if lang != 'en-us':
        if PYGORUUT_AVAILABLE:
            text = goruut_phonemize(text, lang, False, False)
        else:
            text = text_to_ipa(text, lang)
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    use_gpu = use_gpu and CUDA_AVAILABLE
    for _, ps, _ in pipeline(text, voice, speed):
        ref_s = pack[len(ps)-1]
        try:
            if use_gpu:
                audio = forward_gpu(ps, ref_s, speed)
            else:
                audio = models[False](ps, ref_s, speed)
        except gr.exceptions.Error as e:
            if use_gpu:
                gr.Warning(str(e))
                gr.Info('Retrying with CPU. To avoid this error, change Hardware to CPU.')
                audio = models[False](ps, ref_s, speed)
            else:
                raise gr.Error(e)
        return (24000, audio.numpy()), ps
    return None, ''
# Arena API
def predict(text, voice='af_heart', speed=1, lang='en-us'):
    """Convert text into speech using the Kokoro American English and British English voice models.
    Args:
        text: string; accepts IPA within ()[] brackets
        voice: Literal['af_heart', 'af_bella', 'af_nicole', 'af_aoede', 'af_kore', 'af_sarah', 'af_nova', 'af_sky', 'af_alloy', 'af_jessica', 'af_river', 'am_michael', 'am_fenrir', 'am_puck', 'am_echo', 'am_eric', 'am_liam', 'am_onyx', 'am_santa', 'am_adam', 'bf_emma', 'bf_isabella', 'bf_alice', 'bf_lily', 'bm_george', 'bm_fable', 'bm_lewis', 'bm_daniel']; voice model
        lang: Literal['en-us', 'cs', 'da', 'nl', 'et', 'fi', 'fr', 'de', 'el', 'it', 'no', 'pl', 'pt', 'ru', 'sl', 'es', 'sv', 'tr', 'jb']; ISO 639-1 code for the text language; 'jb' is a valid code for Lojban
        speed: talkback speed; 0.5-2
    Returns: the synthesized output audio (served as an audio file over the API)
    """
    return generate_first(text, voice, speed, use_gpu=False, lang=lang)[0]
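# Tokenize-only helper: returns the phoneme string for the first chunk without synthesizing audio.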
def tokenize_first(text, voice='af_heart', lang='en-us'):
    text = text if CHAR_LIMIT is None else text.strip()[:CHAR_LIMIT]
    # Convert text to IPA if not English
    if lang != 'en-us':
        text = text_to_ipa(text, lang)
    pipeline = pipelines[voice[0]]
    for _, ps, _ in pipeline(text, voice):
        return ps
    return ''
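# Streaming generator: yields (sample_rate, audio) chunks; the near-silent chunk yielded after
# the first segment appears to work around the Gradio streaming issue noted in STREAM_NOTE.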
def generate_all(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE, lang='en-us'):
    text = text if CHAR_LIMIT is None else text.strip()[:CHAR_LIMIT]
    # Convert text to IPA if not English
    if lang != 'en-us':
        text = text_to_ipa(text, lang)
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    use_gpu = use_gpu and CUDA_AVAILABLE
    first = True
    for _, ps, _ in pipeline(text, voice, speed):
        ref_s = pack[len(ps)-1]
        try:
            if use_gpu:
                audio = forward_gpu(ps, ref_s, speed)
            else:
                audio = models[False](ps, ref_s, speed)
        except gr.exceptions.Error as e:
            if use_gpu:
                gr.Warning(str(e))
                gr.Info('Switching to CPU')
                audio = models[False](ps, ref_s, speed)
            else:
                raise gr.Error(e)
        yield 24000, audio.numpy()
        if first:
            first = False
            yield 24000, torch.zeros(1).numpy()
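# Sample inputs for the demo buttons: one quote per line in en.txt plus ~5k-character book excerpts.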
with open('en.txt', 'r') as r:
    random_quotes = [line.strip() for line in r]
def get_random_quote():
    return random.choice(random_quotes)
def get_gatsby():
    with open('gatsby5k.md', 'r') as r:
        return r.read().strip()
def get_frankenstein():
    with open('frankenstein5k.md', 'r') as r:
        return r.read().strip()
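# Helper for a searchable language dropdown: filters the goruut language list by the last comma-separated term.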
def filter_languages(search_text, selected_languages):
    all_languages = goruut_langs.get_all_supported_languages()
    # Extract the last entry from the comma-separated search input
    search_terms = search_text.replace(",,", ",").split(",") if search_text else []
    last_term = search_terms[-1] if search_terms else ""
    # Filter available languages
    filtered = [lang for lang in all_languages if last_term == "" or (last_term.lower() in lang.lower())]
    # If no results, show a message instead
    if not filtered:
        filtered = ["No match found..."]
    else:
        filtered = [filtered[0] + "..."] + filtered
    return gr.update(choices=filtered), filtered[0]  # Keep dropdown open and selectable
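# pygoruut phonemization path, used by generate_first when PYGORUUT_AVAILABLE is True.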
def dephon_offline(txt, language_tag, is_reverse, is_punct):
    try:
        response = pygoruut.phonemize(language=language_tag, sentence=txt, is_reverse=is_reverse)
    except TypeError:
        return ''
    if not response or not response.Words:
        return ''
    if is_punct:
        phonetic_line = str(response)
    else:
        phonetic_line = " ".join(word.Phonetic for word in response.Words)
    return phonetic_line
def goruut_phonemize(sentence, language, is_reverse, is_punct):
    return dephon_offline(sentence, language.strip(","), is_reverse, is_punct)
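# Voice catalog: display label -> Kokoro voice id. The first letter selects the pipeline
# ('a' American, 'b' British); 'f'/'m' marks female/male voices.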
CHOICES = {
    '🇺🇸 🚺 Heart ❤️': 'af_heart',
    '🇺🇸 🚺 Bella 🔥': 'af_bella',
    '🇺🇸 🚺 Nicole 🎧': 'af_nicole',
    '🇺🇸 🚺 Aoede': 'af_aoede',
    '🇺🇸 🚺 Kore': 'af_kore',
    '🇺🇸 🚺 Sarah': 'af_sarah',
    '🇺🇸 🚺 Nova': 'af_nova',
    '🇺🇸 🚺 Sky': 'af_sky',
    '🇺🇸 🚺 Alloy': 'af_alloy',
    '🇺🇸 🚺 Jessica': 'af_jessica',
    '🇺🇸 🚺 River': 'af_river',
    '🇺🇸 🚹 Michael': 'am_michael',
    '🇺🇸 🚹 Fenrir': 'am_fenrir',
    '🇺🇸 🚹 Puck': 'am_puck',
    '🇺🇸 🚹 Echo': 'am_echo',
    '🇺🇸 🚹 Eric': 'am_eric',
    '🇺🇸 🚹 Liam': 'am_liam',
    '🇺🇸 🚹 Onyx': 'am_onyx',
    '🇺🇸 🚹 Santa': 'am_santa',
    '🇺🇸 🚹 Adam': 'am_adam',
    '🇬🇧 🚺 Emma': 'bf_emma',
    '🇬🇧 🚺 Isabella': 'bf_isabella',
    '🇬🇧 🚺 Alice': 'bf_alice',
    '🇬🇧 🚺 Lily': 'bf_lily',
    '🇬🇧 🚹 George': 'bm_george',
    '🇬🇧 🚹 Fable': 'bm_fable',
    '🇬🇧 🚹 Lewis': 'bm_lewis',
    '🇬🇧 🚹 Daniel': 'bm_daniel',
}
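# Pre-load every voice so the first synthesis request does not pay the voice-loading cost.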
for v in CHOICES.values():
    pipelines[v[0]].load_voice(v)
TOKEN_NOTE = '''
💡 Customize pronunciation with Markdown link syntax and /slashes/ like `[Kokoro](/kˈOkəɹO/)`
💬 To adjust intonation, try punctuation `;:,.!?—…"()“”` or stress `ˈ` and `ˌ`
⬇️ Lower stress `[1 level](-1)` or `[2 levels](-2)`
⬆️ Raise stress 1 level `[or](+2)` 2 levels (only works on less stressed, usually short words)
'''
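# 'Generate' tab: one-shot synthesis with an accordion showing the output tokens.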
with gr.Blocks() as generate_tab:
    out_audio = gr.Audio(label='Output Audio', interactive=False, streaming=False, autoplay=True)
    generate_btn = gr.Button('Generate', variant='primary')
    with gr.Accordion('Output Tokens', open=True):
        out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 context length.')
        tokenize_btn = gr.Button('Tokenize', variant='secondary')
        gr.Markdown(TOKEN_NOTE)
    predict_btn = gr.Button('Predict', variant='secondary', visible=False)
STREAM_NOTE = ['⚠️ There is an unknown Gradio bug that might yield no audio the first time you click `Stream`.']
if CHAR_LIMIT is not None:
    STREAM_NOTE.append(f'✂️ Each stream is capped at {CHAR_LIMIT} characters.')
    STREAM_NOTE.append('🚀 Want more characters? You can [use Kokoro directly](https://huggingface.co/hexgrad/Kokoro-82M#usage) or duplicate this space:')
STREAM_NOTE = '\n\n'.join(STREAM_NOTE)
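# 'Stream' tab: chunked audio playback from generate_all; Stop cancels the stream event.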
with gr.Blocks() as stream_tab:
    out_stream = gr.Audio(label='Output Audio Stream', interactive=False, streaming=True, autoplay=True)
    with gr.Row():
        stream_btn = gr.Button('Stream', variant='primary')
        stop_btn = gr.Button('Stop', variant='stop')
    with gr.Accordion('Note', open=True):
        gr.Markdown(STREAM_NOTE)
        gr.DuplicateButton()
BANNER_TEXT = '''
[***Kokoro*** **is an open-weight TTS model with 82 million parameters.**](https://huggingface.co/hexgrad/Kokoro-82M)
This demo uses native US English and British English voices, but it also supports many other languages through G2P and phonemizer. Select your language below!
'''
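# Open the API only outside the official hexgrad/Kokoro-TTS Space; api_name=False keeps handlers out of the generated API docs.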
API_OPEN = os.getenv('SPACE_ID') != 'hexgrad/Kokoro-TTS'
API_NAME = None if API_OPEN else False
# Language choices for the dropdown
LANGUAGE_CHOICES = [
    ['English (US)', 'en-us'],
    ['Lojban', 'jb'],
    ['Czech (Non-native)', 'cs'],
    ['Danish (Non-native)', 'da'],
    ['Dutch (Non-native)', 'nl'],
    ['Estonian (Non-native)', 'et'],
    ['Finnish (Non-native)', 'fi'],
    ['French (Non-native)', 'fr'],
    ['German (Non-native)', 'de'],
    ['Greek (Non-native)', 'el'],
    ['Italian (Non-native)', 'it'],
    ['Norwegian (Non-native)', 'no'],
    ['Polish (Non-native)', 'pl'],
    ['Portuguese (Non-native)', 'pt'],
    ['Russian (Non-native)', 'ru'],
    ['Slovene (Non-native)', 'sl'],
    ['Spanish (Non-native)', 'es'],
    ['Swedish (Non-native)', 'sv'],
    ['Turkish (Non-native)', 'tr'],
]
# Language codes accepted by text_to_ipa (evaluated before the Gradio handlers ever run)
LANG_NAMES = {code for _, code in LANGUAGE_CHOICES}
with gr.Blocks() as app:
    with gr.Row():
        gr.Markdown(BANNER_TEXT, container=True)
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(label='Input Text', info=f"Up to ~500 characters per Generate, or {'∞' if CHAR_LIMIT is None else CHAR_LIMIT} characters per Stream. Supports IPA within (<text>)[<IPA>] parentheses and brackets.")
            with gr.Row():
                voice = gr.Dropdown(list(CHOICES.items()), value='af_heart', label='Voice', info='Quality and availability vary by language')
                use_gpu = gr.Dropdown(
                    [('ZeroGPU 🚀', True), ('CPU 🐌', False)],
                    value=CUDA_AVAILABLE,
                    label='Hardware',
                    info='GPU is usually faster, but has a usage quota',
                    interactive=CUDA_AVAILABLE
                )
            with gr.Row():
                if PYGORUUT_AVAILABLE:
                    # Goruut language list
                    lang = gr.Dropdown(
                        label="Available Languages",
                        choices=goruut_langs.get_all_supported_languages(),
                        interactive=True,
                        allow_custom_value=False
                    )
                else:
                    # G2P via phonemizer
                    lang = gr.Dropdown(
                        LANGUAGE_CHOICES,
                        value='en-us',
                        label="Language",
                        info="Select language for G2P processing"
                    )
            speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='Speed')
            random_btn = gr.Button('🎲 Random Quote 💬', variant='secondary')
            with gr.Row():
                gatsby_btn = gr.Button('🥂 Gatsby 📕', variant='secondary')
                frankenstein_btn = gr.Button('💀 Frankenstein 📗', variant='secondary')
        with gr.Column():
            gr.TabbedInterface([generate_tab, stream_tab], ['Generate', 'Stream'])
    random_btn.click(fn=get_random_quote, inputs=[], outputs=[text], api_name=API_NAME)
    gatsby_btn.click(fn=get_gatsby, inputs=[], outputs=[text], api_name=API_NAME)
    frankenstein_btn.click(fn=get_frankenstein, inputs=[], outputs=[text], api_name=API_NAME)
    generate_btn.click(fn=generate_first, inputs=[text, voice, speed, use_gpu, lang], outputs=[out_audio, out_ps], api_name=API_NAME)
    tokenize_btn.click(fn=tokenize_first, inputs=[text, voice, lang], outputs=[out_ps], api_name=API_NAME)
    stream_event = stream_btn.click(fn=generate_all, inputs=[text, voice, speed, use_gpu, lang], outputs=[out_stream], api_name=API_NAME)
    stop_btn.click(fn=None, cancels=stream_event)
    predict_btn.click(fn=predict, inputs=[text, voice, speed], outputs=[out_audio], api_name=API_NAME)
if __name__ == '__main__':
    app.queue(api_open=API_OPEN).launch(show_api=API_OPEN, ssr_mode=True, mcp_server=API_OPEN)