Spaces:
Running
feat(tts): add text-to-speech UI and backend
Browse files- [feat] Implement text-to-speech generation with proxy, timeout, and error handling (tts_handler.py:1-155:generate_text_to_speech(), handle_text_to_speech_generation())
- [feat] Add `create_tts_tab()` for "Text-to-Speech" Gradio tab components (ui_components.py:410-509)
- [feat] Add `create_tts_presets()` and `create_tts_examples()` for TTS UI population (ui_components.py:540-550, 553-560)
- [update] Modify header and footer Markdown to describe TTS feature (ui_components.py:590-591, 624-632)
- [feat] Import TTS-related constants and data (ui_components.py:9-11)
- [feat] Add default TTS model, provider, voice, and example text constants (utils.py:15-16, 68-71, 74-99, 102-110)
- [feat] Import TTS handler and UI tab creation functions (app.py:6, 9)
- [feat] Integrate TTS tab into the main application (app.py:39:create_app())
- app.py +5 -0
- tts_handler.py +155 -0
- ui_components.py +139 -2
- utils.py +52 -0
|
@@ -6,11 +6,13 @@ A comprehensive AI platform with chat and image generation capabilities.
|
|
| 6 |
import gradio as gr
|
| 7 |
from chat_handler import handle_chat_submit, handle_chat_retry
|
| 8 |
from image_handler import handle_image_generation, handle_image_to_image_generation
|
|
|
|
| 9 |
from ui_components import (
|
| 10 |
create_main_header,
|
| 11 |
create_chat_tab,
|
| 12 |
create_image_tab,
|
| 13 |
create_image_to_image_tab,
|
|
|
|
| 14 |
create_footer
|
| 15 |
)
|
| 16 |
from utils import get_gradio_theme
|
|
@@ -35,6 +37,9 @@ def create_app():
|
|
| 35 |
|
| 36 |
# Image-to-image tab
|
| 37 |
create_image_to_image_tab(handle_image_to_image_generation)
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
# Footer with helpful information
|
| 40 |
create_footer()
|
|
|
|
| 6 |
import gradio as gr
|
| 7 |
from chat_handler import handle_chat_submit, handle_chat_retry
|
| 8 |
from image_handler import handle_image_generation, handle_image_to_image_generation
|
| 9 |
+
from tts_handler import handle_text_to_speech_generation
|
| 10 |
from ui_components import (
|
| 11 |
create_main_header,
|
| 12 |
create_chat_tab,
|
| 13 |
create_image_tab,
|
| 14 |
create_image_to_image_tab,
|
| 15 |
+
create_tts_tab,
|
| 16 |
create_footer
|
| 17 |
)
|
| 18 |
from utils import get_gradio_theme
|
|
|
|
| 37 |
|
| 38 |
# Image-to-image tab
|
| 39 |
create_image_to_image_tab(handle_image_to_image_generation)
|
| 40 |
+
|
| 41 |
+
# Text-to-speech tab
|
| 42 |
+
create_tts_tab(handle_text_to_speech_generation)
|
| 43 |
|
| 44 |
# Footer with helpful information
|
| 45 |
create_footer()
|
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Text-to-speech functionality handler for HF-Inferoxy AI Hub.
|
| 3 |
+
Handles text-to-speech generation with multiple providers.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import time
|
| 8 |
+
import threading
|
| 9 |
+
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
|
| 10 |
+
from huggingface_hub import InferenceClient
|
| 11 |
+
from huggingface_hub.errors import HfHubHTTPError
|
| 12 |
+
from requests.exceptions import ConnectionError, Timeout, RequestException
|
| 13 |
+
from hf_token_utils import get_proxy_token, report_token_status
|
| 14 |
+
from utils import (
|
| 15 |
+
IMAGE_CONFIG,
|
| 16 |
+
validate_proxy_key,
|
| 17 |
+
format_error_message,
|
| 18 |
+
format_success_message
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
# Timeout configuration for TTS generation
|
| 22 |
+
TTS_GENERATION_TIMEOUT = 300 # 5 minutes max for TTS generation
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def generate_text_to_speech(
    text: str,
    model_name: str,
    provider: str,
    voice: str = "am_eric",
    speed: float = 1.0,
):
    """
    Generate speech from text using the specified model and provider through HF-Inferoxy.

    Args:
        text: Text to synthesize.
        model_name: Model repo id, e.g. "hexgrad/Kokoro-82M".
        provider: Inference provider name (e.g. "fal-ai", "replicate").
        voice: Voice identifier forwarded to the provider via extra_body.
        speed: Speed multiplier forwarded to the provider via extra_body.

    Returns:
        Tuple of (audio, status_message). ``audio`` is whatever
        ``InferenceClient.text_to_speech`` returns (None on failure);
        ``status_message`` is a user-facing success or error string.
    """
    # Validate proxy API key before doing any network work.
    is_valid, error_msg = validate_proxy_key()
    if not is_valid:
        return None, error_msg

    proxy_api_key = os.getenv("PROXY_KEY")

    token_id = None
    try:
        # Get token from HF-Inferoxy proxy server.
        print("π TTS: Requesting token from proxy...")
        token, token_id = get_proxy_token(api_key=proxy_api_key)
        print(f"β TTS: Got token: {token_id}")

        print(f"π€ TTS: Using model='{model_name}', provider='{provider}', voice='{voice}'")

        # Create client with the specified provider.
        client = InferenceClient(
            provider=provider,
            api_key=token
        )

        print("π TTS: Client created, preparing generation params...")

        # Voice and speed are provider-specific options, so they travel
        # in extra_body rather than as top-level arguments.
        generation_params = {
            "text": text,
            "model": model_name,
            "extra_body": {
                "voice": voice,
                "speed": speed
            }
        }

        print(f"π‘ TTS: Making generation request with {TTS_GENERATION_TIMEOUT}s timeout...")

        def generate_audio_task():
            return client.text_to_speech(**generation_params)

        # Execute with a timeout. NOTE: deliberately NOT using
        # `with ThreadPoolExecutor(...)` here -- the context manager's exit
        # calls shutdown(wait=True), which would block until the hung request
        # finished and defeat the timeout. Shut down without waiting instead.
        executor = ThreadPoolExecutor(max_workers=1)
        try:
            future = executor.submit(generate_audio_task)
            try:
                audio = future.result(timeout=TTS_GENERATION_TIMEOUT)
            except FutureTimeoutError:
                # A running request cannot actually be interrupted; cancel()
                # only helps if the task has not started yet.
                future.cancel()
                raise TimeoutError(f"TTS generation timed out after {TTS_GENERATION_TIMEOUT} seconds")
        finally:
            executor.shutdown(wait=False)

        print(f"π΅ TTS: Generation completed! Audio type: {type(audio)}")

        # Report successful token usage back to the proxy.
        if token_id:
            report_token_status(token_id, "success", api_key=proxy_api_key)

        return audio, format_success_message("Speech generated", f"using {model_name} on {provider} with voice {voice}")

    except ConnectionError as e:
        # Proxy server unreachable.
        error_msg = f"Cannot connect to HF-Inferoxy server: {str(e)}"
        print(f"π TTS connection error: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
        return None, format_error_message("Connection Error", "Unable to connect to the proxy server. Please check if it's running.")

    except TimeoutError as e:
        # Generation exceeded TTS_GENERATION_TIMEOUT.
        error_msg = f"TTS generation timed out: {str(e)}"
        print(f"β° TTS timeout: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
        return None, format_error_message("Timeout Error", f"TTS generation took too long (>{TTS_GENERATION_TIMEOUT//60} minutes). Try shorter text.")

    except HfHubHTTPError as e:
        # HuggingFace API errors -- map common HTTP codes to friendlier text.
        error_msg = str(e)
        print(f"π€ TTS HF error: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)

        if "401" in error_msg:
            return None, format_error_message("Authentication Error", "Invalid or expired API token. The proxy will provide a new token on retry.")
        elif "402" in error_msg:
            return None, format_error_message("Quota Exceeded", "API quota exceeded. The proxy will try alternative providers.")
        elif "429" in error_msg:
            return None, format_error_message("Rate Limited", "Too many requests. Please wait a moment and try again.")
        else:
            return None, format_error_message("HuggingFace API Error", error_msg)

    except Exception as e:
        # Catch-all boundary: report to the proxy and surface a generic error.
        error_msg = str(e)
        print(f"β TTS unexpected error: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
        return None, format_error_message("Unexpected Error", f"An unexpected error occurred: {error_msg}")
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def handle_text_to_speech_generation(text_val, model_val, provider_val, voice_val, speed_val):
    """
    Handle a text-to-speech generation request from the UI.

    Validates the input text, then delegates to generate_text_to_speech().

    Args:
        text_val: Raw text from the textbox (may be None or all whitespace).
        model_val: Model repo id.
        provider_val: Inference provider name.
        voice_val: Voice identifier.
        speed_val: Speed multiplier.

    Returns:
        Tuple of (audio, status_message); audio is None on validation failure.
    """
    # Normalize once so the emptiness and length checks apply to the exact
    # text that will be sent to the model.
    text = (text_val or "").strip()

    if not text:
        return None, format_error_message("Validation Error", "Please enter some text to convert to speech")

    # Limit text length to prevent generation timeouts.
    if len(text) > 5000:
        return None, format_error_message("Validation Error", "Text is too long. Please keep it under 5000 characters.")

    # Generate speech
    return generate_text_to_speech(
        text=text,
        model_name=model_val,
        provider=provider_val,
        voice=voice_val,
        speed=speed_val
    )
|
|
@@ -7,8 +7,10 @@ import gradio as gr
|
|
| 7 |
from utils import (
|
| 8 |
DEFAULT_CHAT_MODEL, DEFAULT_IMAGE_MODEL, DEFAULT_IMAGE_PROVIDER,
|
| 9 |
DEFAULT_IMAGE_TO_IMAGE_MODEL, DEFAULT_IMAGE_TO_IMAGE_PROVIDER,
|
|
|
|
| 10 |
CHAT_CONFIG, IMAGE_CONFIG, IMAGE_PROVIDERS, IMAGE_MODEL_PRESETS,
|
| 11 |
-
IMAGE_TO_IMAGE_MODEL_PRESETS,
|
|
|
|
| 12 |
)
|
| 13 |
|
| 14 |
|
|
@@ -408,6 +410,110 @@ def create_image_to_image_tab(handle_image_to_image_generation_fn):
|
|
| 408 |
gen_event.then(lambda: gr.update(visible=False), None, [stop_generate_btn], queue=False)
|
| 409 |
|
| 410 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 411 |
def create_image_to_image_presets(img2img_model_name, img2img_provider):
|
| 412 |
"""Create quick model presets for image-to-image generation."""
|
| 413 |
with gr.Group():
|
|
@@ -431,6 +537,29 @@ def create_image_to_image_examples(img2img_prompt):
|
|
| 431 |
)
|
| 432 |
|
| 433 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 434 |
def create_image_presets(img_model_name, img_provider):
|
| 435 |
"""Create quick model presets for image generation."""
|
| 436 |
with gr.Group():
|
|
@@ -459,12 +588,13 @@ def create_main_header():
|
|
| 459 |
gr.Markdown("""
|
| 460 |
# π HF-Inferoxy AI Hub
|
| 461 |
|
| 462 |
-
A comprehensive AI platform combining chat
|
| 463 |
|
| 464 |
**Features:**
|
| 465 |
- π¬ **Smart Chat**: Conversational AI with streaming responses
|
| 466 |
- π¨ **Image Generation**: Text-to-image creation with multiple providers
|
| 467 |
- πΌοΈ **Image-to-Image**: Transform and modify existing images with AI
|
|
|
|
| 468 |
- π **Intelligent Token Management**: Automatic token rotation and error handling
|
| 469 |
- π **Multi-Provider Support**: Works with HF Inference, Cerebras, Cohere, Groq, Together, Fal.ai, and more
|
| 470 |
""")
|
|
@@ -494,6 +624,13 @@ def create_footer():
|
|
| 494 |
- Perfect for style transfers, object additions, and image transformations
|
| 495 |
- Works great with models like Qwen Image Edit and FLUX.1 Kontext
|
| 496 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 497 |
**Supported Providers:**
|
| 498 |
- **fal-ai**: High-quality image generation (default for images)
|
| 499 |
- **hf-inference**: Core API with comprehensive model support
|
|
|
|
| 7 |
from utils import (
|
| 8 |
DEFAULT_CHAT_MODEL, DEFAULT_IMAGE_MODEL, DEFAULT_IMAGE_PROVIDER,
|
| 9 |
DEFAULT_IMAGE_TO_IMAGE_MODEL, DEFAULT_IMAGE_TO_IMAGE_PROVIDER,
|
| 10 |
+
DEFAULT_TTS_MODEL, DEFAULT_TTS_PROVIDER,
|
| 11 |
CHAT_CONFIG, IMAGE_CONFIG, IMAGE_PROVIDERS, IMAGE_MODEL_PRESETS,
|
| 12 |
+
IMAGE_TO_IMAGE_MODEL_PRESETS, TTS_MODEL_PRESETS, TTS_VOICES,
|
| 13 |
+
IMAGE_EXAMPLE_PROMPTS, IMAGE_TO_IMAGE_EXAMPLE_PROMPTS, TTS_EXAMPLE_TEXTS
|
| 14 |
)
|
| 15 |
|
| 16 |
|
|
|
|
| 410 |
gen_event.then(lambda: gr.update(visible=False), None, [stop_generate_btn], queue=False)
|
| 411 |
|
| 412 |
|
| 413 |
+
def create_tts_tab(handle_tts_generation_fn):
    """Build the Text-to-Speech tab and wire up its generation events.

    Args:
        handle_tts_generation_fn: Callback receiving
            (text, model_name, provider, voice, speed) and returning
            (audio, status_message).
    """
    with gr.Tab("π€ Text-to-Speech", id="tts"):
        with gr.Row():
            with gr.Column(scale=2):
                # Text input, seeded with the first example text.
                text_input = gr.Textbox(
                    value=TTS_EXAMPLE_TEXTS[0],
                    label="Text to Convert",
                    lines=6,
                    placeholder="Enter the text you want to convert to speech..."
                )

                # Audio output plus a small status readout.
                audio_output = gr.Audio(
                    label="Generated Audio",
                    type="numpy",
                    interactive=False,
                    autoplay=True
                )
                status_output = gr.Textbox(
                    label="Generation Status",
                    interactive=False,
                    lines=2
                )

            with gr.Column(scale=1):
                # Model and provider selection.
                with gr.Group():
                    gr.Markdown("**π€ Model & Provider**")
                    model_input = gr.Textbox(
                        value=DEFAULT_TTS_MODEL,
                        label="Model Name",
                        placeholder="e.g., hexgrad/Kokoro-82M"
                    )
                    provider_input = gr.Dropdown(
                        choices=IMAGE_PROVIDERS,
                        value=DEFAULT_TTS_PROVIDER,
                        label="Provider",
                        interactive=True
                    )

                # Voice and speed settings.
                with gr.Group():
                    gr.Markdown("**π€ Voice Settings**")
                    voice_input = gr.Dropdown(
                        choices=list(TTS_VOICES.items()),
                        value="am_eric",
                        label="Voice",
                        info="Choose from various English voices"
                    )
                    speed_input = gr.Slider(
                        minimum=0.5, maximum=2.0, value=1.0, step=0.1,
                        label="Speed", info="0.5 = slow, 2.0 = fast"
                    )

                # Generate / Stop button pair (Stop starts hidden).
                with gr.Row():
                    generate_button = gr.Button(
                        "π€ Generate Speech",
                        variant="primary",
                        size="lg",
                        scale=2
                    )
                    stop_button = gr.Button("βΉ Stop", variant="secondary", visible=False)

                # Quick model presets and example texts.
                create_tts_presets(model_input, provider_input)
                create_tts_examples(text_input)

        # Reveal the Stop button as soon as generation starts.
        generate_button.click(
            fn=lambda: gr.update(visible=True),
            inputs=None,
            outputs=[stop_button],
            queue=False
        )

        generation = generate_button.click(
            fn=handle_tts_generation_fn,
            inputs=[
                text_input, model_input, provider_input, voice_input, speed_input
            ],
            outputs=[audio_output, status_output]
        )

        # Cancel a running generation and hide the Stop button.
        stop_button.click(
            fn=lambda: gr.update(visible=False),
            inputs=None,
            outputs=[stop_button],
            cancels=[generation],
            queue=False
        )

        # Hide the Stop button once generation completes normally.
        generation.then(lambda: gr.update(visible=False), None, [stop_button], queue=False)
|
| 515 |
+
|
| 516 |
+
|
| 517 |
def create_image_to_image_presets(img2img_model_name, img2img_provider):
|
| 518 |
"""Create quick model presets for image-to-image generation."""
|
| 519 |
with gr.Group():
|
|
|
|
| 537 |
)
|
| 538 |
|
| 539 |
|
| 540 |
+
def create_tts_presets(tts_model_name, tts_provider):
    """Render one-click preset buttons that fill in a TTS model/provider pair."""
    with gr.Group():
        gr.Markdown("**π― Popular Presets**")

        for label, preset_model, preset_provider in TTS_MODEL_PRESETS:
            preset_button = gr.Button(label, size="sm")
            # Bind the preset values as lambda defaults so each button keeps
            # its own model/provider (avoids the late-binding closure pitfall).
            preset_button.click(
                lambda m=preset_model, p=preset_provider: (m, p),
                outputs=[tts_model_name, tts_provider]
            )
|
| 551 |
+
|
| 552 |
+
|
| 553 |
+
def create_tts_examples(tts_text):
    """Render clickable example texts that populate the TTS text box."""
    with gr.Group():
        gr.Markdown("**π Example Texts**")
        gr.Examples(
            examples=[[sample] for sample in TTS_EXAMPLE_TEXTS],
            inputs=tts_text
        )
|
| 561 |
+
|
| 562 |
+
|
| 563 |
def create_image_presets(img_model_name, img_provider):
|
| 564 |
"""Create quick model presets for image generation."""
|
| 565 |
with gr.Group():
|
|
|
|
| 588 |
gr.Markdown("""
|
| 589 |
# π HF-Inferoxy AI Hub
|
| 590 |
|
| 591 |
+
A comprehensive AI platform combining chat, image generation, and text-to-speech capabilities with intelligent token management through HF-Inferoxy.
|
| 592 |
|
| 593 |
**Features:**
|
| 594 |
- π¬ **Smart Chat**: Conversational AI with streaming responses
|
| 595 |
- π¨ **Image Generation**: Text-to-image creation with multiple providers
|
| 596 |
- πΌοΈ **Image-to-Image**: Transform and modify existing images with AI
|
| 597 |
+
- π€ **Text-to-Speech**: Convert text to natural-sounding speech with Kokoro
|
| 598 |
- π **Intelligent Token Management**: Automatic token rotation and error handling
|
| 599 |
- π **Multi-Provider Support**: Works with HF Inference, Cerebras, Cohere, Groq, Together, Fal.ai, and more
|
| 600 |
""")
|
|
|
|
| 624 |
- Perfect for style transfers, object additions, and image transformations
|
| 625 |
- Works great with models like Qwen Image Edit and FLUX.1 Kontext
|
| 626 |
|
| 627 |
+
**Text-to-Speech Tab:**
|
| 628 |
+
- Enter text you want to convert to speech
|
| 629 |
+
- Choose from various English voices (US and UK accents)
|
| 630 |
+
- Adjust speed from 0.5x to 2.0x
|
| 631 |
+
- Powered by Kokoro TTS model for natural-sounding speech
|
| 632 |
+
- Supports both fal-ai and replicate providers
|
| 633 |
+
|
| 634 |
**Supported Providers:**
|
| 635 |
- **fal-ai**: High-quality image generation (default for images)
|
| 636 |
- **hf-inference**: Core API with comprehensive model support
|
|
@@ -12,6 +12,8 @@ DEFAULT_IMAGE_MODEL = "Qwen/Qwen-Image"
|
|
| 12 |
DEFAULT_IMAGE_PROVIDER = "fal-ai"
|
| 13 |
DEFAULT_IMAGE_TO_IMAGE_MODEL = "Qwen/Qwen-Image-Edit"
|
| 14 |
DEFAULT_IMAGE_TO_IMAGE_PROVIDER = "fal-ai"
|
|
|
|
|
|
|
| 15 |
|
| 16 |
# Chat configuration
|
| 17 |
CHAT_CONFIG = {
|
|
@@ -66,6 +68,44 @@ IMAGE_TO_IMAGE_MODEL_PRESETS = [
|
|
| 66 |
("SDXL (HF)", "stabilityai/stable-diffusion-xl-base-1.0", "hf-inference"),
|
| 67 |
]
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
# Example prompts for image generation
|
| 70 |
IMAGE_EXAMPLE_PROMPTS = [
|
| 71 |
"A majestic dragon flying over a medieval castle, epic fantasy art, detailed, 8k",
|
|
@@ -90,6 +130,18 @@ IMAGE_TO_IMAGE_EXAMPLE_PROMPTS = [
|
|
| 90 |
"Add a magical portal in the background with sparkles"
|
| 91 |
]
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
def get_proxy_key():
|
| 95 |
"""Get the proxy API key from environment variables."""
|
|
|
|
| 12 |
DEFAULT_IMAGE_PROVIDER = "fal-ai"
|
| 13 |
DEFAULT_IMAGE_TO_IMAGE_MODEL = "Qwen/Qwen-Image-Edit"
|
| 14 |
DEFAULT_IMAGE_TO_IMAGE_PROVIDER = "fal-ai"
|
| 15 |
+
# Text-to-speech defaults: Kokoro TTS served through the fal-ai provider.
DEFAULT_TTS_MODEL = "hexgrad/Kokoro-82M"
DEFAULT_TTS_PROVIDER = "fal-ai"
|
| 17 |
|
| 18 |
# Chat configuration
|
| 19 |
CHAT_CONFIG = {
|
|
|
|
| 68 |
("SDXL (HF)", "stabilityai/stable-diffusion-xl-base-1.0", "hf-inference"),
|
| 69 |
]
|
| 70 |
|
| 71 |
+
# Quick-pick model presets for the TTS tab: (button label, model id, provider).
TTS_MODEL_PRESETS = [
    ("Kokoro (Fal.ai)", "hexgrad/Kokoro-82M", "fal-ai"),
    ("Kokoro (Replicate)", "hexgrad/Kokoro-82M", "replicate"),
]

# Kokoro TTS voices: display label -> voice id.
# Prefixes encode accent/gender: af/am = US female/male, bf/bm = UK female/male.
# NOTE: insertion order is the dropdown order -- do not reorder entries.
TTS_VOICES = {
    'πΊπΈ πΊ Heart β€οΈ': 'af_heart',
    'πΊπΈ πΊ Bella π₯': 'af_bella',
    'πΊπΈ πΊ Nicole π§': 'af_nicole',
    'πΊπΈ πΊ Aoede': 'af_aoede',
    'πΊπΈ πΊ Kore': 'af_kore',
    'πΊπΈ πΊ Sarah': 'af_sarah',
    'πΊπΈ πΊ Nova': 'af_nova',
    'πΊπΈ πΊ Sky': 'af_sky',
    'πΊπΈ πΊ Alloy': 'af_alloy',
    'πΊπΈ πΊ Jessica': 'af_jessica',
    'πΊπΈ πΊ River': 'af_river',
    'πΊπΈ πΉ Michael': 'am_michael',
    'πΊπΈ πΉ Fenrir': 'am_fenrir',
    'πΊπΈ πΉ Puck': 'am_puck',
    'πΊπΈ πΉ Echo': 'am_echo',
    'πΊπΈ πΉ Eric': 'am_eric',
    'πΊπΈ πΉ Liam': 'am_liam',
    'πΊπΈ πΉ Onyx': 'am_onyx',
    'πΊπΈ πΉ Santa': 'am_santa',
    'πΊπΈ πΉ Adam': 'am_adam',
    'π¬π§ πΊ Emma': 'bf_emma',
    'π¬π§ πΊ Isabella': 'bf_isabella',
    'π¬π§ πΊ Alice': 'bf_alice',
    'π¬π§ πΊ Lily': 'bf_lily',
    'π¬π§ πΉ George': 'bm_george',
    'π¬π§ πΉ Fable': 'bm_fable',
    'π¬π§ πΉ Lewis': 'bm_lewis',
    'π¬π§ πΉ Daniel': 'bm_daniel',
}
|
| 108 |
+
|
| 109 |
# Example prompts for image generation
|
| 110 |
IMAGE_EXAMPLE_PROMPTS = [
|
| 111 |
"A majestic dragon flying over a medieval castle, epic fantasy art, detailed, 8k",
|
|
|
|
| 130 |
"Add a magical portal in the background with sparkles"
|
| 131 |
]
|
| 132 |
|
| 133 |
+
# Clickable example inputs for the text-to-speech tab.
TTS_EXAMPLE_TEXTS = [
    "Hello! Welcome to the amazing world of AI-powered text-to-speech technology.",
    "The quick brown fox jumps over the lazy dog. This pangram contains every letter of the alphabet.",
    "In a world where technology advances at lightning speed, artificial intelligence continues to reshape our future.",
    "Imagine a world where machines can understand and respond to human emotions with perfect clarity.",
    "The future belongs to those who believe in the beauty of their dreams and have the courage to pursue them.",
    "Science is not only compatible with spirituality; it is a profound source of spirituality.",
    "The only way to do great work is to love what you do. If you haven't found it yet, keep looking.",
    "Life is what happens when you're busy making other plans. Embrace every moment with gratitude.",
]
|
| 144 |
+
|
| 145 |
|
| 146 |
def get_proxy_key():
|
| 147 |
"""Get the proxy API key from environment variables."""
|