Spaces:
Running
feat(tts): add text-to-speech UI and backend
Browse files- [feat] Implement text-to-speech generation with proxy, timeout, and error handling (tts_handler.py:1-155:generate_text_to_speech(), handle_text_to_speech_generation())
- [feat] Add `create_tts_tab()` for "Text-to-Speech" Gradio tab components (ui_components.py:410-509)
- [feat] Add `create_tts_presets()` and `create_tts_examples()` for TTS UI population (ui_components.py:540-550, 553-560)
- [update] Modify header and footer Markdown to describe TTS feature (ui_components.py:590-591, 624-632)
- [feat] Import TTS-related constants and data (ui_components.py:9-11)
- [feat] Add default TTS model, provider, voice, and example text constants (utils.py:15-16, 68-71, 74-99, 102-110)
- [feat] Import TTS handler and UI tab creation functions (app.py:6, 9)
- [feat] Integrate TTS tab into the main application (app.py:39:create_app())
- app.py +5 -0
- tts_handler.py +155 -0
- ui_components.py +139 -2
- utils.py +52 -0
|
@@ -6,11 +6,13 @@ A comprehensive AI platform with chat and image generation capabilities.
|
|
| 6 |
import gradio as gr
|
| 7 |
from chat_handler import handle_chat_submit, handle_chat_retry
|
| 8 |
from image_handler import handle_image_generation, handle_image_to_image_generation
|
|
|
|
| 9 |
from ui_components import (
|
| 10 |
create_main_header,
|
| 11 |
create_chat_tab,
|
| 12 |
create_image_tab,
|
| 13 |
create_image_to_image_tab,
|
|
|
|
| 14 |
create_footer
|
| 15 |
)
|
| 16 |
from utils import get_gradio_theme
|
|
@@ -35,6 +37,9 @@ def create_app():
|
|
| 35 |
|
| 36 |
# Image-to-image tab
|
| 37 |
create_image_to_image_tab(handle_image_to_image_generation)
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
# Footer with helpful information
|
| 40 |
create_footer()
|
|
|
|
| 6 |
import gradio as gr
|
| 7 |
from chat_handler import handle_chat_submit, handle_chat_retry
|
| 8 |
from image_handler import handle_image_generation, handle_image_to_image_generation
|
| 9 |
+
from tts_handler import handle_text_to_speech_generation
|
| 10 |
from ui_components import (
|
| 11 |
create_main_header,
|
| 12 |
create_chat_tab,
|
| 13 |
create_image_tab,
|
| 14 |
create_image_to_image_tab,
|
| 15 |
+
create_tts_tab,
|
| 16 |
create_footer
|
| 17 |
)
|
| 18 |
from utils import get_gradio_theme
|
|
|
|
| 37 |
|
| 38 |
# Image-to-image tab
|
| 39 |
create_image_to_image_tab(handle_image_to_image_generation)
|
| 40 |
+
|
| 41 |
+
# Text-to-speech tab
|
| 42 |
+
create_tts_tab(handle_text_to_speech_generation)
|
| 43 |
|
| 44 |
# Footer with helpful information
|
| 45 |
create_footer()
|
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Text-to-speech functionality handler for HF-Inferoxy AI Hub.
|
| 3 |
+
Handles text-to-speech generation with multiple providers.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import time
|
| 8 |
+
import threading
|
| 9 |
+
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
|
| 10 |
+
from huggingface_hub import InferenceClient
|
| 11 |
+
from huggingface_hub.errors import HfHubHTTPError
|
| 12 |
+
from requests.exceptions import ConnectionError, Timeout, RequestException
|
| 13 |
+
from hf_token_utils import get_proxy_token, report_token_status
|
| 14 |
+
from utils import (
|
| 15 |
+
IMAGE_CONFIG,
|
| 16 |
+
validate_proxy_key,
|
| 17 |
+
format_error_message,
|
| 18 |
+
format_success_message
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
# Timeout configuration for TTS generation
|
| 22 |
+
TTS_GENERATION_TIMEOUT = 300 # 5 minutes max for TTS generation
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def generate_text_to_speech(
    text: str,
    model_name: str,
    provider: str,
    voice: str = "am_eric",
    speed: float = 1.0,
):
    """
    Generate speech from text using the specified model and provider through HF-Inferoxy.

    Args:
        text: Text to synthesize.
        model_name: Model repo id, e.g. "hexgrad/Kokoro-82M".
        provider: Inference provider name (e.g. "fal-ai", "replicate").
        voice: Voice identifier forwarded to the provider via extra_body.
        speed: Speed multiplier forwarded to the provider via extra_body.

    Returns:
        Tuple of (audio, status_message). ``audio`` is whatever
        ``InferenceClient.text_to_speech`` returns (None on failure);
        ``status_message`` is a user-facing success or error string.
    """
    # Validate proxy API key before doing any network work.
    is_valid, error_msg = validate_proxy_key()
    if not is_valid:
        return None, error_msg

    proxy_api_key = os.getenv("PROXY_KEY")

    token_id = None
    try:
        # Get token from HF-Inferoxy proxy server.
        print("π TTS: Requesting token from proxy...")
        token, token_id = get_proxy_token(api_key=proxy_api_key)
        print(f"β TTS: Got token: {token_id}")

        print(f"π€ TTS: Using model='{model_name}', provider='{provider}', voice='{voice}'")

        # Create client with the specified provider.
        client = InferenceClient(
            provider=provider,
            api_key=token
        )

        print("π TTS: Client created, preparing generation params...")

        # Voice and speed are provider-specific options, so they travel
        # in extra_body rather than as top-level arguments.
        generation_params = {
            "text": text,
            "model": model_name,
            "extra_body": {
                "voice": voice,
                "speed": speed
            }
        }

        print(f"π‘ TTS: Making generation request with {TTS_GENERATION_TIMEOUT}s timeout...")

        def generate_audio_task():
            return client.text_to_speech(**generation_params)

        # Execute with a timeout. NOTE: deliberately NOT using
        # `with ThreadPoolExecutor(...)` here -- the context manager's exit
        # calls shutdown(wait=True), which would block until the hung request
        # finished and defeat the timeout. Shut down without waiting instead.
        executor = ThreadPoolExecutor(max_workers=1)
        try:
            future = executor.submit(generate_audio_task)
            try:
                audio = future.result(timeout=TTS_GENERATION_TIMEOUT)
            except FutureTimeoutError:
                # A running request cannot actually be interrupted; cancel()
                # only helps if the task has not started yet.
                future.cancel()
                raise TimeoutError(f"TTS generation timed out after {TTS_GENERATION_TIMEOUT} seconds")
        finally:
            executor.shutdown(wait=False)

        print(f"π΅ TTS: Generation completed! Audio type: {type(audio)}")

        # Report successful token usage back to the proxy.
        if token_id:
            report_token_status(token_id, "success", api_key=proxy_api_key)

        return audio, format_success_message("Speech generated", f"using {model_name} on {provider} with voice {voice}")

    except ConnectionError as e:
        # Proxy server unreachable.
        error_msg = f"Cannot connect to HF-Inferoxy server: {str(e)}"
        print(f"π TTS connection error: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
        return None, format_error_message("Connection Error", "Unable to connect to the proxy server. Please check if it's running.")

    except TimeoutError as e:
        # Generation exceeded TTS_GENERATION_TIMEOUT.
        error_msg = f"TTS generation timed out: {str(e)}"
        print(f"β° TTS timeout: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
        return None, format_error_message("Timeout Error", f"TTS generation took too long (>{TTS_GENERATION_TIMEOUT//60} minutes). Try shorter text.")

    except HfHubHTTPError as e:
        # HuggingFace API errors -- map common HTTP codes to friendlier text.
        error_msg = str(e)
        print(f"π€ TTS HF error: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)

        if "401" in error_msg:
            return None, format_error_message("Authentication Error", "Invalid or expired API token. The proxy will provide a new token on retry.")
        elif "402" in error_msg:
            return None, format_error_message("Quota Exceeded", "API quota exceeded. The proxy will try alternative providers.")
        elif "429" in error_msg:
            return None, format_error_message("Rate Limited", "Too many requests. Please wait a moment and try again.")
        else:
            return None, format_error_message("HuggingFace API Error", error_msg)

    except Exception as e:
        # Catch-all boundary: report to the proxy and surface a generic error.
        error_msg = str(e)
        print(f"β TTS unexpected error: {error_msg}")
        if token_id:
            report_token_status(token_id, "error", error_msg, api_key=proxy_api_key)
        return None, format_error_message("Unexpected Error", f"An unexpected error occurred: {error_msg}")
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def handle_text_to_speech_generation(text_val, model_val, provider_val, voice_val, speed_val):
    """
    Handle a text-to-speech generation request from the UI.

    Validates the input text, then delegates to generate_text_to_speech().

    Args:
        text_val: Raw text from the textbox (may be None or all whitespace).
        model_val: Model repo id.
        provider_val: Inference provider name.
        voice_val: Voice identifier.
        speed_val: Speed multiplier.

    Returns:
        Tuple of (audio, status_message); audio is None on validation failure.
    """
    # Normalize once so the emptiness and length checks apply to the exact
    # text that will be sent to the model.
    text = (text_val or "").strip()

    if not text:
        return None, format_error_message("Validation Error", "Please enter some text to convert to speech")

    # Limit text length to prevent generation timeouts.
    if len(text) > 5000:
        return None, format_error_message("Validation Error", "Text is too long. Please keep it under 5000 characters.")

    # Generate speech
    return generate_text_to_speech(
        text=text,
        model_name=model_val,
        provider=provider_val,
        voice=voice_val,
        speed=speed_val
    )
|
|
@@ -7,8 +7,10 @@ import gradio as gr
|
|
| 7 |
from utils import (
|
| 8 |
DEFAULT_CHAT_MODEL, DEFAULT_IMAGE_MODEL, DEFAULT_IMAGE_PROVIDER,
|
| 9 |
DEFAULT_IMAGE_TO_IMAGE_MODEL, DEFAULT_IMAGE_TO_IMAGE_PROVIDER,
|
|
|
|
| 10 |
CHAT_CONFIG, IMAGE_CONFIG, IMAGE_PROVIDERS, IMAGE_MODEL_PRESETS,
|
| 11 |
-
IMAGE_TO_IMAGE_MODEL_PRESETS,
|
|
|
|
| 12 |
)
|
| 13 |
|
| 14 |
|
|
@@ -408,6 +410,110 @@ def create_image_to_image_tab(handle_image_to_image_generation_fn):
|
|
| 408 |
gen_event.then(lambda: gr.update(visible=False), None, [stop_generate_btn], queue=False)
|
| 409 |
|
| 410 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 411 |
def create_image_to_image_presets(img2img_model_name, img2img_provider):
|
| 412 |
"""Create quick model presets for image-to-image generation."""
|
| 413 |
with gr.Group():
|
|
@@ -431,6 +537,29 @@ def create_image_to_image_examples(img2img_prompt):
|
|
| 431 |
)
|
| 432 |
|
| 433 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 434 |
def create_image_presets(img_model_name, img_provider):
|
| 435 |
"""Create quick model presets for image generation."""
|
| 436 |
with gr.Group():
|
|
@@ -459,12 +588,13 @@ def create_main_header():
|
|
| 459 |
gr.Markdown("""
|
| 460 |
# π HF-Inferoxy AI Hub
|
| 461 |
|
| 462 |
-
A comprehensive AI platform combining chat
|
| 463 |
|
| 464 |
**Features:**
|
| 465 |
- π¬ **Smart Chat**: Conversational AI with streaming responses
|
| 466 |
- π¨ **Image Generation**: Text-to-image creation with multiple providers
|
| 467 |
- πΌοΈ **Image-to-Image**: Transform and modify existing images with AI
|
|
|
|
| 468 |
- π **Intelligent Token Management**: Automatic token rotation and error handling
|
| 469 |
- π **Multi-Provider Support**: Works with HF Inference, Cerebras, Cohere, Groq, Together, Fal.ai, and more
|
| 470 |
""")
|
|
@@ -494,6 +624,13 @@ def create_footer():
|
|
| 494 |
- Perfect for style transfers, object additions, and image transformations
|
| 495 |
- Works great with models like Qwen Image Edit and FLUX.1 Kontext
|
| 496 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 497 |
**Supported Providers:**
|
| 498 |
- **fal-ai**: High-quality image generation (default for images)
|
| 499 |
- **hf-inference**: Core API with comprehensive model support
|
|
|
|
| 7 |
from utils import (
|
| 8 |
DEFAULT_CHAT_MODEL, DEFAULT_IMAGE_MODEL, DEFAULT_IMAGE_PROVIDER,
|
| 9 |
DEFAULT_IMAGE_TO_IMAGE_MODEL, DEFAULT_IMAGE_TO_IMAGE_PROVIDER,
|
| 10 |
+
DEFAULT_TTS_MODEL, DEFAULT_TTS_PROVIDER,
|
| 11 |
CHAT_CONFIG, IMAGE_CONFIG, IMAGE_PROVIDERS, IMAGE_MODEL_PRESETS,
|
| 12 |
+
IMAGE_TO_IMAGE_MODEL_PRESETS, TTS_MODEL_PRESETS, TTS_VOICES,
|
| 13 |
+
IMAGE_EXAMPLE_PROMPTS, IMAGE_TO_IMAGE_EXAMPLE_PROMPTS, TTS_EXAMPLE_TEXTS
|
| 14 |
)
|
| 15 |
|
| 16 |
|
|
|
|
| 410 |
gen_event.then(lambda: gr.update(visible=False), None, [stop_generate_btn], queue=False)
|
| 411 |
|
| 412 |
|
| 413 |
+
def create_tts_tab(handle_tts_generation_fn):
    """Build the Text-to-Speech tab and wire up its generation events.

    Args:
        handle_tts_generation_fn: Callback receiving
            (text, model_name, provider, voice, speed) and returning
            (audio, status_message).
    """
    with gr.Tab("π€ Text-to-Speech", id="tts"):
        with gr.Row():
            with gr.Column(scale=2):
                # Text input, seeded with the first example text.
                text_input = gr.Textbox(
                    value=TTS_EXAMPLE_TEXTS[0],
                    label="Text to Convert",
                    lines=6,
                    placeholder="Enter the text you want to convert to speech..."
                )

                # Audio output plus a small status readout.
                audio_output = gr.Audio(
                    label="Generated Audio",
                    type="numpy",
                    interactive=False,
                    autoplay=True
                )
                status_output = gr.Textbox(
                    label="Generation Status",
                    interactive=False,
                    lines=2
                )

            with gr.Column(scale=1):
                # Model and provider selection.
                with gr.Group():
                    gr.Markdown("**π€ Model & Provider**")
                    model_input = gr.Textbox(
                        value=DEFAULT_TTS_MODEL,
                        label="Model Name",
                        placeholder="e.g., hexgrad/Kokoro-82M"
                    )
                    provider_input = gr.Dropdown(
                        choices=IMAGE_PROVIDERS,
                        value=DEFAULT_TTS_PROVIDER,
                        label="Provider",
                        interactive=True
                    )

                # Voice and speed settings.
                with gr.Group():
                    gr.Markdown("**π€ Voice Settings**")
                    voice_input = gr.Dropdown(
                        choices=list(TTS_VOICES.items()),
                        value="am_eric",
                        label="Voice",
                        info="Choose from various English voices"
                    )
                    speed_input = gr.Slider(
                        minimum=0.5, maximum=2.0, value=1.0, step=0.1,
                        label="Speed", info="0.5 = slow, 2.0 = fast"
                    )

                # Generate / Stop button pair (Stop starts hidden).
                with gr.Row():
                    generate_button = gr.Button(
                        "π€ Generate Speech",
                        variant="primary",
                        size="lg",
                        scale=2
                    )
                    stop_button = gr.Button("βΉ Stop", variant="secondary", visible=False)

                # Quick model presets and example texts.
                create_tts_presets(model_input, provider_input)
                create_tts_examples(text_input)

        # Reveal the Stop button as soon as generation starts.
        generate_button.click(
            fn=lambda: gr.update(visible=True),
            inputs=None,
            outputs=[stop_button],
            queue=False
        )

        generation = generate_button.click(
            fn=handle_tts_generation_fn,
            inputs=[
                text_input, model_input, provider_input, voice_input, speed_input
            ],
            outputs=[audio_output, status_output]
        )

        # Cancel a running generation and hide the Stop button.
        stop_button.click(
            fn=lambda: gr.update(visible=False),
            inputs=None,
            outputs=[stop_button],
            cancels=[generation],
            queue=False
        )

        # Hide the Stop button once generation completes normally.
        generation.then(lambda: gr.update(visible=False), None, [stop_button], queue=False)
|
| 515 |
+
|
| 516 |
+
|
| 517 |
def create_image_to_image_presets(img2img_model_name, img2img_provider):
|
| 518 |
"""Create quick model presets for image-to-image generation."""
|
| 519 |
with gr.Group():
|
|
|
|
| 537 |
)
|
| 538 |
|
| 539 |
|
| 540 |
+
def create_tts_presets(tts_model_name, tts_provider):
    """Render one-click preset buttons that fill in a TTS model/provider pair."""
    with gr.Group():
        gr.Markdown("**π― Popular Presets**")

        for label, preset_model, preset_provider in TTS_MODEL_PRESETS:
            preset_button = gr.Button(label, size="sm")
            # Bind the preset values as lambda defaults so each button keeps
            # its own model/provider (avoids the late-binding closure pitfall).
            preset_button.click(
                lambda m=preset_model, p=preset_provider: (m, p),
                outputs=[tts_model_name, tts_provider]
            )
|
| 551 |
+
|
| 552 |
+
|
| 553 |
+
def create_tts_examples(tts_text):
    """Render clickable example texts that populate the TTS text box."""
    with gr.Group():
        gr.Markdown("**π Example Texts**")
        gr.Examples(
            examples=[[sample] for sample in TTS_EXAMPLE_TEXTS],
            inputs=tts_text
        )
|
| 561 |
+
|
| 562 |
+
|
| 563 |
def create_image_presets(img_model_name, img_provider):
|
| 564 |
"""Create quick model presets for image generation."""
|
| 565 |
with gr.Group():
|
|
|
|
| 588 |
gr.Markdown("""
|
| 589 |
# π HF-Inferoxy AI Hub
|
| 590 |
|
| 591 |
+
A comprehensive AI platform combining chat, image generation, and text-to-speech capabilities with intelligent token management through HF-Inferoxy.
|
| 592 |
|
| 593 |
**Features:**
|
| 594 |
- π¬ **Smart Chat**: Conversational AI with streaming responses
|
| 595 |
- π¨ **Image Generation**: Text-to-image creation with multiple providers
|
| 596 |
- πΌοΈ **Image-to-Image**: Transform and modify existing images with AI
|
| 597 |
+
- π€ **Text-to-Speech**: Convert text to natural-sounding speech with Kokoro
|
| 598 |
- π **Intelligent Token Management**: Automatic token rotation and error handling
|
| 599 |
- π **Multi-Provider Support**: Works with HF Inference, Cerebras, Cohere, Groq, Together, Fal.ai, and more
|
| 600 |
""")
|
|
|
|
| 624 |
- Perfect for style transfers, object additions, and image transformations
|
| 625 |
- Works great with models like Qwen Image Edit and FLUX.1 Kontext
|
| 626 |
|
| 627 |
+
**Text-to-Speech Tab:**
|
| 628 |
+
- Enter text you want to convert to speech
|
| 629 |
+
- Choose from various English voices (US and UK accents)
|
| 630 |
+
- Adjust speed from 0.5x to 2.0x
|
| 631 |
+
- Powered by Kokoro TTS model for natural-sounding speech
|
| 632 |
+
- Supports both fal-ai and replicate providers
|
| 633 |
+
|
| 634 |
**Supported Providers:**
|
| 635 |
- **fal-ai**: High-quality image generation (default for images)
|
| 636 |
- **hf-inference**: Core API with comprehensive model support
|
|
@@ -12,6 +12,8 @@ DEFAULT_IMAGE_MODEL = "Qwen/Qwen-Image"
|
|
| 12 |
DEFAULT_IMAGE_PROVIDER = "fal-ai"
|
| 13 |
DEFAULT_IMAGE_TO_IMAGE_MODEL = "Qwen/Qwen-Image-Edit"
|
| 14 |
DEFAULT_IMAGE_TO_IMAGE_PROVIDER = "fal-ai"
|
|
|
|
|
|
|
| 15 |
|
| 16 |
# Chat configuration
|
| 17 |
CHAT_CONFIG = {
|
|
@@ -66,6 +68,44 @@ IMAGE_TO_IMAGE_MODEL_PRESETS = [
|
|
| 66 |
("SDXL (HF)", "stabilityai/stable-diffusion-xl-base-1.0", "hf-inference"),
|
| 67 |
]
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
# Example prompts for image generation
|
| 70 |
IMAGE_EXAMPLE_PROMPTS = [
|
| 71 |
"A majestic dragon flying over a medieval castle, epic fantasy art, detailed, 8k",
|
|
@@ -90,6 +130,18 @@ IMAGE_TO_IMAGE_EXAMPLE_PROMPTS = [
|
|
| 90 |
"Add a magical portal in the background with sparkles"
|
| 91 |
]
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
def get_proxy_key():
|
| 95 |
"""Get the proxy API key from environment variables."""
|
|
|
|
| 12 |
DEFAULT_IMAGE_PROVIDER = "fal-ai"
|
| 13 |
DEFAULT_IMAGE_TO_IMAGE_MODEL = "Qwen/Qwen-Image-Edit"
|
| 14 |
DEFAULT_IMAGE_TO_IMAGE_PROVIDER = "fal-ai"
|
| 15 |
+
# Text-to-speech defaults: Kokoro TTS served through the fal-ai provider.
DEFAULT_TTS_MODEL = "hexgrad/Kokoro-82M"
DEFAULT_TTS_PROVIDER = "fal-ai"
|
| 17 |
|
| 18 |
# Chat configuration
|
| 19 |
CHAT_CONFIG = {
|
|
|
|
| 68 |
("SDXL (HF)", "stabilityai/stable-diffusion-xl-base-1.0", "hf-inference"),
|
| 69 |
]
|
| 70 |
|
| 71 |
+
# Quick-pick model presets for the TTS tab: (button label, model id, provider).
TTS_MODEL_PRESETS = [
    ("Kokoro (Fal.ai)", "hexgrad/Kokoro-82M", "fal-ai"),
    ("Kokoro (Replicate)", "hexgrad/Kokoro-82M", "replicate"),
]

# Kokoro TTS voices: display label -> voice id.
# Prefixes encode accent/gender: af/am = US female/male, bf/bm = UK female/male.
# NOTE: insertion order is the dropdown order -- do not reorder entries.
TTS_VOICES = {
    'πΊπΈ πΊ Heart β€οΈ': 'af_heart',
    'πΊπΈ πΊ Bella π₯': 'af_bella',
    'πΊπΈ πΊ Nicole π§': 'af_nicole',
    'πΊπΈ πΊ Aoede': 'af_aoede',
    'πΊπΈ πΊ Kore': 'af_kore',
    'πΊπΈ πΊ Sarah': 'af_sarah',
    'πΊπΈ πΊ Nova': 'af_nova',
    'πΊπΈ πΊ Sky': 'af_sky',
    'πΊπΈ πΊ Alloy': 'af_alloy',
    'πΊπΈ πΊ Jessica': 'af_jessica',
    'πΊπΈ πΊ River': 'af_river',
    'πΊπΈ πΉ Michael': 'am_michael',
    'πΊπΈ πΉ Fenrir': 'am_fenrir',
    'πΊπΈ πΉ Puck': 'am_puck',
    'πΊπΈ πΉ Echo': 'am_echo',
    'πΊπΈ πΉ Eric': 'am_eric',
    'πΊπΈ πΉ Liam': 'am_liam',
    'πΊπΈ πΉ Onyx': 'am_onyx',
    'πΊπΈ πΉ Santa': 'am_santa',
    'πΊπΈ πΉ Adam': 'am_adam',
    'π¬π§ πΊ Emma': 'bf_emma',
    'π¬π§ πΊ Isabella': 'bf_isabella',
    'π¬π§ πΊ Alice': 'bf_alice',
    'π¬π§ πΊ Lily': 'bf_lily',
    'π¬π§ πΉ George': 'bm_george',
    'π¬π§ πΉ Fable': 'bm_fable',
    'π¬π§ πΉ Lewis': 'bm_lewis',
    'π¬π§ πΉ Daniel': 'bm_daniel',
}
|
| 108 |
+
|
| 109 |
# Example prompts for image generation
|
| 110 |
IMAGE_EXAMPLE_PROMPTS = [
|
| 111 |
"A majestic dragon flying over a medieval castle, epic fantasy art, detailed, 8k",
|
|
|
|
| 130 |
"Add a magical portal in the background with sparkles"
|
| 131 |
]
|
| 132 |
|
| 133 |
+
# Clickable example inputs for the text-to-speech tab.
TTS_EXAMPLE_TEXTS = [
    "Hello! Welcome to the amazing world of AI-powered text-to-speech technology.",
    "The quick brown fox jumps over the lazy dog. This pangram contains every letter of the alphabet.",
    "In a world where technology advances at lightning speed, artificial intelligence continues to reshape our future.",
    "Imagine a world where machines can understand and respond to human emotions with perfect clarity.",
    "The future belongs to those who believe in the beauty of their dreams and have the courage to pursue them.",
    "Science is not only compatible with spirituality; it is a profound source of spirituality.",
    "The only way to do great work is to love what you do. If you haven't found it yet, keep looking.",
    "Life is what happens when you're busy making other plans. Embrace every moment with gratitude.",
]
|
| 144 |
+
|
| 145 |
|
| 146 |
def get_proxy_key():
|
| 147 |
"""Get the proxy API key from environment variables."""
|