Spaces:
Running
Running
| import gradio as gr | |
| import numpy as np | |
| import tempfile | |
| import os | |
| from kittentts import KittenTTS | |
| import soundfile as sf | |
# Load the TTS model once at import time; generate_speech() uses this
# module-level instance for every request.
print("Loading KittenTTS model from Hugging Face...")
try:
    tts_model = KittenTTS("KittenML/kitten-tts-nano-0.1")
except Exception as e:
    # Print a hint for the operator, then re-raise: the app is useless
    # without a model, so startup should fail loudly.
    print(f"❌ Error loading model: {e}")
    print("Make sure the kittentts package is properly installed")
    raise
else:
    print("✅ KittenTTS model loaded successfully!")
# Friendly dropdown label -> KittenTTS voice identifier.
VOICE_MAPPING = {
    "Voice 2 - Male": "expr-voice-2-m",
    "Voice 2 - Female": "expr-voice-2-f",
    "Voice 3 - Male": "expr-voice-3-m",
    "Voice 3 - Female": "expr-voice-3-f",
    "Voice 4 - Male": "expr-voice-4-m",
    "Voice 4 - Female": "expr-voice-4-f",
    "Voice 5 - Male": "expr-voice-5-m",
    "Voice 5 - Female": "expr-voice-5-f",
}

# Voice identifiers exposed by the app. Derived from VOICE_MAPPING (dicts
# preserve insertion order) so the two collections cannot drift apart.
AVAILABLE_VOICES = list(VOICE_MAPPING.values())
print(f"✅ Available voices: {AVAILABLE_VOICES}")

# Maximum accepted input length in characters. The model's exact limit is
# not known; this value was found to work experimentally.
MAX_CHARS = 420
def _normalize_audio(audio):
    """Return *audio* as a float32 array with its peak scaled into [-1.0, 1.0]."""
    samples = np.asarray(audio, dtype=np.float32)
    # .size (not len) so the emptiness check is correct for any array shape.
    if samples.size:
        peak = float(np.max(np.abs(samples)))
        # Only rescale when the signal actually exceeds the valid range.
        if peak > 1.0:
            samples = samples / peak
    return samples


def generate_speech(text, voice_choice):
    """
    Generate speech from text using KittenTTS with voice selection.

    Args:
        text (str): The text to convert to speech. ``None``, empty, or
            whitespace-only input is rejected with a status message.
        voice_choice (str): The selected voice option (a key of
            ``VOICE_MAPPING``). Any other value falls back to the model's
            default voice.

    Returns:
        tuple: ``(audio, status)``. ``audio`` is ``(sample_rate, audio_array)``
        for the Gradio audio component, or ``None`` when nothing was
        generated; ``status`` is a human-readable message. Errors raised
        during generation are caught and reported through ``status``.
    """
    # Guard against None as well as blank text (e.g. programmatic callers).
    if text is None or not text.strip():
        return None, "Please enter some text to generate speech."

    # Count what the user typed, before any padding is added below, so the
    # limit check and the success message agree with each other.
    char_count = len(text)

    # Check text length - KittenTTS nano model has context limitations
    if char_count > MAX_CHARS:
        return None, (
            f"Text too long! Please limit to {MAX_CHARS} characters. "
            f"Current length: {char_count} characters."
        )

    # Added because the model cuts off the audio sometimes.
    padded_text = text + " ..."

    try:
        # Unknown labels map to None, which selects the default voice.
        voice_id = VOICE_MAPPING.get(voice_choice)

        if voice_id is not None:
            print(f"Using voice: {voice_choice} ({voice_id})")
            audio = tts_model.generate(padded_text, voice=voice_id)
        else:
            # Fall back to default voice
            audio = tts_model.generate(padded_text)

        # KittenTTS returns audio at 24kHz sample rate
        sample_rate = 24000

        # Ensure audio is float32 and within range for Gradio
        audio = _normalize_audio(audio)

        voice_msg = f" with {voice_choice}" if voice_id is not None else ""
        return (sample_rate, audio), (
            f"Speech generated successfully{voice_msg}! ({char_count} characters)"
        )
    except Exception as e:
        # UI boundary: surface a friendly message instead of a traceback.
        error_msg = str(e)
        print(f"Error details: {e}")

        # Provide helpful error messages for common issues
        if "INVALID_ARGUMENT" in error_msg and "Expand" in error_msg:
            return None, "Text is too long or complex for the model. Please try shorter, simpler text."
        elif "ONNXRuntimeError" in error_msg:
            return None, "Model processing error. Try shorter text or simpler punctuation."
        else:
            return None, f"Error generating speech: {error_msg}"
def create_interface():
    """Create the Gradio interface.

    Builds a two-column layout (voice picker, text box, generate button and
    status on the left; audio player on the right), a list of clickable
    examples, and an informational footer. Both the button and pressing
    Enter in the text box call ``generate_speech``.

    Returns:
        The ``gr.Blocks`` app, not yet launched.
    """
    voice_names = list(VOICE_MAPPING)

    with gr.Blocks(
        title="KittenTTS - High Quality Text-to-Speech",
        theme=gr.themes.Soft(font=["Arial", "sans-serif"]),
    ) as demo:
        # Blank lines separate the heading and paragraphs so they render
        # as distinct Markdown blocks.
        gr.Markdown("""
        # 🐱 KittenTTS - High Quality Text-to-Speech

        Generate high-quality speech from text using [KittenTTS](https://huggingface.co/KittenML/kitten-tts-nano-0.1),
        a lightweight TTS model that works without GPU!

        Choose from multiple voice options and enter your text to hear the synthesized speech.
        """)

        with gr.Row():
            with gr.Column(scale=2):
                # Voice selection
                voice_dropdown = gr.Dropdown(
                    choices=voice_names,
                    value=voice_names[0],
                    label="🎤 Select Voice",
                    info="Choose between different male and female voices",
                )

                # Text input
                text_input = gr.Textbox(
                    label="Text to Speech",
                    placeholder=f"Enter text (max {MAX_CHARS} characters for best results)...",
                    lines=3,
                    max_length=MAX_CHARS,
                    show_copy_button=True,
                    info="Keep text short and simple for the nano model",
                )

                # Generate button
                generate_btn = gr.Button(
                    "🎵 Generate Speech",
                    variant="primary",
                    size="lg",
                )

                # Status message
                status_msg = gr.Textbox(
                    label="Status",
                    interactive=False,
                    show_label=True,
                )

            with gr.Column(scale=1):
                # Audio output
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                    interactive=False,
                )

        # Example texts
        gr.Markdown("### 📝 Example Texts to Try (Short & Simple):")
        examples = [
            ["Hello world! This is KittenTTS.", "Voice 2 - Female"],
            ["The quick brown fox jumps over the lazy dog.", "Voice 3 - Male"],
            ["This model works without a GPU.", "Voice 4 - Female"],
            ["Welcome to KittenTTS!", "Voice 5 - Male"],
            ["How are you today?", "Voice 2 - Male"],
            ["The weather is nice today.", "Voice 3 - Female"],
        ]
        gr.Examples(
            examples=examples,
            inputs=[text_input, voice_dropdown],
            label="Click on any example to try it out",
        )

        # Event handlers: the button click and the Enter key in the text
        # box run the same generation call with the same wiring.
        for trigger in (generate_btn.click, text_input.submit):
            trigger(
                fn=generate_speech,
                inputs=[text_input, voice_dropdown],
                outputs=[audio_output, status_msg],
                show_progress=True,
            )

        # Footer. The blank line before each bold heading keeps it from
        # being absorbed into the preceding list item.
        gr.Markdown("""
        ---

        **About KittenTTS Nano:**

        - Lightweight 15M parameter text-to-speech model
        - Works without GPU - optimized for efficiency
        - Multiple voice options (male and female variants)
        - 24kHz output sample rate
        - **Best with short texts (under 400 characters)**
        - Model: [KittenML/kitten-tts-nano-0.1](https://huggingface.co/KittenML/kitten-tts-nano-0.1)
        - Built by [KittenML](https://github.com/KittenML/KittenTTS)

        **Usage Tips for Nano Model:**

        - ✅ Keep text short and simple (about 400 characters)
        - ✅ Use common words and standard punctuation
        - ✅ Break long content into shorter sentences
        - ❌ Avoid very long sentences or complex punctuation
        - ❌ Avoid technical jargon or unusual words
        """)

    return demo
# Script entry point: build the UI and start serving it.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",  # Allow external connections
        "server_port": 7860,       # Standard port for HF Spaces
        "share": False,            # Don't create a public link (HF Spaces handles this)
        "show_error": True,        # Show errors in the interface
        "quiet": False,            # Show startup logs
    }

    demo = create_interface()
    demo.launch(**launch_options)