Spaces:

fxPracht
/

KittenTTSNano

Running

App Files Files Community

KittenTTSNano / app.py

fxPracht

updated text box

22a2bf5 verified 4 months ago

raw

history blame contribute delete

8.37 kB

	import gradio as gr
	import numpy as np
	import tempfile
	import os
	from kittentts import KittenTTS
	import soundfile as sf

	# Load the TTS model once, at import time, so every request shares it.
	print("Loading KittenTTS model from Hugging Face...")
	try:
	    tts_model = KittenTTS("KittenML/kitten-tts-nano-0.1")
	    print("✅ KittenTTS model loaded successfully!")
	except Exception as load_error:
	    # Report the failure with a hint, then re-raise so startup aborts:
	    # nothing below can work without a model.
	    print(f"❌ Error loading model: {load_error}")
	    print("Make sure the kittentts package is properly installed")
	    raise

	# Friendly label shown in the UI -> voice identifier passed to the model.
	VOICE_MAPPING = {
	    "Voice 2 - Male": "expr-voice-2-m",
	    "Voice 2 - Female": "expr-voice-2-f",
	    "Voice 3 - Male": "expr-voice-3-m",
	    "Voice 3 - Female": "expr-voice-3-f",
	    "Voice 4 - Male": "expr-voice-4-m",
	    "Voice 4 - Female": "expr-voice-4-f",
	    "Voice 5 - Male": "expr-voice-5-m",
	    "Voice 5 - Female": "expr-voice-5-f",
	}

	# Voice identifiers offered by this app, in the same order as the mapping.
	AVAILABLE_VOICES = list(VOICE_MAPPING.values())

	print(f"✅ Available voices: {AVAILABLE_VOICES}")

	# Maximum accepted input length. The model's exact limit is not known at
	# this point; this value works experimentally.
	MAX_CHARS = 420

	def generate_speech(text, voice_choice):
	    """
	    Generate speech from text using KittenTTS with voice selection.

	    Args:
	        text (str): The text to convert to speech. ``None``, empty, or
	            whitespace-only input is rejected with a status message.
	        voice_choice (str): The selected voice label (a key of
	            ``VOICE_MAPPING``). Any other value falls back to the model's
	            default voice.

	    Returns:
	        tuple: ``(audio, status)`` for the Gradio audio and status outputs.
	        ``audio`` is ``(sample_rate, audio_array)`` on success and ``None``
	        on any failure; ``status`` is a human-readable message.
	    """
	    # Guard against None as well as blank input so the caller always gets a
	    # status message instead of an AttributeError.
	    if not text or not text.strip():
	        return None, "Please enter some text to generate speech."

	    # Length of what the user actually typed. Captured before the filler is
	    # appended below so the filler is never counted in the status message.
	    char_count = len(text)

	    # Check text length - KittenTTS nano model has context limitations
	    if char_count > MAX_CHARS:
	        return None, f"Text too long! Please limit to {MAX_CHARS} characters. Current length: {char_count} characters."

	    # Added because the model cuts off the audio sometimes.
	    padded_text = text + " ..."

	    try:
	        # Resolve the friendly label to a model voice identifier, if known.
	        voice_id = VOICE_MAPPING.get(voice_choice)

	        if voice_id is not None:
	            print(f"Using voice: {voice_choice} ({voice_id})")
	            audio = tts_model.generate(padded_text, voice=voice_id)
	        else:
	            # Fall back to default voice
	            audio = tts_model.generate(padded_text)

	        # KittenTTS returns audio at 24kHz sample rate
	        sample_rate = 24000

	        # Ensure audio is in the right format for Gradio: float32, and
	        # scaled down to [-1.0, 1.0] only when it exceeds that range.
	        if isinstance(audio, np.ndarray):
	            audio = audio.astype(np.float32)
	            # Use .size rather than len() so empty arrays of any shape
	            # skip the reduction instead of raising.
	            peak = np.max(np.abs(audio)) if audio.size else 0.0
	            if peak > 1.0:
	                audio = audio / peak

	        voice_msg = f" with {voice_choice}" if voice_id is not None else ""
	        return (sample_rate, audio), f"Speech generated successfully{voice_msg}! ({char_count} characters)"

	    except Exception as e:
	        error_msg = str(e)
	        print(f"Error details: {e}")

	        # Provide helpful error messages for common issues
	        if "INVALID_ARGUMENT" in error_msg and "Expand" in error_msg:
	            return None, "Text is too long or complex for the model. Please try shorter, simpler text."
	        if "ONNXRuntimeError" in error_msg:
	            return None, "Model processing error. Try shorter text or simpler punctuation."
	        return None, f"Error generating speech: {error_msg}"

	def create_interface():
	    """Build the Gradio Blocks UI and return it (without launching)."""
	    voice_labels = list(VOICE_MAPPING.keys())

	    with gr.Blocks(
	        title="KittenTTS - High Quality Text-to-Speech",
	        theme=gr.themes.Soft(font=["Arial", "sans-serif"]),
	    ) as interface:

	        # Page header
	        gr.Markdown("""
	# 🐱 KittenTTS - High Quality Text-to-Speech

	Generate high-quality speech from text using [KittenTTS](https://huggingface.co/KittenML/kitten-tts-nano-0.1),
	a lightweight TTS model that works without GPU!

	Choose from multiple voice options and enter your text to hear the synthesized speech.
	""")

	        with gr.Row():
	            # Left column: all inputs plus the status line.
	            with gr.Column(scale=2):
	                voice_picker = gr.Dropdown(
	                    choices=voice_labels,
	                    value=voice_labels[0],
	                    label="🎤 Select Voice",
	                    info="Choose between different male and female voices",
	                )

	                prompt_box = gr.Textbox(
	                    label="Text to Speech",
	                    placeholder=f"Enter text (max {MAX_CHARS} characters for best results)...",
	                    lines=3,
	                    max_length=MAX_CHARS,
	                    show_copy_button=True,
	                    info="Keep text short and simple for the nano model",
	                )

	                speak_button = gr.Button(
	                    "🎵 Generate Speech",
	                    variant="primary",
	                    size="lg",
	                )

	                status_box = gr.Textbox(
	                    label="Status",
	                    interactive=False,
	                    show_label=True,
	                )

	            # Right column: playback of the generated audio.
	            with gr.Column(scale=1):
	                audio_player = gr.Audio(
	                    label="Generated Speech",
	                    type="numpy",
	                    interactive=False,
	                )

	        # Clickable sample inputs: (text, voice label) pairs.
	        gr.Markdown("### 📝 Example Texts to Try (Short & Simple):")
	        sample_inputs = [
	            ["Hello world! This is KittenTTS.", "Voice 2 - Female"],
	            ["The quick brown fox jumps over the lazy dog.", "Voice 3 - Male"],
	            ["This model works without a GPU.", "Voice 4 - Female"],
	            ["Welcome to KittenTTS!", "Voice 5 - Male"],
	            ["How are you today?", "Voice 2 - Male"],
	            ["The weather is nice today.", "Voice 3 - Female"],
	        ]
	        gr.Examples(
	            examples=sample_inputs,
	            inputs=[prompt_box, voice_picker],
	            label="Click on any example to try it out",
	        )

	        # The button click and the Enter key in the text box run the same
	        # handler with the same wiring; register the click first.
	        for register in (speak_button.click, prompt_box.submit):
	            register(
	                fn=generate_speech,
	                inputs=[prompt_box, voice_picker],
	                outputs=[audio_player, status_box],
	                show_progress=True,
	            )

	        # Footer
	        gr.Markdown("""
	---

	About KittenTTS Nano:
	- Lightweight 15M parameter text-to-speech model
	- Works without GPU - optimized for efficiency
	- Multiple voice options (male and female variants)
	- 24kHz output sample rate
	- Best with short texts (under 400 characters)
	- Model: [KittenML/kitten-tts-nano-0.1](https://huggingface.co/KittenML/kitten-tts-nano-0.1)
	- Built by [KittenML](https://github.com/KittenML/KittenTTS)

	Usage Tips for Nano Model:
	- ✅ Keep text short and simple (about 400 characters)
	- ✅ Use common words and standard punctuation
	- ✅ Break long content into shorter sentences
	- ❌ Avoid very long sentences or complex punctuation
	- ❌ Avoid technical jargon or unusual words
	""")

	    return interface

	# Script entry point: build the interface and serve it.
	if __name__ == "__main__":
	    demo = create_interface()

	    launch_options = {
	        "server_name": "0.0.0.0",  # Allow external connections
	        "server_port": 7860,  # Standard port for HF Spaces
	        "share": False,  # Don't create a public link (HF Spaces handles this)
	        "show_error": True,  # Show errors in the interface
	        "quiet": False,  # Show startup logs
	    }
	    demo.launch(**launch_options)