| """ | |
| Therapy TTS Module - Text-to-speech for therapy and AAC applications. | |
| Supports: | |
| - WhisperSpeech (fast, voice cloning) | |
| - OpenAI TTS API (fallback) | |
| - Edge TTS (lightweight fallback) | |
| """ | |
| import io | |
| import logging | |
| from enum import Enum | |
| from typing import Optional | |
| from dataclasses import dataclass | |
| from api.config import settings | |

if settings.ENVIRONMENT == "development":
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.WARNING)


class TTSEngine(str, Enum):
    """Available TTS engines."""
    WHISPERSPEECH = "whisperspeech"
    OPENAI_TTS = "openai_tts"
    EDGE_TTS = "edge_tts"
    AUTO = "auto"


class TTSVoice(str, Enum):
    """Preset voice options."""
    NEUTRAL = "neutral"
    WARM = "warm"
    CLEAR = "clear"
    SLOW = "slow"      # For therapy exercises
    CUSTOM = "custom"  # Voice cloning


@dataclass
class TTSResult:
    """TTS synthesis result."""
    audio_bytes: bytes
    format: str  # wav, mp3
    sample_rate: int
    engine_used: TTSEngine
    duration_seconds: Optional[float] = None


class TherapyTTS:
    """
    TTS engine for therapy applications.

    Features:
    - Voice cloning from reference audio
    - Adjustable speed for therapy exercises
    - Multiple engine support with fallback
    """

    def __init__(self, default_engine: TTSEngine = TTSEngine.AUTO):
        self.default_engine = default_engine
        self._whisperspeech_pipe = None
        self._openai_client = None

    def _get_openai_client(self):
        """Lazy load OpenAI client."""
        if self._openai_client is None:
            from openai import OpenAI
            self._openai_client = OpenAI(api_key=settings.OPENAI_API_KEY)
        return self._openai_client

    def _get_whisperspeech(self):
        """Lazy load WhisperSpeech pipeline."""
        if self._whisperspeech_pipe is None:
            try:
                from whisperspeech.pipeline import Pipeline
                logging.info("Loading WhisperSpeech pipeline...")
                self._whisperspeech_pipe = Pipeline(
                    s2a_ref='collabora/whisperspeech:s2a-q4-tiny-en+pl.model'
                )
                logging.info("WhisperSpeech loaded successfully")
            except ImportError as e:
                logging.warning(f"WhisperSpeech not available: {e}")
                raise
        return self._whisperspeech_pipe

    def _select_engine(self, voice_reference: Optional[bytes] = None) -> TTSEngine:
        """Select TTS engine based on requirements."""
        if self.default_engine != TTSEngine.AUTO:
            return self.default_engine
        # Use WhisperSpeech for voice cloning
        if voice_reference:
            return TTSEngine.WHISPERSPEECH
        # Default to OpenAI for quality
        return TTSEngine.OPENAI_TTS

    def synthesize(
        self,
        text: str,
        voice: TTSVoice = TTSVoice.NEUTRAL,
        speed: float = 1.0,
        voice_reference: Optional[bytes] = None,
        engine: Optional[TTSEngine] = None,
        output_format: str = "wav"
    ) -> TTSResult:
        """
        Synthesize speech from text.

        Args:
            text: Text to synthesize
            voice: Voice preset to use
            speed: Speech rate (0.5 = slow, 1.0 = normal, 2.0 = fast)
            voice_reference: Audio bytes for voice cloning
            engine: Force specific engine
            output_format: Output format (wav, mp3)

        Returns:
            TTSResult with audio bytes
        """
        # Resolve AUTO (whether defaulted or passed explicitly) to a concrete engine
        if engine is None or engine == TTSEngine.AUTO:
            selected_engine = self._select_engine(voice_reference)
        else:
            selected_engine = engine
        logging.info(f"Synthesizing with engine: {selected_engine.value}")

        # Fallback chain: try the selected engine first, then OpenAI TTS
        fallback_order = [selected_engine]
        if selected_engine != TTSEngine.OPENAI_TTS:
            fallback_order.append(TTSEngine.OPENAI_TTS)

        last_error = None
        for eng in fallback_order:
            try:
                if eng == TTSEngine.OPENAI_TTS:
                    return self._synthesize_openai(text, voice, speed, output_format)
                elif eng == TTSEngine.WHISPERSPEECH:
                    return self._synthesize_whisperspeech(
                        text, voice_reference, speed, output_format
                    )
                elif eng == TTSEngine.EDGE_TTS:
                    return self._synthesize_edge_tts(text, voice, speed, output_format)
            except Exception as e:
                logging.warning(f"Engine {eng.value} failed: {e}")
                last_error = e
                continue

        raise RuntimeError(f"All TTS engines failed. Last error: {last_error}")

    def _synthesize_openai(
        self,
        text: str,
        voice: TTSVoice,
        speed: float,
        output_format: str
    ) -> TTSResult:
        """Synthesize using OpenAI TTS API."""
        logging.info("Synthesizing with OpenAI TTS")
        client = self._get_openai_client()

        # Map voice presets to OpenAI voices
        voice_map = {
            TTSVoice.NEUTRAL: "alloy",
            TTSVoice.WARM: "nova",
            TTSVoice.CLEAR: "onyx",
            TTSVoice.SLOW: "alloy",  # Use speed parameter
            TTSVoice.CUSTOM: "alloy",
        }

        response = client.audio.speech.create(
            model="tts-1",
            voice=voice_map.get(voice, "alloy"),
            input=text,
            speed=speed,
            response_format="wav" if output_format == "wav" else "mp3"
        )
        audio_bytes = response.content

        return TTSResult(
            audio_bytes=audio_bytes,
            format=output_format,
            sample_rate=24000,
            engine_used=TTSEngine.OPENAI_TTS
        )

    def _synthesize_whisperspeech(
        self,
        text: str,
        voice_reference: Optional[bytes],
        speed: float,
        output_format: str
    ) -> TTSResult:
        """Synthesize using WhisperSpeech with optional voice cloning."""
        logging.info("Synthesizing with WhisperSpeech")
        import torch
        import numpy as np

        pipe = self._get_whisperspeech()

        # Generate audio
        if voice_reference:
            # Voice cloning mode: write the reference audio to a temp file
            import tempfile
            import os
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                f.write(voice_reference)
                ref_path = f.name
            try:
                audio = pipe.generate(text, speaker=ref_path)
            finally:
                os.unlink(ref_path)
        else:
            audio = pipe.generate(text)

        # Convert to a 1-D numpy array
        if isinstance(audio, torch.Tensor):
            audio_np = audio.cpu().numpy()
        else:
            audio_np = np.array(audio)
        if audio_np.ndim > 1:
            audio_np = audio_np.squeeze()

        # Apply speed adjustment if needed
        if speed != 1.0:
            import librosa
            audio_np = librosa.effects.time_stretch(audio_np, rate=speed)

        # Convert to wav bytes
        import soundfile as sf
        buffer = io.BytesIO()
        sf.write(buffer, audio_np, 24000, format='WAV')
        buffer.seek(0)

        return TTSResult(
            audio_bytes=buffer.read(),
            format="wav",
            sample_rate=24000,
            engine_used=TTSEngine.WHISPERSPEECH,
            duration_seconds=len(audio_np) / 24000
        )

    def _synthesize_edge_tts(
        self,
        text: str,
        voice: TTSVoice,
        speed: float,
        output_format: str
    ) -> TTSResult:
        """Synthesize using Edge TTS (lightweight fallback)."""
        logging.info("Synthesizing with Edge TTS")
        import asyncio
        import edge_tts

        # Map voice presets to Edge TTS voices
        voice_map = {
            TTSVoice.NEUTRAL: "en-US-JennyNeural",
            TTSVoice.WARM: "en-US-AriaNeural",
            TTSVoice.CLEAR: "en-US-GuyNeural",
            TTSVoice.SLOW: "en-US-JennyNeural",
            TTSVoice.CUSTOM: "en-US-JennyNeural",
        }

        async def _generate():
            # Edge TTS expects a signed percentage rate offset, e.g. speed=0.9 -> "-10%"
            communicate = edge_tts.Communicate(
                text,
                voice_map.get(voice, "en-US-JennyNeural"),
                rate=f"{int((speed - 1) * 100):+d}%"
            )
            buffer = io.BytesIO()
            async for chunk in communicate.stream():
                if chunk["type"] == "audio":
                    buffer.write(chunk["data"])
            return buffer.getvalue()

        audio_bytes = asyncio.run(_generate())

        # Edge TTS streams MP3 audio regardless of the requested output_format
        return TTSResult(
            audio_bytes=audio_bytes,
            format="mp3",
            sample_rate=24000,
            engine_used=TTSEngine.EDGE_TTS
        )

    def generate_therapy_prompt(
        self,
        exercise_type: str,
        target_text: str,
        **kwargs
    ) -> TTSResult:
        """
        Generate therapy exercise audio prompt.

        Args:
            exercise_type: Type of exercise (repeat_after_me, pronunciation, etc.)
            target_text: The text to practice
            **kwargs: Additional synthesis parameters

        Returns:
            TTSResult with exercise audio
        """
        prompts = {
            "repeat_after_me": f"Please repeat after me: {target_text}",
            "pronunciation": f"Let's practice saying: {target_text}. Listen carefully.",
            "slower": f"Now try saying it more slowly: {target_text}",
            "word_by_word": f"Let's break it down. {target_text}",
            "encouragement": f"Great try! Let's practice {target_text} again.",
        }
        prompt_text = prompts.get(exercise_type, target_text)

        # Use slower speed for therapy prompts
        speed = kwargs.pop("speed", 0.9)

        return self.synthesize(
            text=prompt_text,
            speed=speed,
            voice=TTSVoice.CLEAR,
            **kwargs
        )


# Singleton instance
_therapy_tts_instance: Optional[TherapyTTS] = None


def get_therapy_tts() -> TherapyTTS:
    """Get or create TherapyTTS singleton."""
    global _therapy_tts_instance
    if _therapy_tts_instance is None:
        _therapy_tts_instance = TherapyTTS()
    return _therapy_tts_instance


def synthesize_speech(
    text: str,
    voice: TTSVoice = TTSVoice.NEUTRAL,
    speed: float = 1.0,
    voice_reference: Optional[bytes] = None
) -> TTSResult:
    """Convenience function for TTS synthesis."""
    tts = get_therapy_tts()
    return tts.synthesize(
        text=text,
        voice=voice,
        speed=speed,
        voice_reference=voice_reference
    )
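

if __name__ == "__main__":
    # Minimal local usage sketch (not part of the module's API): shows how the
    # convenience wrapper and the therapy-prompt helper are meant to be called.
    # It assumes api.config.settings provides ENVIRONMENT and OPENAI_API_KEY,
    # and that at least one backend (OpenAI TTS, WhisperSpeech, or edge-tts)
    # is installed and reachable; otherwise synthesize() raises RuntimeError.
    result = synthesize_speech("Hello, let's practice together.", voice=TTSVoice.WARM)
    print(f"engine={result.engine_used.value} format={result.format} "
          f"bytes={len(result.audio_bytes)}")

    # Slower, clearer prompt for a repetition exercise via the singleton instance.
    prompt = get_therapy_tts().generate_therapy_prompt(
        exercise_type="repeat_after_me",
        target_text="butterfly",
    )
    with open("therapy_prompt." + prompt.format, "wb") as out:
        out.write(prompt.audio_bytes)
    print(f"wrote therapy_prompt.{prompt.format} ({prompt.engine_used.value})")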