"""
Therapy TTS Module - Text-to-speech for therapy and AAC applications.
Supports:
- WhisperSpeech (fast, voice cloning)
- OpenAI TTS API (fallback)
- Edge TTS (lightweight fallback)
"""
import io
import logging
from enum import Enum
from typing import Optional
from dataclasses import dataclass

from api.config import settings

if settings.ENVIRONMENT == "development":
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.WARNING)


class TTSEngine(str, Enum):
    """Available TTS engines."""

    WHISPERSPEECH = "whisperspeech"
    OPENAI_TTS = "openai_tts"
    EDGE_TTS = "edge_tts"
    AUTO = "auto"


class TTSVoice(str, Enum):
    """Preset voice options."""

    NEUTRAL = "neutral"
    WARM = "warm"
    CLEAR = "clear"
    SLOW = "slow"      # For therapy exercises
    CUSTOM = "custom"  # Voice cloning


@dataclass
class TTSResult:
    """TTS synthesis result."""

    audio_bytes: bytes
    format: str  # "wav" or "mp3"
    sample_rate: int
    engine_used: TTSEngine
    duration_seconds: Optional[float] = None
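
# Illustrative usage (comment only, not part of the original module): callers
# are expected to write `audio_bytes` straight to disk or stream them over
# HTTP, taking the file extension from `format`:
#
#     result = synthesize_speech("Hello")   # convenience wrapper, see below
#     with open(f"prompt.{result.format}", "wb") as f:
#         f.write(result.audio_bytes)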


class TherapyTTS:
    """
    TTS engine for therapy applications.

    Features:
    - Voice cloning from reference audio
    - Adjustable speed for therapy exercises
    - Multiple engine support with fallback
    """

    def __init__(self, default_engine: TTSEngine = TTSEngine.AUTO):
        self.default_engine = default_engine
        self._whisperspeech_pipe = None
        self._openai_client = None

    def _get_openai_client(self):
        """Lazy load OpenAI client."""
        if self._openai_client is None:
            from openai import OpenAI
            self._openai_client = OpenAI(api_key=settings.OPENAI_API_KEY)
        return self._openai_client

    def _get_whisperspeech(self):
        """Lazy load WhisperSpeech pipeline."""
        if self._whisperspeech_pipe is None:
            try:
                from whisperspeech.pipeline import Pipeline
                logging.info("Loading WhisperSpeech pipeline...")
                self._whisperspeech_pipe = Pipeline(
                    s2a_ref='collabora/whisperspeech:s2a-q4-tiny-en+pl.model'
                )
                logging.info("WhisperSpeech loaded successfully")
            except ImportError as e:
                logging.warning(f"WhisperSpeech not available: {e}")
                raise
        return self._whisperspeech_pipe

    def _select_engine(self, voice_reference: Optional[bytes] = None) -> TTSEngine:
        """Select TTS engine based on requirements."""
        if self.default_engine != TTSEngine.AUTO:
            return self.default_engine
        # Use WhisperSpeech for voice cloning
        if voice_reference:
            return TTSEngine.WHISPERSPEECH
        # Default to OpenAI for quality
        return TTSEngine.OPENAI_TTS

    def synthesize(
        self,
        text: str,
        voice: TTSVoice = TTSVoice.NEUTRAL,
        speed: float = 1.0,
        voice_reference: Optional[bytes] = None,
        engine: Optional[TTSEngine] = None,
        output_format: str = "wav"
    ) -> TTSResult:
        """
        Synthesize speech from text.

        Args:
            text: Text to synthesize
            voice: Voice preset to use
            speed: Speech rate (0.5 = slow, 1.0 = normal, 2.0 = fast)
            voice_reference: Audio bytes for voice cloning
            engine: Force a specific engine (AUTO defers to automatic selection)
            output_format: Output format (wav, mp3)

        Returns:
            TTSResult with audio bytes
        """
        if engine and engine != TTSEngine.AUTO:
            selected_engine = engine
        else:
            selected_engine = self._select_engine(voice_reference)
        logging.info(f"Synthesizing with engine: {selected_engine.value}")

        # Fallback chain: try the selected engine first, then OpenAI,
        # then Edge TTS as the lightweight last resort.
        fallback_order = [selected_engine]
        for fallback in (TTSEngine.OPENAI_TTS, TTSEngine.EDGE_TTS):
            if fallback not in fallback_order:
                fallback_order.append(fallback)

        last_error = None
        for eng in fallback_order:
            try:
                if eng == TTSEngine.OPENAI_TTS:
                    return self._synthesize_openai(text, voice, speed, output_format)
                elif eng == TTSEngine.WHISPERSPEECH:
                    return self._synthesize_whisperspeech(
                        text, voice_reference, speed, output_format
                    )
                elif eng == TTSEngine.EDGE_TTS:
                    return self._synthesize_edge_tts(text, voice, speed, output_format)
            except Exception as e:
                logging.warning(f"Engine {eng.value} failed: {e}")
                last_error = e
                continue
        raise RuntimeError(f"All TTS engines failed. Last error: {last_error}")
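
    # Illustrative call pattern (comment only, added for documentation;
    # "reference_voice.wav" is a hypothetical file):
    #
    #     tts = TherapyTTS()
    #     with open("reference_voice.wav", "rb") as f:
    #         ref = f.read()
    #     result = tts.synthesize(
    #         "Let's practice together.",
    #         speed=0.8,
    #         voice_reference=ref,  # AUTO selection routes this to WhisperSpeech
    #     )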

    def _synthesize_openai(
        self,
        text: str,
        voice: TTSVoice,
        speed: float,
        output_format: str
    ) -> TTSResult:
        """Synthesize using OpenAI TTS API."""
        logging.info("Synthesizing with OpenAI TTS")
        client = self._get_openai_client()

        # Map voice presets to OpenAI voices
        voice_map = {
            TTSVoice.NEUTRAL: "alloy",
            TTSVoice.WARM: "nova",
            TTSVoice.CLEAR: "onyx",
            TTSVoice.SLOW: "alloy",   # Use speed parameter
            TTSVoice.CUSTOM: "alloy",
        }
        response = client.audio.speech.create(
            model="tts-1",
            voice=voice_map.get(voice, "alloy"),
            input=text,
            speed=speed,
            response_format="wav" if output_format == "wav" else "mp3"
        )
        audio_bytes = response.content
        return TTSResult(
            audio_bytes=audio_bytes,
            format=output_format,
            sample_rate=24000,
            engine_used=TTSEngine.OPENAI_TTS
        )

    def _synthesize_whisperspeech(
        self,
        text: str,
        voice_reference: Optional[bytes],
        speed: float,
        output_format: str
    ) -> TTSResult:
        """Synthesize using WhisperSpeech with optional voice cloning."""
        logging.info("Synthesizing with WhisperSpeech")
        import torch
        import numpy as np

        pipe = self._get_whisperspeech()

        # Generate audio
        if voice_reference:
            # Voice cloning mode: write the reference audio to a temp file,
            # since the pipeline takes a path to the speaker sample
            import tempfile
            import os
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                f.write(voice_reference)
                ref_path = f.name
            try:
                audio = pipe.generate(text, speaker=ref_path)
            finally:
                os.unlink(ref_path)
        else:
            audio = pipe.generate(text)

        # Convert to bytes
        if isinstance(audio, torch.Tensor):
            audio_np = audio.cpu().numpy()
        else:
            audio_np = np.array(audio)
        # Ensure correct shape
        if audio_np.ndim > 1:
            audio_np = audio_np.squeeze()

        # Apply speed adjustment if needed
        if speed != 1.0:
            import librosa
            audio_np = librosa.effects.time_stretch(audio_np, rate=speed)

        # Convert to wav bytes
        import soundfile as sf
        buffer = io.BytesIO()
        sf.write(buffer, audio_np, 24000, format='WAV')
        buffer.seek(0)
        return TTSResult(
            audio_bytes=buffer.read(),
            format="wav",
            sample_rate=24000,
            engine_used=TTSEngine.WHISPERSPEECH,
            duration_seconds=len(audio_np) / 24000
        )

    def _synthesize_edge_tts(
        self,
        text: str,
        voice: TTSVoice,
        speed: float,
        output_format: str
    ) -> TTSResult:
        """Synthesize using Edge TTS (lightweight fallback)."""
        logging.info("Synthesizing with Edge TTS")
        import asyncio
        import edge_tts

        # Map voice presets to Edge TTS voices
        voice_map = {
            TTSVoice.NEUTRAL: "en-US-JennyNeural",
            TTSVoice.WARM: "en-US-AriaNeural",
            TTSVoice.CLEAR: "en-US-GuyNeural",
            TTSVoice.SLOW: "en-US-JennyNeural",
            TTSVoice.CUSTOM: "en-US-JennyNeural",
        }

        async def _generate():
            # Edge TTS expects the rate as a signed percentage string
            communicate = edge_tts.Communicate(
                text,
                voice_map.get(voice, "en-US-JennyNeural"),
                rate=f"{int(round((speed - 1) * 100)):+d}%"
            )
            buffer = io.BytesIO()
            async for chunk in communicate.stream():
                if chunk["type"] == "audio":
                    buffer.write(chunk["data"])
            return buffer.getvalue()

        audio_bytes = asyncio.run(_generate())
        return TTSResult(
            audio_bytes=audio_bytes,
            format="mp3",  # Edge TTS streams MP3 audio
            sample_rate=24000,
            engine_used=TTSEngine.EDGE_TTS
        )
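
    # Worked examples of the rate mapping above (comment only): speed 0.8
    # maps to "-20%", 1.0 to "+0%", and 1.25 to "+25%". round() matters here:
    # plain int() would truncate 0.9's -9.999... to "-9%" instead of "-10%".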

    def generate_therapy_prompt(
        self,
        exercise_type: str,
        target_text: str,
        **kwargs
    ) -> TTSResult:
        """
        Generate therapy exercise audio prompt.

        Args:
            exercise_type: Type of exercise (repeat_after_me, pronunciation, etc.)
            target_text: The text to practice
            **kwargs: Additional synthesis parameters

        Returns:
            TTSResult with exercise audio
        """
        prompts = {
            "repeat_after_me": f"Please repeat after me: {target_text}",
            "pronunciation": f"Let's practice saying: {target_text}. Listen carefully.",
            "slower": f"Now try saying it more slowly: {target_text}",
            "word_by_word": f"Let's break it down. {target_text}",
            "encouragement": f"Great try! Let's practice {target_text} again.",
        }
        prompt_text = prompts.get(exercise_type, target_text)

        # Use a slightly slower default speed for therapy prompts; pop both
        # overrides out of kwargs so synthesize() never gets them twice.
        speed = kwargs.pop("speed", 0.9)
        voice = kwargs.pop("voice", TTSVoice.CLEAR)
        return self.synthesize(
            text=prompt_text,
            speed=speed,
            voice=voice,
            **kwargs
        )
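
    # Illustrative call (comment only; "repeat_after_me" is one of the keys
    # in `prompts` above):
    #
    #     audio = get_therapy_tts().generate_therapy_prompt(
    #         "repeat_after_me", "butterfly"
    #     )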


# Singleton instance
_therapy_tts_instance: Optional[TherapyTTS] = None


def get_therapy_tts() -> TherapyTTS:
    """Get or create TherapyTTS singleton."""
    global _therapy_tts_instance
    if _therapy_tts_instance is None:
        _therapy_tts_instance = TherapyTTS()
    return _therapy_tts_instance


def synthesize_speech(
    text: str,
    voice: TTSVoice = TTSVoice.NEUTRAL,
    speed: float = 1.0,
    voice_reference: Optional[bytes] = None
) -> TTSResult:
    """Convenience function for TTS synthesis."""
    tts = get_therapy_tts()
    return tts.synthesize(
        text=text,
        voice=voice,
        speed=speed,
        voice_reference=voice_reference
    )
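

if __name__ == "__main__":
    # Minimal smoke test, added for illustration. It assumes api.config
    # settings provide credentials for at least one engine (e.g. an
    # OPENAI_API_KEY), or that WhisperSpeech / edge-tts are installed.
    result = synthesize_speech("Hello! Let's practice speaking together.", speed=0.9)
    print(f"engine={result.engine_used.value} format={result.format} "
          f"bytes={len(result.audio_bytes)}")
    with open(f"demo_output.{result.format}", "wb") as out:
        out.write(result.audio_bytes)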