Y Phung Nguyen committed on
Commit d0e54ed · 1 Parent(s): b61cc05

Upd maya configs

Files changed (5)
  1. .gitignore +1 -0
  2. models.py +46 -8
  3. requirements.txt +1 -8
  4. ui.py +3 -2
  5. voice.py +142 -11
.gitignore CHANGED
@@ -1,3 +1,4 @@
 .env
 .setup.txt
+maya.txt
 __pycache__/
models.py CHANGED
@@ -9,6 +9,14 @@ from logger import logger
 import config
 import spaces
 
+try:
+    from snac import SNAC
+    SNAC_AVAILABLE = True
+except ImportError:
+    SNAC_AVAILABLE = False
+    SNAC = None
+
+# For backward compatibility, check TTS library too (but we use Maya1 directly)
 try:
     from TTS.api import TTS
     TTS_AVAILABLE = True
@@ -242,10 +250,12 @@ def move_model_to_gpu(model_name: str):
     return model
 
 def initialize_tts_model():
-    """Initialize TTS model for text-to-speech"""
-    if not TTS_AVAILABLE:
-        logger.warning("TTS library not installed. TTS features will be disabled.")
+    """Initialize Maya1 TTS model for text-to-speech using transformers and SNAC"""
+    if not SNAC_AVAILABLE:
+        logger.warning("SNAC library not installed. Maya1 TTS features will be disabled.")
+        logger.warning("Install with: pip install snac")
         return None
+
     if config.global_tts_model is None:
         try:
             # Clear GPU cache before loading
@@ -253,17 +263,45 @@ def initialize_tts_model():
                 torch.cuda.empty_cache()
                 logger.debug("Cleared GPU cache before TTS model loading")
 
-            logger.info("Initializing TTS model for voice generation...")
-            config.global_tts_model = TTS(model_name=config.TTS_MODEL, progress_bar=False)
-            logger.info("TTS model initialized successfully")
+            logger.info("Initializing Maya1 TTS model with Transformers...")
+
+            # Load Maya1 model and tokenizer
+            model = AutoModelForCausalLM.from_pretrained(
+                config.TTS_MODEL,
+                torch_dtype=torch.bfloat16,
+                device_map="auto",
+                trust_remote_code=True,
+                token=config.HF_TOKEN
+            )
+            tokenizer = AutoTokenizer.from_pretrained(
+                config.TTS_MODEL,
+                trust_remote_code=True,
+                token=config.HF_TOKEN
+            )
+
+            logger.info("Loading SNAC decoder...")
+            snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()
+            if torch.cuda.is_available():
+                snac_model = snac_model.to("cuda")
+
+            # Store as a dictionary with model, tokenizer, and snac_model
+            config.global_tts_model = {
+                "model": model,
+                "tokenizer": tokenizer,
+                "snac_model": snac_model
+            }
+
+            logger.info("Maya1 TTS model initialized successfully")
 
             # Clear cache after loading
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
                 logger.debug("Cleared GPU cache after TTS model loading")
         except Exception as e:
-            logger.warning(f"TTS model initialization failed: {e}")
-            logger.warning("TTS features will be disabled. If pyworld dependency is missing, try: pip install TTS --no-deps && pip install coqui-tts")
+            logger.warning(f"Maya1 TTS model initialization failed: {e}")
+            import traceback
+            logger.warning(f"TTS initialization traceback: {traceback.format_exc()}")
+            logger.warning("TTS features will be disabled. Install dependencies: pip install snac transformers")
            config.global_tts_model = None
            # Clear cache on error
            if torch.cuda.is_available():
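Note: these hunks call AutoModelForCausalLM and AutoTokenizer without adding an import, so models.py presumably already imports them from transformers elsewhere. For reference, a minimal sketch (not the commit's code) of the bundle initialize_tts_model() now stores in config.global_tts_model, assuming config.TTS_MODEL names the Maya1 checkpoint and config.HF_TOKEN is set:

    # Sketch only - mirrors the structure built in the hunk above.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from snac import SNAC

    MODEL_ID = "maya-research/maya1"   # hypothetical value of config.TTS_MODEL

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, torch_dtype=torch.bfloat16, device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()

    # Downstream code (voice.py) expects exactly these three keys.
    tts_bundle = {"model": model, "tokenizer": tokenizer, "snac_model": snac_model}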
requirements.txt CHANGED
@@ -15,8 +15,6 @@ gradio
 gradio[mcp]
 fastmcp
 # MCP dependencies (required for Gemini MCP)
-# Install MCP SDK: pip install mcp
-# The MCP package provides Model Context Protocol server and client functionality
 mcp>=0.1.0
 nest-asyncio
 google-generativeai
@@ -28,12 +26,7 @@ spaces
 soundfile
 numpy<2.0.0
 setuptools>=65.0.0
-# TTS installation (OPTIONAL) - TTS features work without it
-# If you want TTS functionality, install manually due to pyworld build issues:
-# Option 1: pip install TTS --no-deps && pip install coqui-tts
-# Option 2: pip install TTS (may fail on pyworld, but TTS will work for most models without it)
-# The app will run without TTS - voice generation will be disabled
-# TTS
 
 # ASR (Automatic Speech Recognition) - Whisper for speech-to-text (via Hugging Face transformers)
 torchaudio
+snac
ui.py CHANGED
@@ -16,6 +16,7 @@ from models import (
     initialize_tts_model,
     initialize_whisper_model,
     TTS_AVAILABLE,
+    SNAC_AVAILABLE,
     WHISPER_AVAILABLE,
 )
 from logger import logger
@@ -362,7 +363,7 @@ def create_demo():
         status_lines.append(f"⏳ MedSwin ({model_name}): loading...")
 
         # TTS model status (only show if available or if there's an issue)
-        if TTS_AVAILABLE:
+        if SNAC_AVAILABLE:
             if config.global_tts_model is not None:
                 status_lines.append("✅ TTS (maya1): loaded and ready")
             else:
@@ -402,7 +403,7 @@ def create_demo():
         status_lines.append(f"⚠️ MedSwin ({model_name}): not loaded")
 
         # TTS model status (only show if available and loaded)
-        if TTS_AVAILABLE:
+        if SNAC_AVAILABLE:
            if config.global_tts_model is not None:
                status_lines.append("✅ TTS (maya1): loaded and ready")
            # Don't show if TTS library available but model not loaded (optional feature)
voice.py CHANGED
@@ -8,7 +8,22 @@ import numpy as np
 from logger import logger
 from client import MCP_AVAILABLE, call_agent, get_mcp_session, get_cached_mcp_tools
 import config
-from models import TTS_AVAILABLE, WHISPER_AVAILABLE, initialize_tts_model, initialize_whisper_model
+from models import TTS_AVAILABLE, SNAC_AVAILABLE, WHISPER_AVAILABLE, initialize_tts_model, initialize_whisper_model
+
+# Maya1 constants (from maya1 docs)
+CODE_START_TOKEN_ID = 128257
+CODE_END_TOKEN_ID = 128258
+CODE_TOKEN_OFFSET = 128266
+SNAC_MIN_ID = 128266
+SNAC_MAX_ID = 156937
+SOH_ID = 128259
+EOH_ID = 128260
+SOA_ID = 128261
+TEXT_EOT_ID = 128009
+AUDIO_SAMPLE_RATE = 24000
+
+# Default voice description for Maya1
+DEFAULT_VOICE_DESCRIPTION = "Realistic male voice in the 30s age with a american accent. Normal pitch, warm timbre, conversational pacing, neutral tone delivery at medium intensity, podcast domain, narrator role, neutral delivery"
 import spaces
 
 try:
@@ -408,7 +423,52 @@ def _generate_speech_via_mcp(text: str):
         logger.warning(f"MCP TTS error (sync wrapper): {e}")
         return None
 
-def _generate_speech_with_gpu(text: str):
+def build_maya1_prompt(tokenizer, description: str, text: str) -> str:
+    """Build formatted prompt for Maya1."""
+    soh_token = tokenizer.decode([SOH_ID])
+    eoh_token = tokenizer.decode([EOH_ID])
+    soa_token = tokenizer.decode([SOA_ID])
+    sos_token = tokenizer.decode([CODE_START_TOKEN_ID])
+    eot_token = tokenizer.decode([TEXT_EOT_ID])
+    bos_token = tokenizer.bos_token
+
+    formatted_text = f'<description="{description}"> {text}'
+    prompt = (
+        soh_token + bos_token + formatted_text + eot_token +
+        eoh_token + soa_token + sos_token
+    )
+    return prompt
+
+def unpack_snac_from_7(snac_tokens: list) -> list:
+    """Unpack 7-token SNAC frames to 3 hierarchical levels."""
+    if snac_tokens and snac_tokens[-1] == CODE_END_TOKEN_ID:
+        snac_tokens = snac_tokens[:-1]
+
+    frames = len(snac_tokens) // 7
+    snac_tokens = snac_tokens[:frames * 7]
+
+    if frames == 0:
+        return [[], [], []]
+
+    l1, l2, l3 = [], [], []
+
+    for i in range(frames):
+        slots = snac_tokens[i*7:(i+1)*7]
+        l1.append((slots[0] - CODE_TOKEN_OFFSET) % 4096)
+        l2.extend([
+            (slots[1] - CODE_TOKEN_OFFSET) % 4096,
+            (slots[4] - CODE_TOKEN_OFFSET) % 4096,
+        ])
+        l3.extend([
+            (slots[2] - CODE_TOKEN_OFFSET) % 4096,
+            (slots[3] - CODE_TOKEN_OFFSET) % 4096,
+            (slots[5] - CODE_TOKEN_OFFSET) % 4096,
+            (slots[6] - CODE_TOKEN_OFFSET) % 4096,
+        ])
+
+    return [l1, l2, l3]
+
+def _generate_speech_with_gpu(text: str, description: str = None):
     """Internal GPU-decorated function for TTS generation when TTS is available."""
     if config.global_tts_model is None:
         logger.info("[TTS] TTS model not loaded, initializing...")
@@ -418,13 +478,83 @@ def _generate_speech_with_gpu(text: str):
         logger.error("[TTS] TTS model not available. Please check dependencies.")
         return None
 
+    # Check if it's the new Maya1 format (dictionary) or old format
+    if not isinstance(config.global_tts_model, dict):
+        logger.error("[TTS] TTS model format is incorrect. Expected dictionary with model, tokenizer, snac_model.")
+        return None
+
     try:
-        logger.info("[TTS] Running TTS generation...")
-        wav = config.global_tts_model.tts(text)
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
-            sf.write(tmp_file.name, wav, samplerate=22050)
-            logger.info(f"[TTS] Speech generated successfully: {tmp_file.name}")
-            return tmp_file.name
+        model = config.global_tts_model["model"]
+        tokenizer = config.global_tts_model["tokenizer"]
+        snac_model = config.global_tts_model["snac_model"]
+
+        # Use default description if not provided
+        if description is None:
+            description = DEFAULT_VOICE_DESCRIPTION
+
+        logger.info("[TTS] Running Maya1 TTS generation...")
+
+        # Build prompt
+        prompt = build_maya1_prompt(tokenizer, description, text)
+        inputs = tokenizer(prompt, return_tensors="pt")
+
+        if torch.cuda.is_available():
+            inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
+        # Generate tokens
+        with torch.inference_mode():
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=1500,
+                min_new_tokens=28,
+                temperature=0.4,
+                top_p=0.9,
+                repetition_penalty=1.1,
+                do_sample=True,
+                eos_token_id=CODE_END_TOKEN_ID,
+                pad_token_id=tokenizer.pad_token_id,
+            )
+
+        # Extract SNAC tokens
+        generated_ids = outputs[0, inputs['input_ids'].shape[1]:].tolist()
+
+        # Find EOS and extract SNAC codes
+        eos_idx = generated_ids.index(CODE_END_TOKEN_ID) if CODE_END_TOKEN_ID in generated_ids else len(generated_ids)
+        snac_tokens = [t for t in generated_ids[:eos_idx] if SNAC_MIN_ID <= t <= SNAC_MAX_ID]
+
+        if len(snac_tokens) < 7:
+            logger.error(f"[TTS] Not enough tokens generated ({len(snac_tokens)}). Try different text or increase max_tokens.")
+            return None
+
+        # Unpack and decode
+        levels = unpack_snac_from_7(snac_tokens)
+        frames = len(levels[0])
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        codes_tensor = [torch.tensor(level, dtype=torch.long, device=device).unsqueeze(0) for level in levels]
+
+        with torch.inference_mode():
+            z_q = snac_model.quantizer.from_codes(codes_tensor)
+            audio = snac_model.decoder(z_q)[0, 0].cpu().numpy()
+
+        # Trim warmup
+        if len(audio) > 2048:
+            audio = audio[2048:]
+
+        # Convert to WAV and save to temporary file
+        audio_int16 = (audio * 32767).astype(np.int16)
+
+        # Create temporary file
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
+            tmp_path = tmp_file.name
+
+        # Save audio
+        sf.write(tmp_path, audio_int16, AUDIO_SAMPLE_RATE)
+
+        duration = len(audio) / AUDIO_SAMPLE_RATE
+        logger.info(f"[TTS] ✅ Speech generated successfully: {tmp_path} ({duration:.2f}s)")
+        return tmp_path
+
     except Exception as e:
         logger.error(f"[TTS] TTS error (local maya1): {e}")
         import traceback
@@ -452,15 +582,16 @@ def generate_speech(text: str):
     logger.info(f"[TTS] Generating speech for text: {text[:50]}...")
 
     # Check TTS availability first - avoid GPU allocation if not available
-    if not TTS_AVAILABLE:
-        logger.warning("[TTS] TTS library not installed. Trying MCP fallback...")
+    # Use SNAC_AVAILABLE for Maya1, but keep TTS_AVAILABLE check for backward compatibility
+    if not SNAC_AVAILABLE:
+        logger.warning("[TTS] SNAC library not installed (required for Maya1). Trying MCP fallback...")
         # Try MCP-based TTS if available (doesn't require GPU)
         audio_path = _generate_speech_via_mcp(text)
         if audio_path:
            logger.info(f"[TTS] ✅ Generated via MCP fallback: {audio_path}")
            return audio_path
        else:
-            logger.error("[TTS] ❌ TTS library not installed and MCP fallback failed. Please install TTS: pip install TTS --no-deps && pip install coqui-tts")
+            logger.error("[TTS] ❌ SNAC library not installed and MCP fallback failed. Please install: pip install snac")
            return None
 
    # TTS is available - use GPU-decorated function
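For clarity, unpack_snac_from_7() above maps each 7-token Maya1 frame onto SNAC's three codebook levels: slot 0 feeds level 1 (one code per frame), slots 1 and 4 feed level 2 (two codes per frame), and slots 2, 3, 5, 6 feed level 3 (four codes per frame), each reduced modulo 4096 after subtracting CODE_TOKEN_OFFSET. A small illustrative sketch with made-up slot values:

    # Illustration only: one 7-slot frame of generated token IDs.
    CODE_TOKEN_OFFSET = 128266
    frame = [CODE_TOKEN_OFFSET + c for c in (7, 11, 3, 9, 21, 5, 2)]    # slots 0..6

    l1 = [(frame[0] - CODE_TOKEN_OFFSET) % 4096]                        # [7]
    l2 = [(frame[i] - CODE_TOKEN_OFFSET) % 4096 for i in (1, 4)]        # [11, 21]
    l3 = [(frame[i] - CODE_TOKEN_OFFSET) % 4096 for i in (2, 3, 5, 6)]  # [3, 9, 5, 2]

Callers go through the existing entry point; a hypothetical call site would look like:

    from voice import generate_speech

    wav_path = generate_speech("Hello, this is a Maya1 test.")  # path to a 24 kHz WAV, or None
    if wav_path:
        print(f"Audio written to {wav_path}")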