Y Phung Nguyen committed on
Commit 2fffb9d · 1 Parent(s): 98c58ec

Upd models loader #7

Files changed (3)
  1. pipeline.py +2 -1
  2. supervisor.py +3 -1
  3. ui.py +61 -33
pipeline.py CHANGED
@@ -26,6 +26,7 @@ from supervisor import (
 )
 
 MAX_CLINICAL_QA_ROUNDS = 5
+MAX_DURATION = 120
 _clinical_intake_sessions = {}
 _clinical_intake_lock = threading.Lock()
 
@@ -343,7 +344,7 @@ def _handle_clinical_answer(session_id: str, answer_text: str):
     return {"type": "question", "prompt": prompt}
 
 
-@spaces.GPU(max_duration=120)
+@spaces.GPU(max_duration=MAX_DURATION)
 def stream_chat(
     message: str,
     history: list,
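The pipeline.py change hoists the hard-coded 120-second limit into a module-level MAX_DURATION constant that the @spaces.GPU decorator references. A minimal sketch of why the constant has to be defined above the decorated function, using a hypothetical gpu_task decorator in place of spaces.GPU (decorator arguments are evaluated once, when the module is imported):

# Hypothetical stand-in for spaces.GPU; the keyword name is only illustrative.
def gpu_task(max_duration):
    def wrap(fn):
        fn.max_duration = max_duration  # value captured at decoration (import) time
        return fn
    return wrap

MAX_DURATION = 120  # must already exist when the decorator line below runs

@gpu_task(max_duration=MAX_DURATION)
def stream_chat(message, history):
    yield message

print(stream_chat.max_duration)  # 120, even if MAX_DURATION is rebound afterwards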
supervisor.py CHANGED
@@ -12,6 +12,8 @@ from utils import format_prompt_manually
 MAX_SUBTASKS = 3
 # Maximum number of search strategies
 MAX_SEARCH_STRATEGIES = 3
+# Maximum duration for GPU requests
+MAX_DURATION = 120
 
 try:
     import nest_asyncio
@@ -23,7 +25,7 @@ async def gemini_supervisor_breakdown_async(
     use_rag: bool,
     use_web_search: bool,
     time_elapsed: float,
-    max_duration: int = 120,
+    max_duration: int = MAX_DURATION,
     previous_answer: str | None = None,
 ) -> dict:
     """Gemini Supervisor: Break user query into sub-topics.
ui.py CHANGED
@@ -19,6 +19,7 @@ from models import (
 )
 from logger import logger
 
+MAX_DURATION = 120
 
 def create_demo():
     """Create and return Gradio demo interface"""
@@ -292,7 +293,7 @@ def create_demo():
     )
 
     # GPU-decorated function to load any model (for user selection)
-    # @spaces.GPU(max_duration=120)
+    @spaces.GPU(max_duration=MAX_DURATION)
     def load_model_with_gpu(model_name):
         """Load medical model (GPU-decorated for ZeroGPU compatibility)"""
         try:
@@ -406,7 +407,7 @@ def create_demo():
 
     # GPU-decorated function to load ALL models sequentially on startup
    # This prevents ZeroGPU conflicts from multiple simultaneous GPU requests
-    # @spaces.GPU(max_duration=120)
+    @spaces.GPU(max_duration=MAX_DURATION)
    def load_all_models_on_startup():
        """Load all models sequentially in a single GPU session to avoid ZeroGPU conflicts"""
        import time
@@ -562,7 +563,7 @@ def create_demo():
     )
 
     # GPU-decorated function to load Whisper ASR model on-demand
-    @spaces.GPU(max_duration=120)
+    @spaces.GPU(max_duration=MAX_DURATION)
     def load_whisper_model_on_demand():
         """Load Whisper ASR model when needed"""
         try:
@@ -610,37 +611,64 @@ def create_demo():
         use_rag, medical_model_name, use_web_search,
         enable_clinical_intake, disable_agentic_reasoning, show_thoughts, request: gr.Request = None
     ):
-        try:
-            # Check if model is currently loading (don't block if it's already loaded)
-            loading_state = get_model_loading_state(medical_model_name)
-            if loading_state == "loading" and not is_model_loaded(medical_model_name):
-                error_msg = f"⏳ {medical_model_name} is still loading. Please wait until the model status shows 'loaded and ready' before sending messages."
-                updated_history = history + [{"role": "assistant", "content": error_msg}]
-                yield updated_history, ""
+        import time
+        max_retries = 2
+        base_delay = 2.0
+
+        for attempt in range(max_retries):
+            try:
+                # Check if model is currently loading (don't block if it's already loaded)
+                loading_state = get_model_loading_state(medical_model_name)
+                if loading_state == "loading" and not is_model_loaded(medical_model_name):
+                    error_msg = f"⏳ {medical_model_name} is still loading. Please wait until the model status shows 'loaded and ready' before sending messages."
+                    updated_history = history + [{"role": "assistant", "content": error_msg}]
+                    yield updated_history, ""
+                    return
+
+                # If request is None, create a mock request for compatibility
+                if request is None:
+                    class MockRequest:
+                        session_hash = "anonymous"
+                    request = MockRequest()
+
+                # Let stream_chat handle model loading (it's GPU-decorated and can load on-demand)
+                for result in stream_chat(
+                    message, history, system_prompt, temperature, max_new_tokens,
+                    top_p, top_k, penalty, retriever_k, merge_threshold,
+                    use_rag, medical_model_name, use_web_search,
+                    enable_clinical_intake, disable_agentic_reasoning, show_thoughts, request
+                ):
+                    yield result
+                # If we get here, stream_chat completed successfully
                 return
-
-            # If request is None, create a mock request for compatibility
-            if request is None:
-                class MockRequest:
-                    session_hash = "anonymous"
-                request = MockRequest()
-
-            # Let stream_chat handle model loading (it's GPU-decorated and can load on-demand)
-            for result in stream_chat(
-                message, history, system_prompt, temperature, max_new_tokens,
-                top_p, top_k, penalty, retriever_k, merge_threshold,
-                use_rag, medical_model_name, use_web_search,
-                enable_clinical_intake, disable_agentic_reasoning, show_thoughts, request
-            ):
-                yield result
-        except Exception as e:
-            # Handle any errors gracefully
-            logger.error(f"Error in stream_chat_with_model_check: {e}")
-            import traceback
-            logger.debug(f"Full traceback: {traceback.format_exc()}")
-            error_msg = f"⚠️ An error occurred: {str(e)[:200]}"
-            updated_history = history + [{"role": "assistant", "content": error_msg}]
-            yield updated_history, ""
+
+            except Exception as e:
+                error_msg_lower = str(e).lower()
+                is_gpu_error = 'gpu task aborted' in error_msg_lower or 'gpu' in error_msg_lower or 'zerogpu' in error_msg_lower
+
+                if is_gpu_error and attempt < max_retries - 1:
+                    delay = base_delay * (2 ** attempt)  # Exponential backoff: 2s, 4s
+                    logger.warning(f"[STREAM_CHAT] GPU task aborted (attempt {attempt + 1}/{max_retries}), retrying after {delay}s...")
+                    # Yield a message to user about retry
+                    retry_msg = f"⏳ GPU task was interrupted. Retrying in {delay}s... (attempt {attempt + 1}/{max_retries})"
+                    updated_history = history + [{"role": "assistant", "content": retry_msg}]
+                    yield updated_history, ""
+                    time.sleep(delay)
+                    continue
+                else:
+                    # Final error handling
+                    logger.error(f"[STREAM_CHAT] Error in stream_chat_with_model_check: {e}")
+                    import traceback
+                    logger.error(f"[STREAM_CHAT] Full traceback: {traceback.format_exc()}")
+
+                    if is_gpu_error:
+                        error_msg = f"⚠️ GPU task was aborted. This can happen if:\n- The request took too long\n- Multiple GPU requests conflicted\n- GPU quota was exceeded\n\nPlease try again or select a different model."
+                    else:
+                        error_msg = f"⚠️ An error occurred: {str(e)[:200]}"
+
+                    updated_history = history + [{"role": "assistant", "content": error_msg}]
+                    yield updated_history, ""
+                    return
 
     submit_button.click(
         fn=stream_chat_with_model_check,
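The bulk of the ui.py change wraps the streaming call in a retry loop with exponential backoff, yielding a status message between attempts. A stripped-down sketch of that pattern follows; `stream_with_retry`, `flaky`, and the GPU-error test are placeholders for stream_chat_with_model_check, stream_chat, and the is_gpu_error check in the diff:

import time

def stream_with_retry(run_stream, max_retries=2, base_delay=2.0):
    """Re-run a streaming generator on transient errors, backing off 2s, 4s, ..."""
    for attempt in range(max_retries):
        try:
            for chunk in run_stream():
                yield chunk
            return  # stream finished without raising
        except Exception as exc:
            transient = "gpu" in str(exc).lower()  # stands in for the real GPU-error check
            if transient and attempt < max_retries - 1:
                delay = base_delay * (2 ** attempt)  # exponential backoff
                yield f"⏳ interrupted, retrying in {delay}s (attempt {attempt + 1}/{max_retries})"
                time.sleep(delay)
            else:
                yield f"⚠️ failed after {attempt + 1} attempt(s): {exc}"
                return

# Usage: the first attempt aborts, the retry streams to completion.
calls = {"n": 0}
def flaky():
    calls["n"] += 1
    if calls["n"] == 1:
        raise RuntimeError("GPU task aborted")
    yield "partial answer"
    yield "final answer"

for out in stream_with_retry(flaky, base_delay=0.1):
    print(out)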