VibecoderMcSwaggins committed
Commit cbd94a4 · 1 Parent(s): 3f90da8

refactor: address CodeRabbit review feedback for Mario integration


CRITICAL fix:
- Centralize sandbox library versions in a single SANDBOX_LIBRARIES constant
- Use the shared config for both the Modal image and the LLM prompts
- Prevent version mismatch between generated code and the execution environment (pattern sketched below)
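
A minimal sketch of the shared-constant pattern (it matches the src/tools/code_execution.py diff below; the dict is the single source of truth and both consumers derive from it):

    # Single source of truth for sandbox library versions
    SANDBOX_LIBRARIES: dict[str, str] = {
        "pandas": "2.2.0",
        "numpy": "1.26.4",
        "scipy": "1.11.4",
        "matplotlib": "3.8.2",
        "scikit-learn": "1.4.0",
        "statsmodels": "0.14.1",
    }

    def get_sandbox_library_list() -> list[str]:
        """Exact pins for the Modal image (e.g. 'pandas==2.2.0')."""
        return [f"{lib}=={ver}" for lib, ver in SANDBOX_LIBRARIES.items()]

    def get_sandbox_library_prompt() -> str:
        """Bulleted version list for the LLM system prompt."""
        return "\n".join(f"- {lib}=={ver}" for lib, ver in SANDBOX_LIBRARIES.items())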

MAJOR fixes:
- Use run_in_executor() so blocking Modal calls don't stall the async event loop (sketched below)
- Lazily initialize the code executor so import doesn't fail when Modal is unconfigured
- Remove redundant hypothesis check (unreachable code)
- Simplify the semantic search placeholder (dead code paths)
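
The executor pattern in sketch form (as applied in src/agents/analysis_agent.py below; asyncio and functools.partial are imported per the diff): the executor is created on first use rather than at import time, and the synchronous Modal call is handed to the default thread pool so the event loop keeps serving other tasks:

    # Inside AnalysisAgent.run(), after code generation
    loop = asyncio.get_running_loop()
    executor = self._get_code_executor()  # lazy: built on first call
    execution_result = await loop.run_in_executor(
        None, partial(executor.execute, generated_code, timeout=120)
    )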

Minor fixes:
- Update docs: Docker → Modal Sandbox terminology
- Fix docstring that claimed ConfigurationError is raised (the executor only warns)

docs/workflow-diagrams.md CHANGED
@@ -398,7 +398,7 @@ graph TB
         PubMed[PubMed API]
         ArXiv[arXiv API]
         BioRxiv[bioRxiv API]
-        Docker[Docker Sandbox]
+        Modal[Modal Sandbox]
         ChromaDB[(ChromaDB)]
     end
 
@@ -414,7 +414,7 @@ graph TB
     Server1 --> PubMed
     Server1 --> ArXiv
     Server1 --> BioRxiv
-    Server2 --> Docker
+    Server2 --> Modal
     Server3 --> ChromaDB
 
     style Manager fill:#ffe6e6
@@ -520,7 +520,7 @@ graph LR
     DC -->|Preprint search| ArXiv[arXiv API<br/>Scientific preprints]
     DC -->|Biology search| BioRxiv[bioRxiv API<br/>Biology preprints]
     DC -->|Agent reasoning| Claude[Claude API<br/>Sonnet 4 / Opus]
-    DC -->|Code execution| Docker[Docker Sandbox<br/>Safe Python env]
+    DC -->|Code execution| Modal[Modal Sandbox<br/>Safe Python env]
     DC -->|Vector storage| Chroma[ChromaDB<br/>Embeddings & RAG]
 
     DC -->|Deployed on| HF[HuggingFace Spaces<br/>Gradio 6.0]
@@ -529,7 +529,7 @@ graph LR
     ArXiv -->|Results| DC
     BioRxiv -->|Results| DC
     Claude -->|Responses| DC
-    Docker -->|Output| DC
+    Modal -->|Output| DC
     Chroma -->|Context| DC
 
     DC -->|Research report| User
@@ -540,7 +540,7 @@ graph LR
     style ArXiv fill:#e6f3ff
     style BioRxiv fill:#e6f3ff
     style Claude fill:#ffd6d6
-    style Docker fill:#f0f0f0
+    style Modal fill:#f0f0f0
     style Chroma fill:#ffe6f0
     style HF fill:#d4edda
     ```
examples/modal_demo/verify_sandbox.py CHANGED
@@ -8,7 +8,7 @@ from pathlib import Path
 
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 
-from src.tools.code_execution import get_code_executor
+from src.tools.code_execution import SANDBOX_LIBRARIES, get_code_executor
 
 
 def test_1_hostname_check():
@@ -145,9 +145,9 @@ print(f"statsmodels: {statsmodels.__version__}")
 
     # Check if versions match what we specified in code_execution.py
     expected_versions = {
-        "pandas: 2.2.0": True,
-        "numpy: 1.26.4": True,
-        "scipy: 1.11.4": True,
+        f"pandas: {SANDBOX_LIBRARIES['pandas']}": True,
+        f"numpy: {SANDBOX_LIBRARIES['numpy']}": True,
+        f"scipy: {SANDBOX_LIBRARIES['scipy']}": True,
    }
 
    matches = 0
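
With the values in SANDBOX_LIBRARIES from this commit, the f-string keys above evaluate to exactly the literals they replace, so the check's behavior is unchanged while the source of truth moves to one place:

    # Equivalent literal form after the change
    expected_versions = {
        "pandas: 2.2.0": True,
        "numpy: 1.26.4": True,
        "scipy: 1.11.4": True,
    }
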
src/agents/analysis_agent.py CHANGED
@@ -1,6 +1,8 @@
 """Analysis agent for statistical analysis using Modal code execution."""
 
+import asyncio
 from collections.abc import AsyncIterable
+from functools import partial
 from typing import TYPE_CHECKING, Any
 
 from agent_framework import (
@@ -15,7 +17,11 @@ from pydantic import BaseModel, Field
 from pydantic_ai import Agent
 
 from src.agent_factory.judges import get_model
-from src.tools.code_execution import CodeExecutionError, get_code_executor
+from src.tools.code_execution import (
+    CodeExecutionError,
+    get_code_executor,
+    get_sandbox_library_prompt,
+)
 from src.utils.models import Evidence
 
 if TYPE_CHECKING:
@@ -60,9 +66,15 @@ class AnalysisAgent(BaseAgent):  # type: ignore[misc]
         )
         self._evidence_store = evidence_store
         self._embeddings = embedding_service
-        self._code_executor = get_code_executor()
+        self._code_executor: Any = None  # Lazy initialized
         self._agent: Agent[None, str] | None = None  # LLM for code generation
 
+    def _get_code_executor(self) -> Any:
+        """Lazy initialization of code executor (avoids failing if Modal not configured)."""
+        if self._code_executor is None:
+            self._code_executor = get_code_executor()
+        return self._code_executor
+
     def _get_agent(self) -> Agent[None, str]:
         """Lazy initialization of LLM agent."""
         if self._agent is None:
@@ -75,7 +87,8 @@
 
     def _get_system_prompt(self) -> str:
         """System prompt for code generation."""
-        return """You are a biomedical data scientist specializing in statistical analysis.
+        library_versions = get_sandbox_library_prompt()
+        return f"""You are a biomedical data scientist specializing in statistical analysis.
 
 Your task: Generate Python code to analyze research evidence and test hypotheses.
 
@@ -89,12 +102,7 @@ Guidelines:
 7. Set a variable called 'result' with final verdict
 
 Available libraries:
-- pandas==2.2.0
-- numpy==1.26.4
-- scipy==1.11.4
-- matplotlib==3.8.2
-- scikit-learn==1.4.0
-- statsmodels==0.14.1
+{library_versions}
 
 Output format:
 Return ONLY executable Python code, no explanations or markdown.
@@ -119,10 +127,8 @@ Return ONLY executable Python code, no explanations or markdown.
         if not evidence:
             return self._error_response("No evidence available. Run SearchAgent first.")
 
-        # Get primary hypothesis
-        primary = hypotheses[0] if hypotheses else None
-        if not primary:
-            return self._error_response("No primary hypothesis found.")
+        # Get primary hypothesis (guaranteed to exist after check above)
+        primary = hypotheses[0]
 
         # Retrieve relevant evidence using RAG (if available)
         relevant_evidence = await self._retrieve_relevant_evidence(primary, evidence)
@@ -136,8 +142,12 @@ Return ONLY executable Python code, no explanations or markdown.
         code_result = await agent.run(code_prompt)
         generated_code = code_result.output
 
-        # Execute code in Modal sandbox
-        execution_result = self._code_executor.execute(generated_code, timeout=120)
+        # Execute code in Modal sandbox (run in thread to avoid blocking event loop)
+        loop = asyncio.get_running_loop()
+        executor = self._get_code_executor()
+        execution_result = await loop.run_in_executor(
+            None, partial(executor.execute, generated_code, timeout=120)
+        )
 
         if not execution_result["success"]:
             return self._error_response(f"Code execution failed: {execution_result['error']}")
@@ -167,14 +177,13 @@ Return ONLY executable Python code, no explanations or markdown.
     async def _retrieve_relevant_evidence(
         self, hypothesis: Any, all_evidence: list[Evidence]
     ) -> list[Evidence]:
-        """Retrieve most relevant evidence using RAG (if available)."""
-        if not self._embeddings:
-            # No RAG available, return top N evidence
-            return all_evidence[:10]
-
-        # Use embeddings to find relevant evidence
-        # TODO: Implement semantic search with embeddings service
-        # For now, just return all evidence
+        """Retrieve most relevant evidence using RAG (if available).
+
+        TODO: When embeddings service is available (self._embeddings),
+        use semantic search to find evidence most relevant to the hypothesis.
+        For now, returns top 10 evidence items.
+        """
+        # Future: Use self._embeddings for semantic search
         return all_evidence[:10]
 
     def _create_code_generation_prompt(
src/tools/code_execution.py CHANGED
@@ -11,6 +11,27 @@ import structlog
 
 logger = structlog.get_logger(__name__)
 
+# Shared library versions for Modal sandbox - used by both executor and LLM prompts
+# Keep these in sync to avoid version mismatch between generated code and execution
+SANDBOX_LIBRARIES: dict[str, str] = {
+    "pandas": "2.2.0",
+    "numpy": "1.26.4",
+    "scipy": "1.11.4",
+    "matplotlib": "3.8.2",
+    "scikit-learn": "1.4.0",
+    "statsmodels": "0.14.1",
+}
+
+
+def get_sandbox_library_list() -> list[str]:
+    """Get list of library==version strings for Modal image."""
+    return [f"{lib}=={ver}" for lib, ver in SANDBOX_LIBRARIES.items()]
+
+
+def get_sandbox_library_prompt() -> str:
+    """Get formatted library versions for LLM prompts."""
+    return "\n".join(f"- {lib}=={ver}" for lib, ver in SANDBOX_LIBRARIES.items())
+
 
 class CodeExecutionError(Exception):
     """Raised when code execution fails."""
@@ -45,8 +66,9 @@ class ModalCodeExecutor:
     def __init__(self) -> None:
         """Initialize Modal code executor.
 
-        Raises:
-            ConfigurationError: If Modal credentials are not configured.
+        Note:
+            Logs a warning if Modal credentials are not configured.
+            Execution will fail at runtime without valid credentials.
         """
         # Check for Modal credentials
         self.modal_token_id = os.getenv("MODAL_TOKEN_ID")
@@ -90,12 +112,7 @@
 
         # Define scientific computing image with common libraries
         scientific_image = modal.Image.debian_slim(python_version="3.11").uv_pip_install(
-            "pandas==2.2.0",
-            "numpy==1.26.4",
-            "scipy==1.11.4",
-            "matplotlib==3.8.2",
-            "scikit-learn==1.4.0",
-            "statsmodels==0.14.1",
+            *get_sandbox_library_list()
        )
 
        # Create sandbox with security restrictions
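
For reference, a quick REPL check of what the new helpers return (derived from the pinned versions above; assumes the repo root is on sys.path):

    >>> from src.tools.code_execution import get_sandbox_library_list, get_sandbox_library_prompt
    >>> get_sandbox_library_list()[:2]
    ['pandas==2.2.0', 'numpy==1.26.4']
    >>> print(get_sandbox_library_prompt())
    - pandas==2.2.0
    - numpy==1.26.4
    - scipy==1.11.4
    - matplotlib==3.8.2
    - scikit-learn==1.4.0
    - statsmodels==0.14.1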