VibecoderMcSwaggins committed
Commit cbd94a4 · 1 Parent(s): 3f90da8

refactor: address CodeRabbit review feedback for Mario integration


CRITICAL fix:
- Centralize sandbox library versions in a single SANDBOX_LIBRARIES constant
- Use the shared config for both the Modal image and the LLM prompts
- Prevent version mismatch between generated code and the execution environment (pattern sketched below)
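
A minimal sketch of the shared-constant pattern (it matches the src/tools/code_execution.py diff below; the dict is the single source of truth and both consumers derive from it):

    # Single source of truth for sandbox library versions
    SANDBOX_LIBRARIES: dict[str, str] = {
        "pandas": "2.2.0",
        "numpy": "1.26.4",
        "scipy": "1.11.4",
        "matplotlib": "3.8.2",
        "scikit-learn": "1.4.0",
        "statsmodels": "0.14.1",
    }

    def get_sandbox_library_list() -> list[str]:
        """Exact pins for the Modal image (e.g. 'pandas==2.2.0')."""
        return [f"{lib}=={ver}" for lib, ver in SANDBOX_LIBRARIES.items()]

    def get_sandbox_library_prompt() -> str:
        """Bulleted version list for the LLM system prompt."""
        return "\n".join(f"- {lib}=={ver}" for lib, ver in SANDBOX_LIBRARIES.items())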

MAJOR fixes:
- Use run_in_executor() so blocking Modal calls don't stall the async event loop (sketched below)
- Lazily initialize the code executor so import doesn't fail when Modal is unconfigured
- Remove redundant hypothesis check (unreachable code)
- Simplify the semantic search placeholder (dead code paths)
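
The executor pattern in sketch form (as applied in src/agents/analysis_agent.py below; asyncio and functools.partial are imported per the diff): the executor is created on first use rather than at import time, and the synchronous Modal call is handed to the default thread pool so the event loop keeps serving other tasks:

    # Inside AnalysisAgent.run(), after code generation
    loop = asyncio.get_running_loop()
    executor = self._get_code_executor()  # lazy: built on first call
    execution_result = await loop.run_in_executor(
        None, partial(executor.execute, generated_code, timeout=120)
    )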

Minor fixes:
- Update docs: Docker → Modal Sandbox terminology
- Fix docstring that claimed ConfigurationError is raised (the executor only warns)

docs/workflow-diagrams.md CHANGED
@@ -398,7 +398,7 @@ graph TB
         PubMed[PubMed API]
         ArXiv[arXiv API]
         BioRxiv[bioRxiv API]
-        Docker[Docker Sandbox]
+        Modal[Modal Sandbox]
         ChromaDB[(ChromaDB)]
     end
 
@@ -414,7 +414,7 @@ graph TB
     Server1 --> PubMed
     Server1 --> ArXiv
     Server1 --> BioRxiv
-    Server2 --> Docker
+    Server2 --> Modal
     Server3 --> ChromaDB
 
     style Manager fill:#ffe6e6
@@ -520,7 +520,7 @@ graph LR
     DC -->|Preprint search| ArXiv[arXiv API<br/>Scientific preprints]
     DC -->|Biology search| BioRxiv[bioRxiv API<br/>Biology preprints]
     DC -->|Agent reasoning| Claude[Claude API<br/>Sonnet 4 / Opus]
-    DC -->|Code execution| Docker[Docker Sandbox<br/>Safe Python env]
+    DC -->|Code execution| Modal[Modal Sandbox<br/>Safe Python env]
     DC -->|Vector storage| Chroma[ChromaDB<br/>Embeddings & RAG]
 
     DC -->|Deployed on| HF[HuggingFace Spaces<br/>Gradio 6.0]
@@ -529,7 +529,7 @@ graph LR
     ArXiv -->|Results| DC
     BioRxiv -->|Results| DC
     Claude -->|Responses| DC
-    Docker -->|Output| DC
+    Modal -->|Output| DC
     Chroma -->|Context| DC
 
     DC -->|Research report| User
@@ -540,7 +540,7 @@ graph LR
     style ArXiv fill:#e6f3ff
     style BioRxiv fill:#e6f3ff
     style Claude fill:#ffd6d6
-    style Docker fill:#f0f0f0
+    style Modal fill:#f0f0f0
     style Chroma fill:#ffe6f0
     style HF fill:#d4edda
     ```
examples/modal_demo/verify_sandbox.py CHANGED
@@ -8,7 +8,7 @@ from pathlib import Path
 
 sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 
-from src.tools.code_execution import get_code_executor
+from src.tools.code_execution import SANDBOX_LIBRARIES, get_code_executor
 
 
 def test_1_hostname_check():
@@ -145,9 +145,9 @@ print(f"statsmodels: {statsmodels.__version__}")
 
     # Check if versions match what we specified in code_execution.py
     expected_versions = {
-        "pandas: 2.2.0": True,
-        "numpy: 1.26.4": True,
-        "scipy: 1.11.4": True,
+        f"pandas: {SANDBOX_LIBRARIES['pandas']}": True,
+        f"numpy: {SANDBOX_LIBRARIES['numpy']}": True,
+        f"scipy: {SANDBOX_LIBRARIES['scipy']}": True,
    }
 
    matches = 0
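
With the values in SANDBOX_LIBRARIES from this commit, the f-string keys above evaluate to exactly the literals they replace, so the check's behavior is unchanged while the source of truth moves to one place:

    # Equivalent literal form after the change
    expected_versions = {
        "pandas: 2.2.0": True,
        "numpy: 1.26.4": True,
        "scipy: 1.11.4": True,
    }
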
src/agents/analysis_agent.py CHANGED
@@ -1,6 +1,8 @@
 """Analysis agent for statistical analysis using Modal code execution."""
 
+import asyncio
 from collections.abc import AsyncIterable
+from functools import partial
 from typing import TYPE_CHECKING, Any
 
 from agent_framework import (
@@ -15,7 +17,11 @@ from pydantic import BaseModel, Field
 from pydantic_ai import Agent
 
 from src.agent_factory.judges import get_model
-from src.tools.code_execution import CodeExecutionError, get_code_executor
+from src.tools.code_execution import (
+    CodeExecutionError,
+    get_code_executor,
+    get_sandbox_library_prompt,
+)
 from src.utils.models import Evidence
 
 if TYPE_CHECKING:
@@ -60,9 +66,15 @@ class AnalysisAgent(BaseAgent):  # type: ignore[misc]
         )
         self._evidence_store = evidence_store
         self._embeddings = embedding_service
-        self._code_executor = get_code_executor()
+        self._code_executor: Any = None  # Lazy initialized
         self._agent: Agent[None, str] | None = None  # LLM for code generation
 
+    def _get_code_executor(self) -> Any:
+        """Lazy initialization of code executor (avoids failing if Modal not configured)."""
+        if self._code_executor is None:
+            self._code_executor = get_code_executor()
+        return self._code_executor
+
     def _get_agent(self) -> Agent[None, str]:
         """Lazy initialization of LLM agent."""
         if self._agent is None:
@@ -75,7 +87,8 @@
 
     def _get_system_prompt(self) -> str:
         """System prompt for code generation."""
-        return """You are a biomedical data scientist specializing in statistical analysis.
+        library_versions = get_sandbox_library_prompt()
+        return f"""You are a biomedical data scientist specializing in statistical analysis.
 
 Your task: Generate Python code to analyze research evidence and test hypotheses.
 
@@ -89,12 +102,7 @@ Guidelines:
 7. Set a variable called 'result' with final verdict
 
 Available libraries:
-- pandas==2.2.0
-- numpy==1.26.4
-- scipy==1.11.4
-- matplotlib==3.8.2
-- scikit-learn==1.4.0
-- statsmodels==0.14.1
+{library_versions}
 
 Output format:
 Return ONLY executable Python code, no explanations or markdown.
@@ -119,10 +127,8 @@ Return ONLY executable Python code, no explanations or markdown.
         if not evidence:
             return self._error_response("No evidence available. Run SearchAgent first.")
 
-        # Get primary hypothesis
-        primary = hypotheses[0] if hypotheses else None
-        if not primary:
-            return self._error_response("No primary hypothesis found.")
+        # Get primary hypothesis (guaranteed to exist after check above)
+        primary = hypotheses[0]
 
         # Retrieve relevant evidence using RAG (if available)
         relevant_evidence = await self._retrieve_relevant_evidence(primary, evidence)
@@ -136,8 +142,12 @@ Return ONLY executable Python code, no explanations or markdown.
         code_result = await agent.run(code_prompt)
         generated_code = code_result.output
 
-        # Execute code in Modal sandbox
-        execution_result = self._code_executor.execute(generated_code, timeout=120)
+        # Execute code in Modal sandbox (run in thread to avoid blocking event loop)
+        loop = asyncio.get_running_loop()
+        executor = self._get_code_executor()
+        execution_result = await loop.run_in_executor(
+            None, partial(executor.execute, generated_code, timeout=120)
+        )
 
         if not execution_result["success"]:
             return self._error_response(f"Code execution failed: {execution_result['error']}")
@@ -167,14 +177,13 @@ Return ONLY executable Python code, no explanations or markdown.
     async def _retrieve_relevant_evidence(
         self, hypothesis: Any, all_evidence: list[Evidence]
     ) -> list[Evidence]:
-        """Retrieve most relevant evidence using RAG (if available)."""
-        if not self._embeddings:
-            # No RAG available, return top N evidence
-            return all_evidence[:10]
-
-        # Use embeddings to find relevant evidence
-        # TODO: Implement semantic search with embeddings service
-        # For now, just return all evidence
+        """Retrieve most relevant evidence using RAG (if available).
+
+        TODO: When embeddings service is available (self._embeddings),
+        use semantic search to find evidence most relevant to the hypothesis.
+        For now, returns top 10 evidence items.
+        """
+        # Future: Use self._embeddings for semantic search
         return all_evidence[:10]
 
     def _create_code_generation_prompt(
src/tools/code_execution.py CHANGED
@@ -11,6 +11,27 @@ import structlog
 
 logger = structlog.get_logger(__name__)
 
+# Shared library versions for Modal sandbox - used by both executor and LLM prompts
+# Keep these in sync to avoid version mismatch between generated code and execution
+SANDBOX_LIBRARIES: dict[str, str] = {
+    "pandas": "2.2.0",
+    "numpy": "1.26.4",
+    "scipy": "1.11.4",
+    "matplotlib": "3.8.2",
+    "scikit-learn": "1.4.0",
+    "statsmodels": "0.14.1",
+}
+
+
+def get_sandbox_library_list() -> list[str]:
+    """Get list of library==version strings for Modal image."""
+    return [f"{lib}=={ver}" for lib, ver in SANDBOX_LIBRARIES.items()]
+
+
+def get_sandbox_library_prompt() -> str:
+    """Get formatted library versions for LLM prompts."""
+    return "\n".join(f"- {lib}=={ver}" for lib, ver in SANDBOX_LIBRARIES.items())
+
 
 class CodeExecutionError(Exception):
     """Raised when code execution fails."""
@@ -45,8 +66,9 @@ class ModalCodeExecutor:
     def __init__(self) -> None:
         """Initialize Modal code executor.
 
-        Raises:
-            ConfigurationError: If Modal credentials are not configured.
+        Note:
+            Logs a warning if Modal credentials are not configured.
+            Execution will fail at runtime without valid credentials.
         """
         # Check for Modal credentials
         self.modal_token_id = os.getenv("MODAL_TOKEN_ID")
@@ -90,12 +112,7 @@
 
         # Define scientific computing image with common libraries
         scientific_image = modal.Image.debian_slim(python_version="3.11").uv_pip_install(
-            "pandas==2.2.0",
-            "numpy==1.26.4",
-            "scipy==1.11.4",
-            "matplotlib==3.8.2",
-            "scikit-learn==1.4.0",
-            "statsmodels==0.14.1",
+            *get_sandbox_library_list()
        )
 
        # Create sandbox with security restrictions
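
For reference, a quick REPL check of what the new helpers return (derived from the pinned versions above; assumes the repo root is on sys.path):

    >>> from src.tools.code_execution import get_sandbox_library_list, get_sandbox_library_prompt
    >>> get_sandbox_library_list()[:2]
    ['pandas==2.2.0', 'numpy==1.26.4']
    >>> print(get_sandbox_library_prompt())
    - pandas==2.2.0
    - numpy==1.26.4
    - scipy==1.11.4
    - matplotlib==3.8.2
    - scikit-learn==1.4.0
    - statsmodels==0.14.1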