VibecoderMcSwaggins committed on
Commit e67c99f · unverified · 1 Parent(s): 31809fa

feat: Phase 11 bioRxiv Preprint Integration (#19)


* feat: Implement Phase 11 (bioRxiv Preprint Integration)

* chore: add .mypy_cache to .gitignore

* fix: address CodeRabbit review feedback for bioRxiv tool

- Catch httpx.RequestError for network failures (major issue)
- Use SourceName type alias in Citation model (DRY)
- Only add '...' suffix when abstract is truncated
- Add test for network error handling
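For orientation, here is a minimal sketch of how the new tool is used on its own, relying only on names introduced in the diffs below (the snippet itself is illustrative and not part of the commit):

import asyncio

from src.tools.biorxiv import BioRxivTool


async def main() -> None:
    # Defaults: medRxiv server, 90-day window; results are filtered
    # client-side against the query keywords.
    tool = BioRxivTool(server="medrxiv", days=90)
    evidence = await tool.search("metformin alzheimer", max_results=5)
    for item in evidence:
        print(item.citation.title, item.citation.url)


asyncio.run(main())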

.gitignore CHANGED
@@ -63,5 +63,6 @@ logs/
 
 # Testing
 .pytest_cache/
+.mypy_cache/
 .coverage
 htmlcov/
examples/search_demo/run_search.py CHANGED
@@ -21,6 +21,7 @@ Requirements:
 import asyncio
 import sys
 
+from src.tools.biorxiv import BioRxivTool
 from src.tools.clinicaltrials import ClinicalTrialsTool
 from src.tools.pubmed import PubMedTool
 from src.tools.search_handler import SearchHandler
@@ -36,10 +37,11 @@ async def main(query: str) -> None:
     # Initialize tools
     pubmed = PubMedTool()
     trials = ClinicalTrialsTool()
-    handler = SearchHandler(tools=[pubmed, trials], timeout=30.0)
+    preprints = BioRxivTool()
+    handler = SearchHandler(tools=[pubmed, trials, preprints], timeout=30.0)
 
     # Execute search
-    print("Searching PubMed and ClinicalTrials.gov in parallel...")
+    print("Searching PubMed, ClinicalTrials.gov, and bioRxiv in parallel...")
     result = await handler.execute(query, max_results_per_tool=5)
 
     # Display results
src/app.py CHANGED
@@ -8,6 +8,7 @@ import gradio as gr
 
 from src.agent_factory.judges import JudgeHandler, MockJudgeHandler
 from src.orchestrator_factory import create_orchestrator
+from src.tools.biorxiv import BioRxivTool
 from src.tools.clinicaltrials import ClinicalTrialsTool
 from src.tools.pubmed import PubMedTool
 from src.tools.search_handler import SearchHandler
@@ -33,7 +34,7 @@ def configure_orchestrator(use_mock: bool = False, mode: str = "simple") -> Any:
 
     # Create search tools
     search_handler = SearchHandler(
-        tools=[PubMedTool(), ClinicalTrialsTool()],
+        tools=[PubMedTool(), ClinicalTrialsTool(), BioRxivTool()],
         timeout=config.search_timeout,
     )
 
@@ -128,7 +129,7 @@ def create_demo() -> Any:
     ## AI-Powered Drug Repurposing Research Agent
 
     Ask questions about potential drug repurposing opportunities.
-    The agent searches PubMed & ClinicalTrials.gov to provide recommendations.
+    The agent searches PubMed, ClinicalTrials.gov, and bioRxiv/medRxiv preprints.
 
     **Example questions:**
     - "What drugs could be repurposed for Alzheimer's disease?"
@@ -161,7 +162,7 @@ def create_demo() -> Any:
     **Note**: This is a research tool and should not be used for medical decisions.
     Always consult healthcare professionals for medical advice.
 
-    Built with 🤖 PydanticAI + 🔬 PubMed & ClinicalTrials.gov
+    Built with 🤖 PydanticAI + 🔬 PubMed, ClinicalTrials.gov & bioRxiv
     """)
 
     return demo
src/tools/biorxiv.py ADDED
@@ -0,0 +1,144 @@
+"""bioRxiv/medRxiv preprint search tool."""
+
+import re
+from datetime import datetime, timedelta
+from typing import Any
+
+import httpx
+from tenacity import retry, stop_after_attempt, wait_exponential
+
+from src.utils.exceptions import SearchError
+from src.utils.models import Citation, Evidence
+
+
+class BioRxivTool:
+    """Search tool for bioRxiv and medRxiv preprints."""
+
+    BASE_URL = "https://api.biorxiv.org/details"
+    # Use medRxiv for medical/clinical content (more relevant for drug repurposing)
+    DEFAULT_SERVER = "medrxiv"
+    # Fetch papers from last N days
+    DEFAULT_DAYS = 90
+
+    def __init__(self, server: str = DEFAULT_SERVER, days: int = DEFAULT_DAYS) -> None:
+        """
+        Initialize bioRxiv tool.
+
+        Args:
+            server: "biorxiv" or "medrxiv"
+            days: How many days back to search
+        """
+        self.server = server
+        self.days = days
+
+    @property
+    def name(self) -> str:
+        return "biorxiv"
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=1, max=10),
+        reraise=True,
+    )
+    async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
+        """
+        Search bioRxiv/medRxiv for preprints matching query.
+
+        Note: bioRxiv API doesn't support keyword search directly.
+        We fetch recent papers and filter client-side.
+
+        Args:
+            query: Search query (keywords)
+            max_results: Maximum results to return
+
+        Returns:
+            List of Evidence objects from preprints
+        """
+        # Build date range for last N days
+        end_date = datetime.now().strftime("%Y-%m-%d")
+        start_date = (datetime.now() - timedelta(days=self.days)).strftime("%Y-%m-%d")
+        interval = f"{start_date}/{end_date}"
+
+        # Fetch recent papers
+        url = f"{self.BASE_URL}/{self.server}/{interval}/0/json"
+
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            try:
+                response = await client.get(url)
+                response.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                raise SearchError(f"bioRxiv search failed: {e}") from e
+            except httpx.RequestError as e:
+                raise SearchError(f"bioRxiv connection failed: {e}") from e
+
+        data = response.json()
+        papers = data.get("collection", [])
+
+        # Filter papers by query keywords
+        query_terms = self._extract_terms(query)
+        matching = self._filter_by_keywords(papers, query_terms, max_results)
+
+        return [self._paper_to_evidence(paper) for paper in matching]
+
+    def _extract_terms(self, query: str) -> list[str]:
+        """Extract search terms from query."""
+        # Simple tokenization, lowercase
+        terms = re.findall(r"\b\w+\b", query.lower())
+        # Filter out common stop words
+        stop_words = {"the", "a", "an", "in", "on", "for", "and", "or", "of", "to"}
+        return [t for t in terms if t not in stop_words and len(t) > 2]
+
+    def _filter_by_keywords(
+        self, papers: list[dict[str, Any]], terms: list[str], max_results: int
+    ) -> list[dict[str, Any]]:
+        """Filter papers that contain query terms in title or abstract."""
+        scored_papers = []
+
+        for paper in papers:
+            title = paper.get("title", "").lower()
+            abstract = paper.get("abstract", "").lower()
+            text = f"{title} {abstract}"
+
+            # Count matching terms
+            matches = sum(1 for term in terms if term in text)
+
+            if matches > 0:
+                scored_papers.append((matches, paper))
+
+        # Sort by match count (descending)
+        scored_papers.sort(key=lambda x: x[0], reverse=True)
+
+        return [paper for _, paper in scored_papers[:max_results]]
+
+    def _paper_to_evidence(self, paper: dict[str, Any]) -> Evidence:
+        """Convert a preprint paper to Evidence."""
+        doi = paper.get("doi", "")
+        title = paper.get("title", "Untitled")
+        authors_str = paper.get("authors", "Unknown")
+        date = paper.get("date", "Unknown")
+        abstract = paper.get("abstract", "No abstract available.")
+        category = paper.get("category", "")
+
+        # Parse authors (format: "Smith, J; Jones, A")
+        authors = [a.strip() for a in authors_str.split(";")][:5]
+
+        # Truncate abstract if needed
+        truncated_abstract = abstract[:1800]
+        suffix = "..." if len(abstract) > 1800 else ""
+
+        # Note this is a preprint in the content
+        content = (
+            f"[PREPRINT - Not peer-reviewed] {truncated_abstract}{suffix} Category: {category}."
+        )
+
+        return Evidence(
+            content=content[:2000],
+            citation=Citation(
+                source="biorxiv",
+                title=title[:500],
+                url=f"https://doi.org/{doi}" if doi else "https://www.medrxiv.org/",
+                date=date,
+                authors=authors,
+            ),
+            relevance=0.75,  # Slightly lower than peer-reviewed
+        )
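
For context on what search() parses: the details endpoint returns a JSON object with a collection list of paper records plus a messages status block, the same shape mocked in the unit tests below. A minimal stand-alone sketch under that assumption, reusing the URL layout built by the tool (the fixed dates and the trailing /0/ segment are copied as-is and purely illustrative):

import asyncio

import httpx


async def peek_at_raw_response() -> None:
    # Same URL shape as BioRxivTool.search, with a hard-coded date interval.
    url = "https://api.biorxiv.org/details/medrxiv/2024-01-01/2024-03-31/0/json"
    async with httpx.AsyncClient(timeout=30.0) as client:
        response = await client.get(url)
        response.raise_for_status()
    data = response.json()
    papers = data.get("collection", [])
    print(f"{len(papers)} records returned")
    if papers:
        print(papers[0].get("title"), papers[0].get("doi"))


asyncio.run(peek_at_raw_response())
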
src/utils/models.py CHANGED
@@ -6,13 +6,14 @@ from typing import Any, ClassVar, Literal
 from pydantic import BaseModel, Field
 
 # Centralized source type - add new sources here (e.g., "biorxiv" in Phase 11)
-SourceName = Literal["pubmed", "clinicaltrials"]
+SourceName = Literal["pubmed", "clinicaltrials", "biorxiv"]
 
 
 class Citation(BaseModel):
     """A citation to a source document."""
 
     source: SourceName = Field(description="Where this came from")
+
     title: str = Field(min_length=1, max_length=500)
     url: str = Field(description="URL to the source")
     date: str = Field(description="Publication date (YYYY-MM-DD or 'Unknown')")
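
With SourceName widened, a Citation built from a preprint record now validates. A minimal sketch using field names from the diffs above (the authors field is not shown in this hunk but is assumed from its use in BioRxivTool._paper_to_evidence; values are taken from the test fixture below):

from src.utils.models import Citation

citation = Citation(
    source="biorxiv",  # newly permitted by the widened Literal
    title="Metformin repurposing for Alzheimer's disease: a systematic review",
    url="https://doi.org/10.1101/2024.01.15.24301234",
    date="2024-01-15",
    authors=["Smith, John", "Jones, Alice"],
)
print(citation.source)
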
tests/unit/tools/test_biorxiv.py ADDED
@@ -0,0 +1,178 @@
+"""Unit tests for bioRxiv tool."""
+
+import pytest
+import respx
+from httpx import Response
+
+from src.tools.biorxiv import BioRxivTool
+from src.utils.models import Evidence
+
+
+@pytest.fixture
+def mock_biorxiv_response():
+    """Mock bioRxiv API response."""
+    return {
+        "collection": [
+            {
+                "doi": "10.1101/2024.01.15.24301234",
+                "title": "Metformin repurposing for Alzheimer's disease: a systematic review",
+                "authors": "Smith, John; Jones, Alice; Brown, Bob",
+                "date": "2024-01-15",
+                "category": "neurology",
+                "abstract": "Background: Metformin has shown neuroprotective effects. "
+                "We conducted a systematic review of metformin's potential "
+                "for Alzheimer's disease treatment.",
+            },
+            {
+                "doi": "10.1101/2024.01.10.24301111",
+                "title": "COVID-19 vaccine efficacy study",
+                "authors": "Wilson, C",
+                "date": "2024-01-10",
+                "category": "infectious diseases",
+                "abstract": "This study evaluates COVID-19 vaccine efficacy.",
+            },
+        ],
+        "messages": [{"status": "ok", "count": 2}],
+    }
+
+
+class TestBioRxivTool:
+    """Tests for BioRxivTool."""
+
+    def test_tool_name(self):
+        """Tool should have correct name."""
+        tool = BioRxivTool()
+        assert tool.name == "biorxiv"
+
+    def test_default_server_is_medrxiv(self):
+        """Default server should be medRxiv for medical relevance."""
+        tool = BioRxivTool()
+        assert tool.server == "medrxiv"
+
+    @pytest.mark.asyncio
+    @respx.mock
+    async def test_search_returns_evidence(self, mock_biorxiv_response):
+        """Search should return Evidence objects."""
+        respx.get(url__startswith="https://api.biorxiv.org/details").mock(
+            return_value=Response(200, json=mock_biorxiv_response)
+        )
+
+        tool = BioRxivTool()
+        results = await tool.search("metformin alzheimer", max_results=5)
+
+        assert len(results) == 1  # Only the matching paper
+        assert isinstance(results[0], Evidence)
+        assert results[0].citation.source == "biorxiv"
+        assert "metformin" in results[0].citation.title.lower()
+
+    @pytest.mark.asyncio
+    @respx.mock
+    async def test_search_filters_by_keywords(self, mock_biorxiv_response):
+        """Search should filter papers by query keywords."""
+        respx.get(url__startswith="https://api.biorxiv.org/details").mock(
+            return_value=Response(200, json=mock_biorxiv_response)
+        )
+
+        tool = BioRxivTool()
+
+        # Search for metformin - should match first paper
+        results = await tool.search("metformin")
+        assert len(results) == 1
+        assert "metformin" in results[0].citation.title.lower()
+
+        # Search for COVID - should match second paper
+        results = await tool.search("covid vaccine")
+        assert len(results) == 1
+        assert "covid" in results[0].citation.title.lower()
+
+    @pytest.mark.asyncio
+    @respx.mock
+    async def test_search_marks_as_preprint(self, mock_biorxiv_response):
+        """Evidence content should note it's a preprint."""
+        respx.get(url__startswith="https://api.biorxiv.org/details").mock(
+            return_value=Response(200, json=mock_biorxiv_response)
+        )
+
+        tool = BioRxivTool()
+        results = await tool.search("metformin")
+
+        assert "PREPRINT" in results[0].content
+        assert "Not peer-reviewed" in results[0].content
+
+    @pytest.mark.asyncio
+    @respx.mock
+    async def test_search_empty_results(self):
+        """Search should handle empty results gracefully."""
+        respx.get(url__startswith="https://api.biorxiv.org/details").mock(
+            return_value=Response(200, json={"collection": [], "messages": []})
+        )
+
+        tool = BioRxivTool()
+        results = await tool.search("xyznonexistent")
+
+        assert results == []
+
+    @pytest.mark.asyncio
+    @respx.mock
+    async def test_search_api_error(self):
+        """Search should raise SearchError on API failure."""
+        from src.utils.exceptions import SearchError
+
+        respx.get(url__startswith="https://api.biorxiv.org/details").mock(
+            return_value=Response(500, text="Internal Server Error")
+        )
+
+        tool = BioRxivTool()
+
+        with pytest.raises(SearchError):
+            await tool.search("metformin")
+
+    @pytest.mark.asyncio
+    @respx.mock
+    async def test_search_network_error(self):
+        """Search should raise SearchError on network failure."""
+        import httpx
+
+        from src.utils.exceptions import SearchError
+
+        respx.get(url__startswith="https://api.biorxiv.org/details").mock(
+            side_effect=httpx.RequestError("Network connection failed")
+        )
+
+        tool = BioRxivTool()
+
+        with pytest.raises(SearchError) as exc_info:
+            await tool.search("metformin")
+
+        assert "connection failed" in str(exc_info.value)
+
+    def test_extract_terms(self):
+        """Should extract meaningful search terms."""
+        tool = BioRxivTool()
+
+        terms = tool._extract_terms("metformin for Alzheimer's disease")
+
+        assert "metformin" in terms
+        assert "alzheimer" in terms
+        assert "disease" in terms
+        assert "for" not in terms  # Stop word
+        assert "the" not in terms  # Stop word
+
+
+class TestBioRxivIntegration:
+    """Integration tests (marked for separate run)."""
+
+    @pytest.mark.integration
+    @pytest.mark.asyncio
+    async def test_real_api_call(self):
+        """Test actual API call (requires network)."""
+        tool = BioRxivTool(days=30)  # Last 30 days
+        results = await tool.search("diabetes", max_results=3)
+
+        # May or may not find results depending on recent papers
+        # But we want to ensure the code runs without crashing
+        assert isinstance(results, list)
+        if results:
+            r = results[0]
+            assert isinstance(r, Evidence)
+            assert r.citation.source == "biorxiv"