VibecoderMcSwaggins committed on
Commit e67c99f · unverified · 1 Parent(s): 31809fa

feat: Phase 11 bioRxiv Preprint Integration (#19)


* feat: Implement Phase 11 (bioRxiv Preprint Integration)

* chore: add .mypy_cache to .gitignore

* fix: address CodeRabbit review feedback for bioRxiv tool

- Catch httpx.RequestError for network failures (major issue)
- Use SourceName type alias in Citation model (DRY)
- Only add '...' suffix when abstract is truncated
- Add test for network error handling
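For orientation, here is a minimal sketch of how the new tool is used on its own, relying only on names introduced in the diffs below (the snippet itself is illustrative and not part of the commit):

import asyncio

from src.tools.biorxiv import BioRxivTool


async def main() -> None:
    # Defaults: medRxiv server, 90-day window; results are filtered
    # client-side against the query keywords.
    tool = BioRxivTool(server="medrxiv", days=90)
    evidence = await tool.search("metformin alzheimer", max_results=5)
    for item in evidence:
        print(item.citation.title, item.citation.url)


asyncio.run(main())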

.gitignore CHANGED
@@ -63,5 +63,6 @@ logs/
 
 # Testing
 .pytest_cache/
+.mypy_cache/
 .coverage
 htmlcov/
examples/search_demo/run_search.py CHANGED
@@ -21,6 +21,7 @@ Requirements:
 import asyncio
 import sys
 
+from src.tools.biorxiv import BioRxivTool
 from src.tools.clinicaltrials import ClinicalTrialsTool
 from src.tools.pubmed import PubMedTool
 from src.tools.search_handler import SearchHandler
@@ -36,10 +37,11 @@ async def main(query: str) -> None:
     # Initialize tools
     pubmed = PubMedTool()
     trials = ClinicalTrialsTool()
-    handler = SearchHandler(tools=[pubmed, trials], timeout=30.0)
+    preprints = BioRxivTool()
+    handler = SearchHandler(tools=[pubmed, trials, preprints], timeout=30.0)
 
     # Execute search
-    print("Searching PubMed and ClinicalTrials.gov in parallel...")
+    print("Searching PubMed, ClinicalTrials.gov, and bioRxiv in parallel...")
     result = await handler.execute(query, max_results_per_tool=5)
 
     # Display results
src/app.py CHANGED
@@ -8,6 +8,7 @@ import gradio as gr
 
 from src.agent_factory.judges import JudgeHandler, MockJudgeHandler
 from src.orchestrator_factory import create_orchestrator
+from src.tools.biorxiv import BioRxivTool
 from src.tools.clinicaltrials import ClinicalTrialsTool
 from src.tools.pubmed import PubMedTool
 from src.tools.search_handler import SearchHandler
@@ -33,7 +34,7 @@ def configure_orchestrator(use_mock: bool = False, mode: str = "simple") -> Any:
 
     # Create search tools
     search_handler = SearchHandler(
-        tools=[PubMedTool(), ClinicalTrialsTool()],
+        tools=[PubMedTool(), ClinicalTrialsTool(), BioRxivTool()],
         timeout=config.search_timeout,
     )
 
@@ -128,7 +129,7 @@ def create_demo() -> Any:
     ## AI-Powered Drug Repurposing Research Agent
 
     Ask questions about potential drug repurposing opportunities.
-    The agent searches PubMed & ClinicalTrials.gov to provide recommendations.
+    The agent searches PubMed, ClinicalTrials.gov, and bioRxiv/medRxiv preprints.
 
     **Example questions:**
     - "What drugs could be repurposed for Alzheimer's disease?"
@@ -161,7 +162,7 @@ def create_demo() -> Any:
     **Note**: This is a research tool and should not be used for medical decisions.
     Always consult healthcare professionals for medical advice.
 
-    Built with 🤖 PydanticAI + 🔬 PubMed & ClinicalTrials.gov
+    Built with 🤖 PydanticAI + 🔬 PubMed, ClinicalTrials.gov & bioRxiv
     """)
 
     return demo
src/tools/biorxiv.py ADDED
@@ -0,0 +1,144 @@
+"""bioRxiv/medRxiv preprint search tool."""
+
+import re
+from datetime import datetime, timedelta
+from typing import Any
+
+import httpx
+from tenacity import retry, stop_after_attempt, wait_exponential
+
+from src.utils.exceptions import SearchError
+from src.utils.models import Citation, Evidence
+
+
+class BioRxivTool:
+    """Search tool for bioRxiv and medRxiv preprints."""
+
+    BASE_URL = "https://api.biorxiv.org/details"
+    # Use medRxiv for medical/clinical content (more relevant for drug repurposing)
+    DEFAULT_SERVER = "medrxiv"
+    # Fetch papers from last N days
+    DEFAULT_DAYS = 90
+
+    def __init__(self, server: str = DEFAULT_SERVER, days: int = DEFAULT_DAYS) -> None:
+        """
+        Initialize bioRxiv tool.
+
+        Args:
+            server: "biorxiv" or "medrxiv"
+            days: How many days back to search
+        """
+        self.server = server
+        self.days = days
+
+    @property
+    def name(self) -> str:
+        return "biorxiv"
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=1, max=10),
+        reraise=True,
+    )
+    async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
+        """
+        Search bioRxiv/medRxiv for preprints matching query.
+
+        Note: bioRxiv API doesn't support keyword search directly.
+        We fetch recent papers and filter client-side.
+
+        Args:
+            query: Search query (keywords)
+            max_results: Maximum results to return
+
+        Returns:
+            List of Evidence objects from preprints
+        """
+        # Build date range for last N days
+        end_date = datetime.now().strftime("%Y-%m-%d")
+        start_date = (datetime.now() - timedelta(days=self.days)).strftime("%Y-%m-%d")
+        interval = f"{start_date}/{end_date}"
+
+        # Fetch recent papers
+        url = f"{self.BASE_URL}/{self.server}/{interval}/0/json"
+
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            try:
+                response = await client.get(url)
+                response.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                raise SearchError(f"bioRxiv search failed: {e}") from e
+            except httpx.RequestError as e:
+                raise SearchError(f"bioRxiv connection failed: {e}") from e
+
+        data = response.json()
+        papers = data.get("collection", [])
+
+        # Filter papers by query keywords
+        query_terms = self._extract_terms(query)
+        matching = self._filter_by_keywords(papers, query_terms, max_results)
+
+        return [self._paper_to_evidence(paper) for paper in matching]
+
+    def _extract_terms(self, query: str) -> list[str]:
+        """Extract search terms from query."""
+        # Simple tokenization, lowercase
+        terms = re.findall(r"\b\w+\b", query.lower())
+        # Filter out common stop words
+        stop_words = {"the", "a", "an", "in", "on", "for", "and", "or", "of", "to"}
+        return [t for t in terms if t not in stop_words and len(t) > 2]
+
+    def _filter_by_keywords(
+        self, papers: list[dict[str, Any]], terms: list[str], max_results: int
+    ) -> list[dict[str, Any]]:
+        """Filter papers that contain query terms in title or abstract."""
+        scored_papers = []
+
+        for paper in papers:
+            title = paper.get("title", "").lower()
+            abstract = paper.get("abstract", "").lower()
+            text = f"{title} {abstract}"
+
+            # Count matching terms
+            matches = sum(1 for term in terms if term in text)
+
+            if matches > 0:
+                scored_papers.append((matches, paper))
+
+        # Sort by match count (descending)
+        scored_papers.sort(key=lambda x: x[0], reverse=True)
+
+        return [paper for _, paper in scored_papers[:max_results]]
+
+    def _paper_to_evidence(self, paper: dict[str, Any]) -> Evidence:
+        """Convert a preprint paper to Evidence."""
+        doi = paper.get("doi", "")
+        title = paper.get("title", "Untitled")
+        authors_str = paper.get("authors", "Unknown")
+        date = paper.get("date", "Unknown")
+        abstract = paper.get("abstract", "No abstract available.")
+        category = paper.get("category", "")
+
+        # Parse authors (format: "Smith, J; Jones, A")
+        authors = [a.strip() for a in authors_str.split(";")][:5]
+
+        # Truncate abstract if needed
+        truncated_abstract = abstract[:1800]
+        suffix = "..." if len(abstract) > 1800 else ""
+
+        # Note this is a preprint in the content
+        content = (
+            f"[PREPRINT - Not peer-reviewed] {truncated_abstract}{suffix} Category: {category}."
+        )
+
+        return Evidence(
+            content=content[:2000],
+            citation=Citation(
+                source="biorxiv",
+                title=title[:500],
+                url=f"https://doi.org/{doi}" if doi else "https://www.medrxiv.org/",
+                date=date,
+                authors=authors,
+            ),
+            relevance=0.75,  # Slightly lower than peer-reviewed
+        )
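
For context on what search() parses: the details endpoint returns a JSON object with a collection list of paper records plus a messages status block, the same shape mocked in the unit tests below. A minimal stand-alone sketch under that assumption, reusing the URL layout built by the tool (the fixed dates and the trailing /0/ segment are copied as-is and purely illustrative):

import asyncio

import httpx


async def peek_at_raw_response() -> None:
    # Same URL shape as BioRxivTool.search, with a hard-coded date interval.
    url = "https://api.biorxiv.org/details/medrxiv/2024-01-01/2024-03-31/0/json"
    async with httpx.AsyncClient(timeout=30.0) as client:
        response = await client.get(url)
        response.raise_for_status()
    data = response.json()
    papers = data.get("collection", [])
    print(f"{len(papers)} records returned")
    if papers:
        print(papers[0].get("title"), papers[0].get("doi"))


asyncio.run(peek_at_raw_response())
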
src/utils/models.py CHANGED
@@ -6,13 +6,14 @@ from typing import Any, ClassVar, Literal
 from pydantic import BaseModel, Field
 
 # Centralized source type - add new sources here (e.g., "biorxiv" in Phase 11)
-SourceName = Literal["pubmed", "clinicaltrials"]
+SourceName = Literal["pubmed", "clinicaltrials", "biorxiv"]
 
 
 class Citation(BaseModel):
     """A citation to a source document."""
 
     source: SourceName = Field(description="Where this came from")
+
     title: str = Field(min_length=1, max_length=500)
     url: str = Field(description="URL to the source")
     date: str = Field(description="Publication date (YYYY-MM-DD or 'Unknown')")
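
With SourceName widened, a Citation built from a preprint record now validates. A minimal sketch using field names from the diffs above (the authors field is not shown in this hunk but is assumed from its use in BioRxivTool._paper_to_evidence; values are taken from the test fixture below):

from src.utils.models import Citation

citation = Citation(
    source="biorxiv",  # newly permitted by the widened Literal
    title="Metformin repurposing for Alzheimer's disease: a systematic review",
    url="https://doi.org/10.1101/2024.01.15.24301234",
    date="2024-01-15",
    authors=["Smith, John", "Jones, Alice"],
)
print(citation.source)
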
tests/unit/tools/test_biorxiv.py ADDED
@@ -0,0 +1,178 @@
+"""Unit tests for bioRxiv tool."""
+
+import pytest
+import respx
+from httpx import Response
+
+from src.tools.biorxiv import BioRxivTool
+from src.utils.models import Evidence
+
+
+@pytest.fixture
+def mock_biorxiv_response():
+    """Mock bioRxiv API response."""
+    return {
+        "collection": [
+            {
+                "doi": "10.1101/2024.01.15.24301234",
+                "title": "Metformin repurposing for Alzheimer's disease: a systematic review",
+                "authors": "Smith, John; Jones, Alice; Brown, Bob",
+                "date": "2024-01-15",
+                "category": "neurology",
+                "abstract": "Background: Metformin has shown neuroprotective effects. "
+                "We conducted a systematic review of metformin's potential "
+                "for Alzheimer's disease treatment.",
+            },
+            {
+                "doi": "10.1101/2024.01.10.24301111",
+                "title": "COVID-19 vaccine efficacy study",
+                "authors": "Wilson, C",
+                "date": "2024-01-10",
+                "category": "infectious diseases",
+                "abstract": "This study evaluates COVID-19 vaccine efficacy.",
+            },
+        ],
+        "messages": [{"status": "ok", "count": 2}],
+    }
+
+
+class TestBioRxivTool:
+    """Tests for BioRxivTool."""
+
+    def test_tool_name(self):
+        """Tool should have correct name."""
+        tool = BioRxivTool()
+        assert tool.name == "biorxiv"
+
+    def test_default_server_is_medrxiv(self):
+        """Default server should be medRxiv for medical relevance."""
+        tool = BioRxivTool()
+        assert tool.server == "medrxiv"
+
+    @pytest.mark.asyncio
+    @respx.mock
+    async def test_search_returns_evidence(self, mock_biorxiv_response):
+        """Search should return Evidence objects."""
+        respx.get(url__startswith="https://api.biorxiv.org/details").mock(
+            return_value=Response(200, json=mock_biorxiv_response)
+        )
+
+        tool = BioRxivTool()
+        results = await tool.search("metformin alzheimer", max_results=5)
+
+        assert len(results) == 1  # Only the matching paper
+        assert isinstance(results[0], Evidence)
+        assert results[0].citation.source == "biorxiv"
+        assert "metformin" in results[0].citation.title.lower()
+
+    @pytest.mark.asyncio
+    @respx.mock
+    async def test_search_filters_by_keywords(self, mock_biorxiv_response):
+        """Search should filter papers by query keywords."""
+        respx.get(url__startswith="https://api.biorxiv.org/details").mock(
+            return_value=Response(200, json=mock_biorxiv_response)
+        )
+
+        tool = BioRxivTool()
+
+        # Search for metformin - should match first paper
+        results = await tool.search("metformin")
+        assert len(results) == 1
+        assert "metformin" in results[0].citation.title.lower()
+
+        # Search for COVID - should match second paper
+        results = await tool.search("covid vaccine")
+        assert len(results) == 1
+        assert "covid" in results[0].citation.title.lower()
+
+    @pytest.mark.asyncio
+    @respx.mock
+    async def test_search_marks_as_preprint(self, mock_biorxiv_response):
+        """Evidence content should note it's a preprint."""
+        respx.get(url__startswith="https://api.biorxiv.org/details").mock(
+            return_value=Response(200, json=mock_biorxiv_response)
+        )
+
+        tool = BioRxivTool()
+        results = await tool.search("metformin")
+
+        assert "PREPRINT" in results[0].content
+        assert "Not peer-reviewed" in results[0].content
+
+    @pytest.mark.asyncio
+    @respx.mock
+    async def test_search_empty_results(self):
+        """Search should handle empty results gracefully."""
+        respx.get(url__startswith="https://api.biorxiv.org/details").mock(
+            return_value=Response(200, json={"collection": [], "messages": []})
+        )
+
+        tool = BioRxivTool()
+        results = await tool.search("xyznonexistent")
+
+        assert results == []
+
+    @pytest.mark.asyncio
+    @respx.mock
+    async def test_search_api_error(self):
+        """Search should raise SearchError on API failure."""
+        from src.utils.exceptions import SearchError
+
+        respx.get(url__startswith="https://api.biorxiv.org/details").mock(
+            return_value=Response(500, text="Internal Server Error")
+        )
+
+        tool = BioRxivTool()
+
+        with pytest.raises(SearchError):
+            await tool.search("metformin")
+
+    @pytest.mark.asyncio
+    @respx.mock
+    async def test_search_network_error(self):
+        """Search should raise SearchError on network failure."""
+        import httpx
+
+        from src.utils.exceptions import SearchError
+
+        respx.get(url__startswith="https://api.biorxiv.org/details").mock(
+            side_effect=httpx.RequestError("Network connection failed")
+        )
+
+        tool = BioRxivTool()
+
+        with pytest.raises(SearchError) as exc_info:
+            await tool.search("metformin")
+
+        assert "connection failed" in str(exc_info.value)
+
+    def test_extract_terms(self):
+        """Should extract meaningful search terms."""
+        tool = BioRxivTool()
+
+        terms = tool._extract_terms("metformin for Alzheimer's disease")
+
+        assert "metformin" in terms
+        assert "alzheimer" in terms
+        assert "disease" in terms
+        assert "for" not in terms  # Stop word
+        assert "the" not in terms  # Stop word
+
+
+class TestBioRxivIntegration:
+    """Integration tests (marked for separate run)."""
+
+    @pytest.mark.integration
+    @pytest.mark.asyncio
+    async def test_real_api_call(self):
+        """Test actual API call (requires network)."""
+        tool = BioRxivTool(days=30)  # Last 30 days
+        results = await tool.search("diabetes", max_results=3)
+
+        # May or may not find results depending on recent papers
+        # But we want to ensure the code runs without crashing
+        assert isinstance(results, list)
+        if results:
+            r = results[0]
+            assert isinstance(r, Evidence)
+            assert r.citation.source == "biorxiv"