Spaces:

DataQuests
/

DeepCritical

Running

VibecoderMcSwaggins committed 12 days ago

Commit

e502f0d

1 Parent(s): 2f8ae1f

feat(search): implement PubMed query preprocessing (Phase 02)

Added query preprocessing to strip question words and expand medical synonyms (e.g., 'Long COVID' -> 'PASC'). This fixes poor search results for natural language queries.

Files changed (4) hide show

src/tools/pubmed.py +6 -1
src/tools/query_utils.py +161 -0
tests/unit/tools/test_pubmed.py +28 -0
tests/unit/tools/test_query_utils.py +71 -0

src/tools/pubmed.py CHANGED Viewed

@@ -7,6 +7,7 @@ import httpx
 import xmltodict
 from tenacity import retry, stop_after_attempt, wait_exponential
 from src.utils.config import settings
 from src.utils.exceptions import RateLimitError, SearchError
 from src.utils.models import Citation, Evidence
@@ -61,11 +62,15 @@ class PubMedTool:
         """
         await self._rate_limit()
         async with httpx.AsyncClient(timeout=30.0) as client:
             # Step 1: Search for PMIDs
             search_params = self._build_params(
                 db="pubmed",
-                term=query,
                 retmax=max_results,
                 sort="relevance",
             )

 import xmltodict
 from tenacity import retry, stop_after_attempt, wait_exponential
+from src.tools.query_utils import preprocess_query
 from src.utils.config import settings
 from src.utils.exceptions import RateLimitError, SearchError
 from src.utils.models import Citation, Evidence
         """
         await self._rate_limit()
+        # Preprocess query to remove noise and expand synonyms
+        clean_query = preprocess_query(query)
+        final_query = clean_query if clean_query else query
         async with httpx.AsyncClient(timeout=30.0) as client:
             # Step 1: Search for PMIDs
             search_params = self._build_params(
                 db="pubmed",
+                term=final_query,
                 retmax=max_results,
                 sort="relevance",
             )

src/tools/query_utils.py ADDED Viewed

	@@ -0,0 +1,161 @@

+"""Query preprocessing utilities for biomedical search."""
+import re
# Lowercase tokens that carry no search value in a natural-language question.
# strip_question_words() drops every query word found in this set.
QUESTION_WORDS: set[str] = {
    # Interrogatives
    "what", "which", "how", "why", "when", "where", "who", "whom",
    # Auxiliary and modal verbs that typically open or pad a question
    "is", "are", "was", "were", "do", "does", "did",
    "can", "could", "would", "should", "will", "shall", "may", "might",
    # Conversational filler
    "show", "promise", "help", "believe", "think", "suggest",
    "possible", "potential", "effective", "useful", "good",
    # Articles
    "the", "a", "an",
}
# Lowercase trigger term -> phrases that expand_synonyms() joins into an OR group.
SYNONYMS: dict[str, list[str]] = {
    "long covid": [
        "long COVID", "PASC", "post-acute sequelae of SARS-CoV-2",
        "post-COVID syndrome", "post-COVID-19 condition",
    ],
    "alzheimer": ["Alzheimer's disease", "Alzheimer disease", "AD", "Alzheimer dementia"],
    "parkinson": ["Parkinson's disease", "Parkinson disease", "PD"],
    "diabetes": ["diabetes mellitus", "type 2 diabetes", "T2DM", "diabetic"],
    "cancer": ["cancer", "neoplasm", "tumor", "malignancy", "carcinoma"],
    "heart disease": [
        "cardiovascular disease", "CVD", "coronary artery disease", "heart failure",
    ],
}
# The annotation is quoted so it is never evaluated on interpreters older than 3.10.
def strip_question_words(query: str, stop_words: "set[str] | None" = None) -> str:
    """
    Remove question words and filler terms from query.

    The query is lowercased and split on whitespace. A word is dropped when,
    ignoring sentence punctuation attached to either end (``.,;:!?``), it is
    one of the stop words, so "what," and "effective." are removed just like
    "what" and "effective". Words that are kept retain their punctuation.
    Brackets and quotes are deliberately not treated as attached punctuation,
    so dropping a word can never unbalance a grouping the caller wrote.

    Args:
        query: Raw query string
        stop_words: Lowercase words to remove. Defaults to QUESTION_WORDS
            when omitted; an empty set removes nothing.
    Returns:
        Lowercased query with the stop words removed, joined by single spaces
    """
    if stop_words is None:
        stop_words = QUESTION_WORDS
    edge_punctuation = ".,;:!?"
    kept = [
        word
        for word in query.lower().split()
        if word.strip(edge_punctuation) not in stop_words
    ]
    return " ".join(kept)
# The annotation is quoted so it is never evaluated on interpreters older than 3.10.
def expand_synonyms(query: str, synonyms: "dict[str, list[str]] | None" = None) -> str:
    """
    Expand medical terms to include synonyms.

    The query is lowercased, then every known term is replaced by a
    parenthesised OR group of quoted phrases, e.g.
    ``cancer drug`` -> ``("cancer" OR "neoplasm" OR ...) drug``.

    Matching rules:
    - Terms match whole words only, so "anticancer" is left untouched.
    - A possessive suffix on the matched term ("alzheimer's") is absorbed
      into the group instead of being left dangling after it.
    - Multi-word terms tolerate any run of whitespace between their words.
    - All terms are replaced in a single pass, so text inserted by one
      expansion is never expanded again by another term.
    - The matched term itself is added to the front of the group when none
      of its expansions equals it, so expanding never drops what the user
      actually typed.

    Args:
        query: Query string
        synonyms: Mapping of term -> expansion phrases. Defaults to SYNONYMS
            when omitted.
    Returns:
        Lowercased query with synonym expansions in OR groups
    """
    table = SYNONYMS if synonyms is None else synonyms
    # Normalise keys the same way matched text is normalised below.
    lookup = {" ".join(term.lower().split()): phrases for term, phrases in table.items()}
    lookup.pop("", None)  # an empty term would match at every position
    lowered = query.lower()
    if not lookup:
        return lowered

    # Longest terms first so a longer term wins over a shorter one that is its prefix.
    alternatives = "|".join(
        r"\s+".join(re.escape(word) for word in term.split())
        for term in sorted(lookup, key=len, reverse=True)
    )
    pattern = re.compile(rf"(?<!\w)({alternatives})(?:['\u2019]s)?(?!\w)")

    def _or_group(match: re.Match) -> str:
        """Build the OR group for one matched term."""
        term = " ".join(match.group(1).split())
        phrases = list(lookup[term])
        if term not in (phrase.lower() for phrase in phrases):
            phrases.insert(0, term)
        return "(" + " OR ".join(f'"{phrase}"' for phrase in phrases) + ")"

    return pattern.sub(_or_group, lowered)
def preprocess_query(raw_query: str) -> str:
    """
    Turn a natural-language question into a PubMed search string.

    Steps, in order:
    1. Delete every "?" and trim surrounding whitespace
    2. Collapse each run of whitespace into a single space
    3. Drop question/filler words via strip_question_words()
    4. Replace known terms with OR groups via expand_synonyms()

    Args:
        raw_query: Natural language query from user
    Returns:
        The processed query, or "" when raw_query is empty or only whitespace.
        The result can also be "" when nothing is left after step 3.
    """
    # Guard clause: nothing to process.
    if not (raw_query and raw_query.strip()):
        return ""

    without_marks = raw_query.replace("?", "").strip()
    single_spaced = re.sub(r"\s+", " ", without_marks)
    return expand_synonyms(strip_question_words(single_spaced)).strip()

tests/unit/tools/test_pubmed.py CHANGED Viewed

@@ -97,3 +97,31 @@ class TestPubMedTool:
         assert len(results) == 1
         assert results[0].citation.source == "pubmed"
         assert "Smith John" in results[0].citation.authors

         assert len(results) == 1
         assert results[0].citation.source == "pubmed"
         assert "Smith John" in results[0].citation.authors
+    @pytest.mark.asyncio
+    async def test_search_preprocesses_query(self, mocker):
+        """Test that queries are preprocessed before search."""
+        mock_search_response = MagicMock()
+        mock_search_response.json.return_value = {"esearchresult": {"idlist": []}}
+        mock_search_response.raise_for_status = MagicMock()
+        mock_client = AsyncMock()
+        mock_client.get = AsyncMock(return_value=mock_search_response)
+        mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+        mock_client.__aexit__ = AsyncMock(return_value=None)
+        mocker.patch("httpx.AsyncClient", return_value=mock_client)
+        tool = PubMedTool()
+        await tool.search("What drugs help with Long COVID?")
+        # Verify call args
+        call_args = mock_client.get.call_args
+        params = call_args[1]["params"]
+        term = params["term"]
+        # "what" and "help" should be stripped
+        assert "what" not in term.lower()
+        assert "help" not in term.lower()
+        # "long covid" should be expanded
+        assert "PASC" in term or "post-COVID" in term

tests/unit/tools/test_query_utils.py ADDED Viewed

	@@ -0,0 +1,71 @@

+"""Unit tests for query preprocessing utilities."""
+import pytest
+from src.tools.query_utils import expand_synonyms, preprocess_query, strip_question_words
@pytest.mark.unit
class TestQueryPreprocessing:
    """Unit tests covering the query preprocessing helpers."""

    def test_strip_question_words(self):
        """Question and filler words are dropped from each sample query."""
        expectations = {
            "What drugs treat cancer": "drugs treat cancer",
            "Which medications help diabetes": "medications diabetes",
            "How can we cure alzheimer": "we cure alzheimer",
            "Is metformin effective": "metformin",
        }
        for raw, expected in expectations.items():
            assert strip_question_words(raw) == expected

    def test_strip_preserves_medical_terms(self):
        """Content words survive the stripping step."""
        stripped = strip_question_words("What is the mechanism of metformin")
        for keyword in ("metformin", "mechanism"):
            assert keyword in stripped

    def test_expand_synonyms_long_covid(self):
        """A "long covid" query gains at least one Long COVID synonym."""
        expanded = expand_synonyms("long covid treatment")
        assert any(marker in expanded for marker in ("PASC", "post-COVID"))

    def test_expand_synonyms_alzheimer(self):
        """An "alzheimer" query gains the capitalised disease names."""
        assert "Alzheimer" in expand_synonyms("alzheimer drug")

    def test_expand_synonyms_preserves_unknown(self):
        """Both words of the sample query are still present after expansion."""
        expanded = expand_synonyms("metformin diabetes")
        for word in ("metformin", "diabetes"):
            assert word in expanded

    def test_preprocess_query_full_pipeline(self):
        """The whole pipeline strips filler words and keeps the topic terms."""
        processed = preprocess_query("What medications show promise for Long COVID?")
        lowered = processed.lower()
        # Question and filler words are gone.
        for dropped in ("what", "show", "promise"):
            assert dropped not in lowered
        # The topic is still present, expanded or not.
        assert (
            any(marker in processed for marker in ("PASC", "post-COVID"))
            or "long covid" in lowered
        )
        assert "medications" in lowered or "drug" in lowered

    def test_preprocess_query_removes_punctuation(self):
        """No question mark is left in the processed query."""
        assert "?" not in preprocess_query("Is metformin safe?")

    def test_preprocess_query_handles_empty(self):
        """Empty and whitespace-only queries come back as an empty string."""
        for blank in ("", "   "):
            assert preprocess_query(blank) == ""

    def test_preprocess_query_already_clean(self):
        """Every word of an already clean query is still present afterwards."""
        processed = preprocess_query("metformin diabetes mechanism")
        for word in ("metformin", "diabetes", "mechanism"):
            assert word in processed