Joseph Pollack committed on
Commit 66c7f79 · unverified · 2 Parent(s): 687a1f1 e6c2142

Merge branch 'feature/iterative-deep-research-workflows' of https://github.com/Josephrp/DeepCritical-HFSpace into feature/iterative-deep-research-workflows

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .env.example +12 -8
  2. .gitignore +5 -0
  3. docs/brainstorming/00_ROADMAP_SUMMARY.md +194 -0
  4. docs/brainstorming/01_PUBMED_IMPROVEMENTS.md +125 -0
  5. docs/brainstorming/02_CLINICALTRIALS_IMPROVEMENTS.md +193 -0
  6. docs/brainstorming/03_EUROPEPMC_IMPROVEMENTS.md +211 -0
  7. docs/brainstorming/04_OPENALEX_INTEGRATION.md +303 -0
  8. docs/brainstorming/implementation/15_PHASE_OPENALEX.md +603 -0
  9. docs/brainstorming/implementation/16_PHASE_PUBMED_FULLTEXT.md +586 -0
  10. docs/brainstorming/implementation/17_PHASE_RATE_LIMITING.md +540 -0
  11. docs/brainstorming/implementation/README.md +143 -0
  12. docs/brainstorming/magentic-pydantic/00_SITUATION_AND_PLAN.md +189 -0
  13. docs/brainstorming/magentic-pydantic/01_ARCHITECTURE_SPEC.md +289 -0
  14. docs/brainstorming/magentic-pydantic/02_IMPLEMENTATION_PHASES.md +112 -0
  15. docs/brainstorming/magentic-pydantic/03_IMMEDIATE_ACTIONS.md +112 -0
  16. docs/brainstorming/magentic-pydantic/04_FOLLOWUP_REVIEW_REQUEST.md +158 -0
  17. docs/brainstorming/magentic-pydantic/REVIEW_PROMPT_FOR_SENIOR_AGENT.md +113 -0
  18. docs/bugs/FIX_PLAN_MAGENTIC_MODE.md +227 -0
  19. docs/bugs/P0_ACTIONABLE_FIXES.md +0 -281
  20. docs/bugs/P0_CRITICAL_BUGS.md +0 -298
  21. docs/bugs/P0_MAGENTIC_AND_SEARCH_AUDIT.md +0 -249
  22. docs/bugs/P0_MAGENTIC_MODE_BROKEN.md +116 -0
  23. docs/bugs/P1_GRADIO_SETTINGS_CLEANUP.md +81 -0
  24. docs/bugs/PHASE_00_IMPLEMENTATION_ORDER.md +0 -156
  25. docs/bugs/PHASE_01_REPLACE_BIORXIV.md +0 -371
  26. docs/bugs/PHASE_02_PUBMED_QUERY_PREPROCESSING.md +0 -355
  27. docs/bugs/PHASE_03_CLINICALTRIALS_FILTERING.md +0 -386
  28. examples/rate_limiting_demo.py +82 -0
  29. pyproject.toml +3 -1
  30. requirements.txt +7 -0
  31. src/agent_factory/judges.py +9 -1
  32. src/agents/code_executor_agent.py +69 -0
  33. src/agents/judge_agent_llm.py +45 -0
  34. src/agents/magentic_agents.py +1 -1
  35. src/agents/retrieval_agent.py +82 -0
  36. src/app.py +60 -69
  37. src/middleware/sub_iteration.py +135 -0
  38. src/orchestrator_factory.py +40 -15
  39. src/orchestrator_hierarchical.py +95 -0
  40. src/orchestrator_magentic.py +35 -4
  41. src/state/__init__.py +9 -0
  42. src/tools/__init__.py +2 -0
  43. src/tools/pubmed.py +5 -9
  44. src/tools/rate_limiter.py +121 -0
  45. src/tools/web_search.py +53 -0
  46. src/utils/config.py +14 -2
  47. src/utils/models.py +4 -0
  48. tests/integration/test_dual_mode_e2e.py +82 -0
  49. tests/integration/test_modal.py +11 -2
  50. tests/unit/agent_factory/test_judges_factory.py +64 -0
.env.example CHANGED
@@ -7,9 +7,17 @@ LLM_PROVIDER=openai
 OPENAI_API_KEY=sk-your-key-here
 ANTHROPIC_API_KEY=sk-ant-your-key-here
 
-# Model names (optional - sensible defaults)
-OPENAI_MODEL=gpt-5.1
-ANTHROPIC_MODEL=claude-sonnet-4-5-20250929
+# Model names (optional - sensible defaults set in config.py)
+# ANTHROPIC_MODEL=claude-sonnet-4-5-20250929
+# OPENAI_MODEL=gpt-5.1
+
+# ============== EMBEDDINGS ==============
+
+# OpenAI Embedding Model (used if LLM_PROVIDER is openai and performing RAG/Embeddings)
+OPENAI_EMBEDDING_MODEL=text-embedding-3-small
+
+# Local Embedding Model (used for local/offline embeddings)
+LOCAL_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
 
 # ============== HUGGINGFACE (FREE TIER) ==============
 
@@ -20,7 +28,7 @@ ANTHROPIC_MODEL=claude-sonnet-4-5-20250929
 # WITH HF_TOKEN: Uses Llama 3.1 8B Instruct (requires accepting license)
 #
 # For HuggingFace Spaces deployment:
-# Set this as a "Secret" in Space Settings Variables and secrets
+# Set this as a "Secret" in Space Settings -> Variables and secrets
 # Users/judges don't need their own token - the Space secret is used
 #
 HF_TOKEN=hf_your-token-here
@@ -36,9 +44,5 @@ LOG_LEVEL=INFO
 # PubMed (optional - higher rate limits)
 NCBI_API_KEY=your-ncbi-key-here
 
-# Modal Sandbox (optional - for secure code execution)
-MODAL_TOKEN_ID=ak-your-modal-token-id-here
-MODAL_TOKEN_SECRET=your-modal-token-secret-here
-
 # Vector Database (optional - for LlamaIndex RAG)
 CHROMA_DB_PATH=./chroma_db
.gitignore CHANGED
@@ -69,4 +69,9 @@ logs/
 .mypy_cache/
 .coverage
 htmlcov/
+
+# Database files
+chroma_db/
+*.sqlite3
+
 # Trigger rebuild Wed Nov 26 17:51:41 EST 2025
docs/brainstorming/00_ROADMAP_SUMMARY.md ADDED
@@ -0,0 +1,194 @@
+ # DeepCritical Data Sources: Roadmap Summary
+
+ **Created**: 2024-11-27
+ **Purpose**: Future maintainability and hackathon continuation
+
+ ---
+
+ ## Current State
+
+ ### Working Tools
+
+ | Tool | Status | Data Quality |
+ |------|--------|--------------|
+ | PubMed | ✅ Works | Good (abstracts only) |
+ | ClinicalTrials.gov | ✅ Works | Good (filtered for interventional) |
+ | Europe PMC | ✅ Works | Good (includes preprints) |
+
+ ### Removed Tools
+
+ | Tool | Status | Reason |
+ |------|--------|--------|
+ | bioRxiv | ❌ Removed | No search API - only date/DOI lookup |
+
+ ---
+
+ ## Priority Improvements
+
+ ### P0: Critical (Do First)
+
+ 1. **Add Rate Limiting to PubMed**
+    - NCBI will block us without it
+    - Use `limits` library (see reference repo)
+    - 3/sec without key, 10/sec with key
+
+ ### P1: High Value, Medium Effort
+
+ 2. **Add OpenAlex as 4th Source**
+    - Citation network (huge for drug repurposing)
+    - Concept tagging (semantic discovery)
+    - Already implemented in reference repo
+    - Free, no API key
+
+ 3. **PubMed Full-Text via BioC**
+    - Get full paper text for PMC papers
+    - Already in reference repo
+
+ ### P2: Nice to Have
+
+ 4. **ClinicalTrials.gov Results**
+    - Get efficacy data from completed trials
+    - Requires more complex API calls
+
+ 5. **Europe PMC Annotations**
+    - Text-mined entities (genes, drugs, diseases)
+    - Automatic entity extraction
+
+ ---
+
+ ## Effort Estimates
+
+ | Improvement | Effort | Impact | Priority |
+ |-------------|--------|--------|----------|
+ | PubMed rate limiting | 1 hour | Stability | P0 |
+ | OpenAlex basic search | 2 hours | High | P1 |
+ | OpenAlex citations | 2 hours | Very High | P1 |
+ | PubMed full-text | 3 hours | Medium | P1 |
+ | CT.gov results | 4 hours | Medium | P2 |
+ | Europe PMC annotations | 3 hours | Medium | P2 |
+
+ ---
+
+ ## Architecture Decision
+
+ ### Option A: Keep Current + Add OpenAlex
+
+ ```
+                  User Query
+                      │
+      ┌───────────────┼───────────────┐
+      ↓               ↓               ↓
+   PubMed      ClinicalTrials    Europe PMC
+ (abstracts)   (trials only)    (preprints)
+      ↓               ↓               ↓
+      └───────────────┼───────────────┘
+                      ↓
+                  OpenAlex  ← NEW
+           (citations, concepts)
+                      ↓
+                Orchestrator
+                      ↓
+                   Report
+ ```
+
+ **Pros**: Low risk, additive
+ **Cons**: More complexity, some overlap
+
+ ### Option B: OpenAlex as Primary
+
+ ```
+                  User Query
+                      │
+      ┌───────────────┼───────────────┐
+      ↓               ↓               ↓
+  OpenAlex     ClinicalTrials    Europe PMC
+  (primary     (trials only)    (full-text
+   search)                       fallback)
+      ↓               ↓               ↓
+      └───────────────┼───────────────┘
+                      ↓
+                Orchestrator
+                      ↓
+                   Report
+ ```
+
+ **Pros**: Simpler, citation network built-in
+ **Cons**: Lose some PubMed-specific features
+
+ ### Recommendation: Option A
+
+ Keep current architecture working, add OpenAlex incrementally.
+
+ ---
+
+ ## Quick Wins (Can Do Today)
+
+ 1. **Add `limits` to `pyproject.toml`**
+    ```toml
+    dependencies = [
+        "limits>=3.0",
+    ]
+    ```
+
+ 2. **Copy OpenAlex tool from reference repo**
+    - File: `reference_repos/DeepCritical/DeepResearch/src/tools/openalex_tools.py`
+    - Adapt to our `SearchTool` base class
+
+ 3. **Enable NCBI API Key**
+    - Add to `.env`: `NCBI_API_KEY=your_key`
+    - 10x rate limit improvement
+
+ ---
+
+ ## External Resources Worth Exploring
+
+ ### Python Libraries
+
+ | Library | For | Notes |
+ |---------|-----|-------|
+ | `limits` | Rate limiting | Used by reference repo |
+ | `pyalex` | OpenAlex wrapper | [GitHub](https://github.com/J535D165/pyalex) |
+ | `metapub` | PubMed | Full-featured |
+ | `sentence-transformers` | Semantic search | For embeddings |
+
+ ### APIs Not Yet Used
+
+ | API | Provides | Effort |
+ |-----|----------|--------|
+ | RxNorm | Drug name normalization | Low |
+ | DrugBank | Drug targets/mechanisms | Medium (license) |
+ | UniProt | Protein data | Medium |
+ | ChEMBL | Bioactivity data | Medium |
+
+ ### RAG Tools (Future)
+
+ | Tool | Purpose |
+ |------|---------|
+ | [PaperQA](https://github.com/Future-House/paper-qa) | RAG for scientific papers |
+ | [txtai](https://github.com/neuml/txtai) | Embeddings + search |
+ | [PubMedBERT](https://huggingface.co/NeuML/pubmedbert-base-embeddings) | Biomedical embeddings |
+
+ ---
+
+ ## Files in This Directory
+
+ | File | Contents |
+ |------|----------|
+ | `00_ROADMAP_SUMMARY.md` | This file |
+ | `01_PUBMED_IMPROVEMENTS.md` | PubMed enhancement details |
+ | `02_CLINICALTRIALS_IMPROVEMENTS.md` | ClinicalTrials.gov details |
+ | `03_EUROPEPMC_IMPROVEMENTS.md` | Europe PMC details |
+ | `04_OPENALEX_INTEGRATION.md` | OpenAlex integration plan |
+
+ ---
+
+ ## For Future Maintainers
+
+ If you're picking this up after the hackathon:
+
+ 1. **Start with OpenAlex** - biggest bang for buck
+ 2. **Add rate limiting** - prevents API blocks
+ 3. **Don't bother with bioRxiv** - use Europe PMC instead
+ 4. **Reference repo is gold** - `reference_repos/DeepCritical/` has working implementations
+
+ Good luck! 🚀
docs/brainstorming/01_PUBMED_IMPROVEMENTS.md ADDED
@@ -0,0 +1,125 @@
+ # PubMed Tool: Current State & Future Improvements
+
+ **Status**: Currently Implemented
+ **Priority**: High (Core Data Source)
+
+ ---
+
+ ## Current Implementation
+
+ ### What We Have (`src/tools/pubmed.py`)
+
+ - Basic E-utilities search via `esearch.fcgi` and `efetch.fcgi`
+ - Query preprocessing (strips question words, expands synonyms)
+ - Returns: title, abstract, authors, journal, PMID
+ - Rate limiting: None implemented (relying on NCBI defaults)
+
+ ### Current Limitations
+
+ 1. **No Full-Text Access**: Only retrieves abstracts, not full paper text
+ 2. **No Rate Limiting**: Risk of being blocked by NCBI
+ 3. **No BioC Format**: Missing structured full-text extraction
+ 4. **No Figure Retrieval**: No supplementary materials access
+ 5. **No PMC Integration**: Missing open-access full-text via PMC
+
+ ---
+
+ ## Reference Implementation (DeepCritical Reference Repo)
+
+ The reference repo at `reference_repos/DeepCritical/DeepResearch/src/tools/bioinformatics_tools.py` has a more sophisticated implementation:
+
+ ### Features We're Missing
+
+ ```python
+ # Rate limiting (lines 47-50)
+ from limits import parse
+ from limits.storage import MemoryStorage
+ from limits.strategies import MovingWindowRateLimiter
+
+ storage = MemoryStorage()
+ limiter = MovingWindowRateLimiter(storage)
+ rate_limit = parse("3/second")  # NCBI allows 3/sec without API key, 10/sec with
+
+ # Full-text via BioC format (lines 108-120)
+ def _get_fulltext(pmid: int) -> dict[str, Any] | None:
+     pmid_url = f"https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{pmid}/unicode"
+     # Returns structured JSON with full text for open-access papers
+
+ # Figure retrieval via Europe PMC (lines 123-149)
+ def _get_figures(pmcid: str) -> dict[str, str]:
+     suppl_url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/supplementaryFiles"
+     # Returns base64-encoded images from supplementary materials
+ ```
+
+ ---
+
+ ## Recommended Improvements
+
+ ### Phase 1: Rate Limiting (Critical)
+
+ ```python
+ # Add to src/tools/pubmed.py
+ from limits import parse
+ from limits.storage import MemoryStorage
+ from limits.strategies import MovingWindowRateLimiter
+
+ storage = MemoryStorage()
+ limiter = MovingWindowRateLimiter(storage)
+
+ # With NCBI_API_KEY: 10/sec, without: 3/sec
+ def get_rate_limit():
+     if settings.ncbi_api_key:
+         return parse("10/second")
+     return parse("3/second")
+ ```
+
+ **Dependencies**: `pip install limits`
+
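+ A minimal usage sketch, assuming the `limits` objects above (the `"pubmed"` bucket name is arbitrary): each request would first acquire a slot.
+
+ ```python
+ import asyncio
+
+ async def wait_for_slot() -> None:
+     """Block until the moving-window limiter grants a request slot."""
+     rate_limit = get_rate_limit()
+     # hit() returns False while the current window is exhausted
+     while not limiter.hit(rate_limit, "pubmed"):
+         await asyncio.sleep(0.1)
+
+ # Call `await wait_for_slot()` before every esearch/efetch request.
+ ```
+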
+ ### Phase 2: Full-Text Retrieval
+
+ ```python
+ async def get_fulltext(pmid: str) -> str | None:
+     """Get full text for open-access papers via BioC API."""
+     url = f"https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{pmid}/unicode"
+     # Only works for PMC papers (open access)
+ ```
+
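+ The BioC JSON can then be flattened to plain text, roughly as below (a sketch; the collection → documents → passages nesting is assumed from the BioC format):
+
+ ```python
+ def bioc_to_text(bioc: dict) -> str:
+     """Concatenate passage text from a BioC collection."""
+     parts: list[str] = []
+     for doc in bioc.get("documents", []):
+         for passage in doc.get("passages", []):
+             if passage.get("text"):
+                 parts.append(passage["text"])
+     return "\n\n".join(parts)
+ ```
+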
+ ### Phase 3: PMC ID Resolution
+
+ ```python
+ async def get_pmc_id(pmid: str) -> str | None:
+     """Convert PMID to PMCID for full-text access."""
+     url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids={pmid}&format=json"
+     async with httpx.AsyncClient() as client:
+         data = (await client.get(url)).json()
+     # Response shape: {"records": [{"pmid": ..., "pmcid": "PMC..."}]}
+     records = data.get("records", [])
+     return records[0].get("pmcid") if records else None
+ ```
+
+ ---
+
+ ## Python Libraries to Consider
+
+ | Library | Purpose | Notes |
+ |---------|---------|-------|
+ | [Biopython](https://biopython.org/) | `Bio.Entrez` module | Official, well-maintained |
+ | [PyMed](https://pypi.org/project/pymed/) | PubMed wrapper | Simpler API, less control |
+ | [metapub](https://pypi.org/project/metapub/) | Full-featured | Tested on 1/3 of PubMed |
+ | [limits](https://pypi.org/project/limits/) | Rate limiting | Used by reference repo |
+
+ ---
+
+ ## API Endpoints Reference
+
+ | Endpoint | Purpose | Rate Limit |
+ |----------|---------|------------|
+ | `esearch.fcgi` | Search for PMIDs | 3/sec (10 with key) |
+ | `efetch.fcgi` | Fetch metadata | 3/sec (10 with key) |
+ | `esummary.fcgi` | Quick metadata | 3/sec (10 with key) |
+ | `pmcoa.cgi/BioC_json` | Full text (PMC only) | Unknown |
+ | `idconv/v1.0` | PMID ↔ PMCID | Unknown |
+
+ ---
+
+ ## Sources
+
+ - [PubMed E-utilities Documentation](https://www.ncbi.nlm.nih.gov/books/NBK25501/)
+ - [NCBI BioC API](https://www.ncbi.nlm.nih.gov/research/bionlp/APIs/)
+ - [Searching PubMed with Python](https://marcobonzanini.com/2015/01/12/searching-pubmed-with-python/)
+ - [PyMed on PyPI](https://pypi.org/project/pymed/)
docs/brainstorming/02_CLINICALTRIALS_IMPROVEMENTS.md ADDED
@@ -0,0 +1,193 @@
+ # ClinicalTrials.gov Tool: Current State & Future Improvements
+
+ **Status**: Currently Implemented
+ **Priority**: High (Core Data Source for Drug Repurposing)
+
+ ---
+
+ ## Current Implementation
+
+ ### What We Have (`src/tools/clinicaltrials.py`)
+
+ - V2 API search via `clinicaltrials.gov/api/v2/studies`
+ - Filters: `INTERVENTIONAL` study type, `RECRUITING` status
+ - Returns: NCT ID, title, conditions, interventions, phase, status
+ - Query preprocessing via shared `query_utils.py`
+
+ ### Current Strengths
+
+ 1. **Good Filtering**: Already filtering for interventional + recruiting
+ 2. **V2 API**: Using the modern API (v1 deprecated)
+ 3. **Phase Info**: Extracting trial phases for drug development context
+
+ ### Current Limitations
+
+ 1. **No Outcome Data**: Missing primary/secondary outcomes
+ 2. **No Eligibility Criteria**: Missing inclusion/exclusion details
+ 3. **No Sponsor Info**: Missing who's running the trial
+ 4. **No Result Data**: For completed trials, no efficacy data
+ 5. **Limited Drug Mapping**: No integration with drug databases
+
+ ---
+
+ ## API Capabilities We're Not Using
+
+ ### Fields We Could Request
+
+ ```python
+ # Current fields
+ fields = ["NCTId", "BriefTitle", "Condition", "InterventionName", "Phase", "OverallStatus"]
+
+ # Additional valuable fields
+ additional_fields = [
+     "PrimaryOutcomeMeasure",    # What are they measuring?
+     "SecondaryOutcomeMeasure",  # Secondary endpoints
+     "EligibilityCriteria",      # Who can participate?
+     "LeadSponsorName",          # Who's funding?
+     "ResultsFirstPostDate",     # Has results?
+     "StudyFirstPostDate",       # When started?
+     "CompletionDate",           # When finished?
+     "EnrollmentCount",          # Sample size
+     "InterventionDescription",  # Drug details
+     "ArmGroupLabel",            # Treatment arms
+     "InterventionOtherName",    # Drug aliases
+ ]
+ ```
+
+ ### Filter Enhancements
+
+ ```python
+ # Current
+ aggFilters = "studyType:INTERVENTIONAL,status:RECRUITING"
+
+ # Could add
+ "status:RECRUITING,ACTIVE_NOT_RECRUITING,COMPLETED"  # Include completed for results
+ "phase:PHASE2,PHASE3"                                # Only later-stage trials
+ "resultsFirstPostDateRange:2020-01-01_"              # Trials with posted results
+ ```
+
+ ---
+
+ ## Recommended Improvements
+
+ ### Phase 1: Richer Metadata
+
+ ```python
+ EXTENDED_FIELDS = [
+     "NCTId",
+     "BriefTitle",
+     "OfficialTitle",
+     "Condition",
+     "InterventionName",
+     "InterventionDescription",
+     "InterventionOtherName",  # Drug synonyms!
+     "Phase",
+     "OverallStatus",
+     "PrimaryOutcomeMeasure",
+     "EnrollmentCount",
+     "LeadSponsorName",
+     "StudyFirstPostDate",
+ ]
+ ```
+
+ ### Phase 2: Results Retrieval
+
+ For completed trials, we can get actual efficacy data:
+
+ ```python
+ async def get_trial_results(nct_id: str) -> dict | None:
+     """Fetch results for completed trials."""
+     url = f"https://clinicaltrials.gov/api/v2/studies/{nct_id}"
+     params = {
+         "fields": "ResultsSection",
+     }
+     # Returns outcome measures and statistics
+ ```
+
+ ### Phase 3: Drug Name Normalization
+
+ Map intervention names to standard identifiers:
+
+ ```python
+ # Problem: "Metformin", "Metformin HCl", "Glucophage" are the same drug
+ # Solution: Use RxNorm or DrugBank for normalization
+
+ async def normalize_drug_name(intervention: str) -> str | None:
+     """Normalize drug name via RxNorm API; returns the standardized RxCUI."""
+     url = f"https://rxnav.nlm.nih.gov/REST/rxcui.json?name={intervention}"
+     async with httpx.AsyncClient() as client:
+         data = (await client.get(url)).json()
+     ids = data.get("idGroup", {}).get("rxnormId", [])
+     return ids[0] if ids else None
+ ```
+
+ ---
+
+ ## Integration Opportunities
+
+ ### With PubMed
+
+ Cross-reference trials with publications:
+ ```python
+ # ClinicalTrials.gov provides PMID links
+ # Can correlate trial results with published papers
+ ```
+
+ ### With DrugBank/ChEMBL
+
+ Map interventions to:
+ - Mechanism of action
+ - Known targets
+ - Adverse effects
+ - Drug-drug interactions
+
+ ---
+
+ ## Python Libraries to Consider
+
+ | Library | Purpose | Notes |
+ |---------|---------|-------|
+ | [pytrials](https://pypi.org/project/pytrials/) | CT.gov wrapper | V2 API support unclear |
+ | [clinicaltrials](https://github.com/ebmdatalab/clinicaltrials-act-tracker) | Data tracking | More for analysis |
+ | [drugbank-downloader](https://pypi.org/project/drugbank-downloader/) | Drug mapping | Requires license |
+
+ ---
+
+ ## API Quirks & Gotchas
+
+ 1. **Rate Limiting**: Undocumented, be conservative
+ 2. **Pagination**: Max 1000 results per request
+ 3. **Field Names**: Case-sensitive, camelCase
+ 4. **Empty Results**: Some fields may be null even if requested
+ 5. **Status Changes**: Trials change status frequently
+
+ ---
+
+ ## Example Enhanced Query
+
+ The function below builds the request parameters; a sketch of executing and paging them follows.
+
+ ```python
+ async def search_drug_repurposing_trials(
+     drug_name: str,
+     condition: str,
+     include_completed: bool = True,
+ ) -> list[Evidence]:
+     """Search for trials repurposing a drug for a new condition."""
+
+     statuses = ["RECRUITING", "ACTIVE_NOT_RECRUITING"]
+     if include_completed:
+         statuses.append("COMPLETED")
+
+     params = {
+         "query.intr": drug_name,
+         "query.cond": condition,
+         "filter.overallStatus": ",".join(statuses),
+         "filter.studyType": "INTERVENTIONAL",
+         "fields": ",".join(EXTENDED_FIELDS),
+         "pageSize": 50,
+     }
+ ```
+
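+ A hedged sketch of executing that query, including the pagination quirk noted above (the v2 API returns `studies` plus an optional `nextPageToken`; `Evidence` mapping omitted):
+
+ ```python
+ import httpx
+
+ async def fetch_all_studies(params: dict, max_pages: int = 5) -> list[dict]:
+     """Follow nextPageToken until exhausted or max_pages is reached."""
+     studies: list[dict] = []
+     async with httpx.AsyncClient(timeout=30.0) as client:
+         for _ in range(max_pages):
+             resp = await client.get("https://clinicaltrials.gov/api/v2/studies", params=params)
+             resp.raise_for_status()
+             data = resp.json()
+             studies.extend(data.get("studies", []))
+             token = data.get("nextPageToken")
+             if not token:
+                 break
+             params = {**params, "pageToken": token}
+     return studies
+ ```
+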
+ ---
+
+ ## Sources
+
+ - [ClinicalTrials.gov API Documentation](https://clinicaltrials.gov/data-api/api)
+ - [CT.gov Field Definitions](https://clinicaltrials.gov/data-api/about-api/study-data-structure)
+ - [RxNorm API](https://lhncbc.nlm.nih.gov/RxNav/APIs/api-RxNorm.findRxcuiByString.html)
docs/brainstorming/03_EUROPEPMC_IMPROVEMENTS.md ADDED
@@ -0,0 +1,211 @@
+ # Europe PMC Tool: Current State & Future Improvements
+
+ **Status**: Currently Implemented (Replaced bioRxiv)
+ **Priority**: High (Preprint + Open Access Source)
+
+ ---
+
+ ## Why Europe PMC Over bioRxiv?
+
+ ### bioRxiv API Limitations (Why We Abandoned It)
+
+ 1. **No Search API**: Only returns papers by date range or DOI
+ 2. **No Query Capability**: Cannot search for "metformin cancer"
+ 3. **Workaround Required**: Would need to download ALL preprints and build local search
+ 4. **Known Issue**: [Gradio Issue #8861](https://github.com/gradio-app/gradio/issues/8861) documents the limitation
+
+ ### Europe PMC Advantages
+
+ 1. **Full Search API**: Boolean queries, filters, facets
+ 2. **Aggregates bioRxiv**: Includes bioRxiv, medRxiv content anyway
+ 3. **Includes PubMed**: Also has MEDLINE content
+ 4. **34 Preprint Servers**: Not just bioRxiv
+ 5. **Open Access Focus**: Full-text when available
+
+ ---
+
+ ## Current Implementation
+
+ ### What We Have (`src/tools/europepmc.py`)
+
+ - REST API search via `europepmc.org/webservices/rest/search`
+ - Preprint flagging via `firstPublicationDate` heuristics
+ - Returns: title, abstract, authors, DOI, source
+ - Marks preprints for transparency
+
+ ### Current Limitations
+
+ 1. **No Full-Text Retrieval**: Only metadata/abstracts
+ 2. **No Citation Network**: Missing references/citations
+ 3. **No Supplementary Files**: Not fetching figures/data
+ 4. **Basic Preprint Detection**: Heuristic, not explicit flag
+
+ ---
+
+ ## Europe PMC API Capabilities
+
+ ### Endpoints We Could Use
+
+ | Endpoint | Purpose | Currently Using |
+ |----------|---------|-----------------|
+ | `/search` | Query papers | Yes |
+ | `/fulltext/{ID}` | Full text (XML/JSON) | No |
+ | `/{PMCID}/supplementaryFiles` | Figures, data | No |
+ | `/citations/{ID}` | Who cited this | No |
+ | `/references/{ID}` | What this cites | No |
+ | `/annotations` | Text-mined entities | No |
+
+ ### Rich Query Syntax
+
+ ```python
+ # Current simple query
+ query = "metformin cancer"
+
+ # Could use advanced syntax
+ query = "(TITLE:metformin OR ABSTRACT:metformin) AND (cancer OR oncology)"
+ query += " AND (SRC:PPR)"                                # Only preprints
+ query += " AND (FIRST_PDATE:[2023-01-01 TO 2024-12-31])"  # Date range
+ query += " AND (OPEN_ACCESS:y)"                          # Only open access
+ ```
+
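+ A sketch of sending such a query to the search endpoint used above (parameter names per the REST docs; mapping to our result model omitted):
+
+ ```python
+ import httpx
+
+ async def europepmc_search(query: str, page_size: int = 25) -> list[dict]:
+     """Run a raw Europe PMC query and return the result records."""
+     async with httpx.AsyncClient(timeout=30.0) as client:
+         resp = await client.get(
+             "https://www.ebi.ac.uk/europepmc/webservices/rest/search",
+             params={"query": query, "format": "json", "pageSize": page_size},
+         )
+         resp.raise_for_status()
+         return resp.json().get("resultList", {}).get("result", [])
+ ```
+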
+ ### Source Filters
+
+ ```python
+ # Filter by source
+ "SRC:MED"  # MEDLINE
+ "SRC:PMC"  # PubMed Central
+ "SRC:PPR"  # Preprints (bioRxiv, medRxiv, etc.)
+ "SRC:AGR"  # Agricola
+ "SRC:CBA"  # Chinese Biological Abstracts
+ ```
+
+ ---
+
+ ## Recommended Improvements
+
+ ### Phase 1: Rich Metadata
+
+ ```python
+ # Add to search results
+ additional_fields = [
+     "citedByCount",        # Impact indicator
+     "source",              # Explicit source (MED, PMC, PPR)
+     "isOpenAccess",        # Boolean flag
+     "fullTextUrlList",     # URLs for full text
+     "authorAffiliations",  # Institution info
+     "grantsList",          # Funding info
+ ]
+ ```
+
+ ### Phase 2: Full-Text Retrieval
+
+ ```python
+ async def get_fulltext(pmcid: str) -> str | None:
+     """Get full text for open access papers."""
+     # XML format
+     url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML"
+     # Or JSON
+     url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextJSON"
+ ```
+
+ ### Phase 3: Citation Network
+
+ ```python
+ async def get_citations(pmcid: str) -> list[str]:
+     """Get papers that cite this one."""
+     url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/citations"
+
+ async def get_references(pmcid: str) -> list[str]:
+     """Get papers this one cites."""
+     url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/references"
+ ```
+
+ ### Phase 4: Text-Mined Annotations
+
+ Europe PMC extracts entities automatically:
+
+ ```python
+ async def get_annotations(pmcid: str) -> dict:
+     """Get text-mined entities (genes, diseases, drugs)."""
+     url = "https://www.ebi.ac.uk/europepmc/annotations_api/annotationsByArticleIds"
+     params = {
+         "articleIds": f"PMC:{pmcid}",
+         "type": "Gene_Proteins,Diseases,Chemicals",
+         "format": "JSON",
+     }
+     # Returns structured entity mentions with positions
+ ```
+
+ ---
+
+ ## Supplementary File Retrieval
+
+ From reference repo (`bioinformatics_tools.py` lines 123-149):
+
+ ```python
+ def get_figures(pmcid: str) -> dict[str, str]:
+     """Download figures and supplementary files."""
+     suppl_url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/supplementaryFiles?includeInlineImage=true"
+     # Returns a ZIP of images, re-encoded as base64
+ ```
+
+ ---
+
+ ## Preprint-Specific Features
+
+ ### Identify Preprint Servers
+
+ ```python
+ PREPRINT_SOURCES = {
+     "PPR": "General preprints",
+     "bioRxiv": "Biology preprints",
+     "medRxiv": "Medical preprints",
+     "chemRxiv": "Chemistry preprints",
+     "Research Square": "Multi-disciplinary",
+     "Preprints.org": "MDPI preprints",
+ }
+
+ # Check if published version exists
+ async def check_published_version(preprint_doi: str) -> str | None:
+     """Check if preprint has been peer-reviewed and published."""
+     # Europe PMC links preprints to final versions
+ ```
+
+ ---
+
+ ## Rate Limiting
+
+ Europe PMC is more generous than NCBI:
+
+ ```python
+ # No documented hard limit, but be respectful
+ # Recommend: 10-20 requests/second max
+ # Use email in User-Agent for polite pool
+ headers = {
+     "User-Agent": "DeepCritical/1.0 (mailto:your@email.com)"
+ }
+ ```
+
+ ---
+
+ ## vs. The Lens & OpenAlex
+
+ | Feature | Europe PMC | The Lens | OpenAlex |
+ |---------|------------|----------|----------|
+ | Biomedical Focus | Yes | Partial | Partial |
+ | Preprints | Yes (34 servers) | Yes | Yes |
+ | Full Text | PMC papers | Links | No |
+ | Citations | Yes | Yes | Yes |
+ | Annotations | Yes (text-mined) | No | No |
+ | Rate Limits | Generous | Moderate | Very generous |
+ | API Key | Optional | Required | Optional |
+
+ ---
+
+ ## Sources
+
+ - [Europe PMC REST API](https://europepmc.org/RestfulWebService)
+ - [Europe PMC Annotations API](https://europepmc.org/AnnotationsApi)
+ - [Europe PMC Articles API](https://europepmc.org/ArticlesApi)
+ - [rOpenSci medrxivr](https://docs.ropensci.org/medrxivr/)
+ - [bioRxiv TDM Resources](https://www.biorxiv.org/tdm)
docs/brainstorming/04_OPENALEX_INTEGRATION.md ADDED
@@ -0,0 +1,303 @@
+ # OpenAlex Integration: The Missing Piece?
+
+ **Status**: NOT Implemented (Candidate for Addition)
+ **Priority**: HIGH - Could Replace Multiple Tools
+ **Reference**: Already implemented in `reference_repos/DeepCritical`
+
+ ---
+
+ ## What is OpenAlex?
+
+ OpenAlex is a **fully open** index of the global research system:
+
+ - **209M+ works** (papers, books, datasets)
+ - **2B+ author records** (disambiguated)
+ - **124K+ venues** (journals, repositories)
+ - **109K+ institutions**
+ - **65K+ concepts** (hierarchical, linked to Wikidata)
+
+ **Free. Open. No API key required.**
+
+ ---
+
+ ## Why OpenAlex for DeepCritical?
+
+ ### Current Architecture
+
+ ```
+                  User Query
+                      ↓
+ ┌──────────────────────────────────────┐
+ │  PubMed   ClinicalTrials  Europe PMC │  ← 3 separate APIs
+ └──────────────────────────────────────┘
+                      ↓
+   Orchestrator (deduplicate, judge, synthesize)
+ ```
+
+ ### With OpenAlex
+
+ ```
+                  User Query
+                      ↓
+ ┌──────────────────────────────────────┐
+ │               OpenAlex               │  ← Single API
+ │    (includes PubMed + preprints +    │
+ │    citations + concepts + authors)   │
+ └──────────────────────────────────────┘
+                      ↓
+   Orchestrator (enrich with CT.gov for trials)
+ ```
+
+ **OpenAlex already aggregates**:
+ - PubMed/MEDLINE
+ - Crossref
+ - ORCID
+ - Unpaywall (open access links)
+ - Microsoft Academic Graph (legacy)
+ - Preprint servers
+
+ ---
+
+ ## Reference Implementation
+
+ From `reference_repos/DeepCritical/DeepResearch/src/tools/openalex_tools.py`:
+
+ ```python
+ class OpenAlexFetchTool(ToolRunner):
+     def __init__(self):
+         super().__init__(
+             ToolSpec(
+                 name="openalex_fetch",
+                 description="Fetch OpenAlex work or author",
+                 inputs={"entity": "TEXT", "identifier": "TEXT"},
+                 outputs={"result": "JSON"},
+             )
+         )
+
+     def run(self, params: dict[str, Any]) -> ExecutionResult:
+         entity = params["entity"]  # "works", "authors", "venues"
+         identifier = params["identifier"]
+         base = "https://api.openalex.org"
+         url = f"{base}/{entity}/{identifier}"
+         resp = requests.get(url, timeout=30)
+         return ExecutionResult(success=True, data={"result": resp.json()})
+ ```
+
+ ---
+
+ ## OpenAlex API Features
+
+ ### Search Works (Papers)
+
+ ```python
+ # Search for metformin + cancer papers
+ url = "https://api.openalex.org/works"
+ params = {
+     "search": "metformin cancer drug repurposing",
+     "filter": "publication_year:>2020,type:article",
+     "sort": "cited_by_count:desc",
+     "per_page": 50,
+ }
+ ```
+
+ ### Rich Filtering
+
+ ```python
+ # Filter examples
+ "publication_year:2023"
+ "type:article"                           # vs preprint, book, etc.
+ "is_oa:true"                             # Open access only
+ "concepts.id:C71924100"                  # Papers about "Medicine"
+ "authorships.institutions.id:I27837315"  # From Harvard
+ "cited_by_count:>100"                    # Highly cited
+ "has_fulltext:true"                      # Full text available
+ ```
+
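+ Filters combine as a comma-separated AND list in a single `filter` parameter, e.g. (a small sketch):
+
+ ```python
+ params = {
+     "search": "drug repurposing",
+     # comma = AND: recent, open-access, highly cited articles only
+     "filter": "publication_year:>2020,is_oa:true,cited_by_count:>100",
+ }
+ ```
+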
+ ### What You Get Back
+
+ ```json
+ {
+   "id": "W2741809807",
+   "title": "Metformin: A candidate drug for...",
+   "publication_year": 2023,
+   "type": "article",
+   "cited_by_count": 45,
+   "is_oa": true,
+   "primary_location": {
+     "source": {"display_name": "Nature Medicine"},
+     "pdf_url": "https://...",
+     "landing_page_url": "https://..."
+   },
+   "concepts": [
+     {"id": "C71924100", "display_name": "Medicine", "score": 0.95},
+     {"id": "C54355233", "display_name": "Pharmacology", "score": 0.88}
+   ],
+   "authorships": [
+     {
+       "author": {"id": "A123", "display_name": "John Smith"},
+       "institutions": [{"display_name": "Harvard Medical School"}]
+     }
+   ],
+   "referenced_works": ["W123", "W456"],
+   "related_works": ["W789", "W012"]
+ }
+ ```
+
+ (`referenced_works` lists what this paper cites; `related_works` lists similar papers.)
+
+ ---
+
+ ## Key Advantages Over Current Tools
+
+ ### 1. Citation Network (We Don't Have This!)
+
+ ```python
+ # Get papers that cite a work
+ url = f"https://api.openalex.org/works?filter=cites:{work_id}"
+
+ # Get papers cited by a work
+ # Already in `referenced_works` field
+ ```
+
+ ### 2. Concept Tagging (We Don't Have This!)
+
+ OpenAlex auto-tags papers with hierarchical concepts:
+ - "Medicine" → "Pharmacology" → "Drug Repurposing"
+ - Can search by concept, not just keywords
+
+ ### 3. Author Disambiguation (We Don't Have This!)
+
+ ```python
+ # Find all works by an author
+ url = f"https://api.openalex.org/works?filter=authorships.author.id:{author_id}"
+ ```
+
+ ### 4. Institution Tracking
+
+ ```python
+ # Find drug repurposing papers from top institutions
+ url = "https://api.openalex.org/works"
+ params = {
+     "search": "drug repurposing",
+     "filter": "authorships.institutions.id:I27837315",  # Harvard
+ }
+ ```
+
+ ### 5. Related Works
+
+ Each paper comes with `related_works` - semantically similar papers discovered by OpenAlex's ML; see the sketch below.
+
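+ A sketch of expanding those IDs into full records (the `related_works` values may be bare IDs or full OpenAlex URLs, so both are handled):
+
+ ```python
+ import httpx
+
+ def fetch_related(work: dict, limit: int = 5) -> list[dict]:
+     """Fetch full records for a work's related_works entries."""
+     related = []
+     for ref in work.get("related_works", [])[:limit]:
+         work_id = ref.rsplit("/", 1)[-1]  # "W789" or "https://openalex.org/W789"
+         resp = httpx.get(f"https://api.openalex.org/works/{work_id}", timeout=30)
+         resp.raise_for_status()
+         related.append(resp.json())
+     return related
+ ```
+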
+ ---
+
+ ## Proposed Implementation
+
+ ### New Tool: `src/tools/openalex.py`
+
+ ```python
+ """OpenAlex search tool for comprehensive scholarly data."""
+
+ import httpx
+ from src.tools.base import SearchTool
+ from src.utils.models import Evidence
+
+ class OpenAlexTool(SearchTool):
+     """Search OpenAlex for scholarly works with rich metadata."""
+
+     name = "openalex"
+
+     async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
+         async with httpx.AsyncClient() as client:
+             resp = await client.get(
+                 "https://api.openalex.org/works",
+                 params={
+                     "search": query,
+                     "filter": "type:article,is_oa:true",
+                     "sort": "cited_by_count:desc",
+                     "per_page": max_results,
+                     "mailto": "deepcritical@example.com",  # Polite pool
+                 },
+             )
+             data = resp.json()
+
+         return [
+             Evidence(
+                 source="openalex",
+                 title=work["title"],
+                 # NOTE: live responses expose "abstract_inverted_index", not
+                 # "abstract" - reconstruction is handled in the Phase 15 plan
+                 abstract=work.get("abstract", ""),
+                 url=work["primary_location"]["landing_page_url"],
+                 metadata={
+                     "cited_by_count": work["cited_by_count"],
+                     "concepts": [c["display_name"] for c in work["concepts"][:5]],
+                     "is_open_access": work["is_oa"],
+                     "pdf_url": work["primary_location"].get("pdf_url"),
+                 },
+             )
+             for work in data["results"]
+         ]
+ ```
+
+ ---
+
+ ## Rate Limits
+
+ OpenAlex is **extremely generous**:
+
+ - No hard rate limit documented
+ - Recommended: <100,000 requests/day
+ - **Polite pool**: Add `mailto=your@email.com` param for faster responses
+ - No API key required (optional for priority support)
+
+ ---
+
+ ## Should We Add OpenAlex?
+
+ ### Arguments FOR
+
+ 1. **Already in reference repo** - proven pattern
+ 2. **Richer data** - citations, concepts, authors
+ 3. **Single source** - reduces API complexity
+ 4. **Free & open** - no keys, no limits
+ 5. **Institution adoption** - Leiden, Sorbonne switched to it
+
+ ### Arguments AGAINST
+
+ 1. **Adds complexity** - another data source
+ 2. **Overlap** - duplicates some PubMed data
+ 3. **Not biomedical-focused** - covers all disciplines
+ 4. **No full text** - still need PMC/Europe PMC for that
+
+ ### Recommendation
+
+ **Add OpenAlex as a 4th source**, don't replace existing tools.
+
+ Use it for:
+ - Citation network analysis
+ - Concept-based discovery
+ - High-impact paper finding
+ - Author/institution tracking
+
+ Keep PubMed, ClinicalTrials, Europe PMC for:
+ - Authoritative biomedical search
+ - Clinical trial data
+ - Full-text access
+ - Preprint tracking
+
+ ---
+
+ ## Implementation Priority
+
+ | Task | Effort | Value |
+ |------|--------|-------|
+ | Basic search | Low | High |
+ | Citation network | Medium | Very High |
+ | Concept filtering | Low | High |
+ | Related works | Low | High |
+ | Author tracking | Medium | Medium |
+
+ ---
+
+ ## Sources
+
+ - [OpenAlex Documentation](https://docs.openalex.org)
+ - [OpenAlex API Overview](https://docs.openalex.org/api)
+ - [OpenAlex Wikipedia](https://en.wikipedia.org/wiki/OpenAlex)
+ - [Leiden University Announcement](https://www.leidenranking.com/information/openalex)
+ - [OpenAlex: A fully-open index (Paper)](https://arxiv.org/abs/2205.01833)
docs/brainstorming/implementation/15_PHASE_OPENALEX.md ADDED
@@ -0,0 +1,603 @@
1
+ # Phase 15: OpenAlex Integration
2
+
3
+ **Priority**: HIGH - Biggest bang for buck
4
+ **Effort**: ~2-3 hours
5
+ **Dependencies**: None (existing codebase patterns sufficient)
6
+
7
+ ---
8
+
9
+ ## Prerequisites (COMPLETED)
10
+
11
+ The following model changes have been implemented to support this integration:
12
+
13
+ 1. **`SourceName` Literal Updated** (`src/utils/models.py:9`)
14
+ ```python
15
+ SourceName = Literal["pubmed", "clinicaltrials", "europepmc", "preprint", "openalex"]
16
+ ```
17
+ - Without this, `source="openalex"` would fail Pydantic validation
18
+
19
+ 2. **`Evidence.metadata` Field Added** (`src/utils/models.py:39-42`)
20
+ ```python
21
+ metadata: dict[str, Any] = Field(
22
+ default_factory=dict,
23
+ description="Additional metadata (e.g., cited_by_count, concepts, is_open_access)",
24
+ )
25
+ ```
26
+ - Required for storing `cited_by_count`, `concepts`, etc.
27
+ - Model is still frozen - metadata must be passed at construction time
28
+
29
+ 3. **`__init__.py` Exports Updated** (`src/tools/__init__.py`)
30
+ - All tools are now exported: `ClinicalTrialsTool`, `EuropePMCTool`, `PubMedTool`
31
+ - OpenAlexTool should be added here after implementation
32
+
33
+ ---
34
+
35
+ ## Overview
36
+
37
+ Add OpenAlex as a 4th data source for comprehensive scholarly data including:
38
+ - Citation networks (who cites whom)
39
+ - Concept tagging (hierarchical topic classification)
40
+ - Author disambiguation
41
+ - 209M+ works indexed
42
+
43
+ **Why OpenAlex?**
44
+ - Free, no API key required
45
+ - Already implemented in reference repo
46
+ - Provides citation data we don't have
47
+ - Aggregates PubMed + preprints + more
48
+
49
+ ---
50
+
51
+ ## TDD Implementation Plan
52
+
53
+ ### Step 1: Write the Tests First
54
+
55
+ **File**: `tests/unit/tools/test_openalex.py`
56
+
57
+ ```python
58
+ """Tests for OpenAlex search tool."""
59
+
60
+ import pytest
61
+ import respx
62
+ from httpx import Response
63
+
64
+ from src.tools.openalex import OpenAlexTool
65
+ from src.utils.models import Evidence
66
+
67
+
68
+ class TestOpenAlexTool:
69
+ """Test suite for OpenAlex search functionality."""
70
+
71
+ @pytest.fixture
72
+ def tool(self) -> OpenAlexTool:
73
+ return OpenAlexTool()
74
+
75
+ def test_name_property(self, tool: OpenAlexTool) -> None:
76
+ """Tool should identify itself as 'openalex'."""
77
+ assert tool.name == "openalex"
78
+
79
+ @respx.mock
80
+ @pytest.mark.asyncio
81
+ async def test_search_returns_evidence(self, tool: OpenAlexTool) -> None:
82
+ """Search should return list of Evidence objects."""
83
+ mock_response = {
84
+ "results": [
85
+ {
86
+ "id": "W2741809807",
87
+ "title": "Metformin and cancer: A systematic review",
88
+ "publication_year": 2023,
89
+ "cited_by_count": 45,
90
+ "type": "article",
91
+ "is_oa": True,
92
+ "primary_location": {
93
+ "source": {"display_name": "Nature Medicine"},
94
+ "landing_page_url": "https://doi.org/10.1038/example",
95
+ "pdf_url": None,
96
+ },
97
+ "abstract_inverted_index": {
98
+ "Metformin": [0],
99
+ "shows": [1],
100
+ "anticancer": [2],
101
+ "effects": [3],
102
+ },
103
+ "concepts": [
104
+ {"display_name": "Medicine", "score": 0.95},
105
+ {"display_name": "Oncology", "score": 0.88},
106
+ ],
107
+ "authorships": [
108
+ {
109
+ "author": {"display_name": "John Smith"},
110
+ "institutions": [{"display_name": "Harvard"}],
111
+ }
112
+ ],
113
+ }
114
+ ]
115
+ }
116
+
117
+ respx.get("https://api.openalex.org/works").mock(
118
+ return_value=Response(200, json=mock_response)
119
+ )
120
+
121
+ results = await tool.search("metformin cancer", max_results=10)
122
+
123
+ assert len(results) == 1
124
+ assert isinstance(results[0], Evidence)
125
+ assert "Metformin and cancer" in results[0].citation.title
126
+ assert results[0].citation.source == "openalex"
127
+
128
+ @respx.mock
129
+ @pytest.mark.asyncio
130
+ async def test_search_empty_results(self, tool: OpenAlexTool) -> None:
131
+ """Search with no results should return empty list."""
132
+ respx.get("https://api.openalex.org/works").mock(
133
+ return_value=Response(200, json={"results": []})
134
+ )
135
+
136
+ results = await tool.search("xyznonexistentquery123")
137
+ assert results == []
138
+
139
+ @respx.mock
140
+ @pytest.mark.asyncio
141
+ async def test_search_handles_missing_abstract(self, tool: OpenAlexTool) -> None:
142
+ """Tool should handle papers without abstracts."""
143
+ mock_response = {
144
+ "results": [
145
+ {
146
+ "id": "W123",
147
+ "title": "Paper without abstract",
148
+ "publication_year": 2023,
149
+ "cited_by_count": 10,
150
+ "type": "article",
151
+ "is_oa": False,
152
+ "primary_location": {
153
+ "source": {"display_name": "Journal"},
154
+ "landing_page_url": "https://example.com",
155
+ },
156
+ "abstract_inverted_index": None,
157
+ "concepts": [],
158
+ "authorships": [],
159
+ }
160
+ ]
161
+ }
162
+
163
+ respx.get("https://api.openalex.org/works").mock(
164
+ return_value=Response(200, json=mock_response)
165
+ )
166
+
167
+ results = await tool.search("test query")
168
+ assert len(results) == 1
169
+ assert results[0].content == "" # No abstract
170
+
171
+ @respx.mock
172
+ @pytest.mark.asyncio
173
+ async def test_search_extracts_citation_count(self, tool: OpenAlexTool) -> None:
174
+ """Citation count should be in metadata."""
175
+ mock_response = {
176
+ "results": [
177
+ {
178
+ "id": "W456",
179
+ "title": "Highly cited paper",
180
+ "publication_year": 2020,
181
+ "cited_by_count": 500,
182
+ "type": "article",
183
+ "is_oa": True,
184
+ "primary_location": {
185
+ "source": {"display_name": "Science"},
186
+ "landing_page_url": "https://example.com",
187
+ },
188
+ "abstract_inverted_index": {"Test": [0]},
189
+ "concepts": [],
190
+ "authorships": [],
191
+ }
192
+ ]
193
+ }
194
+
195
+ respx.get("https://api.openalex.org/works").mock(
196
+ return_value=Response(200, json=mock_response)
197
+ )
198
+
199
+ results = await tool.search("highly cited")
200
+ assert results[0].metadata["cited_by_count"] == 500
201
+
202
+ @respx.mock
203
+ @pytest.mark.asyncio
204
+ async def test_search_extracts_concepts(self, tool: OpenAlexTool) -> None:
205
+ """Concepts should be extracted for semantic discovery."""
206
+ mock_response = {
207
+ "results": [
208
+ {
209
+ "id": "W789",
210
+ "title": "Drug repurposing study",
211
+ "publication_year": 2023,
212
+ "cited_by_count": 25,
213
+ "type": "article",
214
+ "is_oa": True,
215
+ "primary_location": {
216
+ "source": {"display_name": "PLOS ONE"},
217
+ "landing_page_url": "https://example.com",
218
+ },
219
+ "abstract_inverted_index": {"Drug": [0], "repurposing": [1]},
220
+ "concepts": [
221
+ {"display_name": "Pharmacology", "score": 0.92},
222
+ {"display_name": "Drug Discovery", "score": 0.85},
223
+ {"display_name": "Medicine", "score": 0.80},
224
+ ],
225
+ "authorships": [],
226
+ }
227
+ ]
228
+ }
229
+
230
+ respx.get("https://api.openalex.org/works").mock(
231
+ return_value=Response(200, json=mock_response)
232
+ )
233
+
234
+ results = await tool.search("drug repurposing")
235
+ assert "Pharmacology" in results[0].metadata["concepts"]
236
+ assert "Drug Discovery" in results[0].metadata["concepts"]
237
+
238
+ @respx.mock
239
+ @pytest.mark.asyncio
240
+ async def test_search_api_error_raises_search_error(
241
+ self, tool: OpenAlexTool
242
+ ) -> None:
243
+ """API errors should raise SearchError."""
244
+ from src.utils.exceptions import SearchError
245
+
246
+ respx.get("https://api.openalex.org/works").mock(
247
+ return_value=Response(500, text="Internal Server Error")
248
+ )
249
+
250
+ with pytest.raises(SearchError):
251
+ await tool.search("test query")
252
+
253
+ def test_reconstruct_abstract(self, tool: OpenAlexTool) -> None:
254
+ """Test abstract reconstruction from inverted index."""
255
+ inverted_index = {
256
+ "Metformin": [0, 5],
257
+ "is": [1],
258
+ "a": [2],
259
+ "diabetes": [3],
260
+ "drug": [4],
261
+ "effective": [6],
262
+ }
263
+ abstract = tool._reconstruct_abstract(inverted_index)
264
+ assert abstract == "Metformin is a diabetes drug Metformin effective"
265
+ ```
266
+
267
+ ---
268
+
269
+ ### Step 2: Create the Implementation
270
+
271
+ **File**: `src/tools/openalex.py`
272
+
273
+ ```python
274
+ """OpenAlex search tool for comprehensive scholarly data."""
275
+
276
+ from typing import Any
277
+
278
+ import httpx
279
+ from tenacity import retry, stop_after_attempt, wait_exponential
280
+
281
+ from src.utils.exceptions import SearchError
282
+ from src.utils.models import Citation, Evidence
283
+
284
+
285
+ class OpenAlexTool:
286
+ """
287
+ Search OpenAlex for scholarly works with rich metadata.
288
+
289
+ OpenAlex provides:
290
+ - 209M+ scholarly works
291
+ - Citation counts and networks
292
+ - Concept tagging (hierarchical)
293
+ - Author disambiguation
294
+ - Open access links
295
+
296
+ API Docs: https://docs.openalex.org/
297
+ """
298
+
299
+ BASE_URL = "https://api.openalex.org/works"
300
+
301
+ def __init__(self, email: str | None = None) -> None:
302
+ """
303
+ Initialize OpenAlex tool.
304
+
305
+ Args:
306
+ email: Optional email for polite pool (faster responses)
307
+ """
308
+ self.email = email or "deepcritical@example.com"
309
+
310
+ @property
311
+ def name(self) -> str:
312
+ return "openalex"
313
+
314
+ @retry(
315
+ stop=stop_after_attempt(3),
316
+ wait=wait_exponential(multiplier=1, min=1, max=10),
317
+ reraise=True,
318
+ )
319
+ async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
320
+ """
321
+ Search OpenAlex for scholarly works.
322
+
323
+ Args:
324
+ query: Search terms
325
+ max_results: Maximum results to return (max 200 per request)
326
+
327
+ Returns:
328
+ List of Evidence objects with citation metadata
329
+
330
+ Raises:
331
+ SearchError: If API request fails
332
+ """
333
+ params = {
334
+ "search": query,
335
+ "filter": "type:article", # Only peer-reviewed articles
336
+ "sort": "cited_by_count:desc", # Most cited first
337
+ "per_page": min(max_results, 200),
338
+ "mailto": self.email, # Polite pool for faster responses
339
+ }
340
+
341
+ async with httpx.AsyncClient(timeout=30.0) as client:
342
+ try:
343
+ response = await client.get(self.BASE_URL, params=params)
344
+ response.raise_for_status()
345
+
346
+ data = response.json()
347
+ results = data.get("results", [])
348
+
349
+ return [self._to_evidence(work) for work in results[:max_results]]
350
+
351
+ except httpx.HTTPStatusError as e:
352
+ raise SearchError(f"OpenAlex API error: {e}") from e
353
+ except httpx.RequestError as e:
354
+ raise SearchError(f"OpenAlex connection failed: {e}") from e
355
+
356
+ def _to_evidence(self, work: dict[str, Any]) -> Evidence:
357
+ """Convert OpenAlex work to Evidence object."""
358
+ title = work.get("title", "Untitled")
359
+ pub_year = work.get("publication_year", "Unknown")
360
+ cited_by = work.get("cited_by_count", 0)
361
+ is_oa = work.get("is_oa", False)
362
+
363
+ # Reconstruct abstract from inverted index
364
+ abstract_index = work.get("abstract_inverted_index")
365
+ abstract = self._reconstruct_abstract(abstract_index) if abstract_index else ""
366
+
367
+ # Extract concepts (top 5)
368
+ concepts = [
369
+ c.get("display_name", "")
370
+ for c in work.get("concepts", [])[:5]
371
+ if c.get("display_name")
372
+ ]
373
+
374
+ # Extract authors (top 5)
375
+ authorships = work.get("authorships", [])
376
+ authors = [
377
+ a.get("author", {}).get("display_name", "")
378
+ for a in authorships[:5]
379
+ if a.get("author", {}).get("display_name")
380
+ ]
381
+
382
+ # Get URL
383
+ primary_loc = work.get("primary_location") or {}
384
+ url = primary_loc.get("landing_page_url", "")
385
+ if not url:
386
+ # Fallback to OpenAlex page
387
+ work_id = work.get("id", "").replace("https://openalex.org/", "")
388
+ url = f"https://openalex.org/{work_id}"
389
+
390
+ return Evidence(
391
+ content=abstract[:2000],
392
+ citation=Citation(
393
+ source="openalex",
394
+ title=title[:500],
395
+ url=url,
396
+ date=str(pub_year),
397
+ authors=authors,
398
+ ),
399
+ relevance=min(0.9, 0.5 + (cited_by / 1000)), # Boost by citations
400
+ metadata={
401
+ "cited_by_count": cited_by,
402
+ "is_open_access": is_oa,
403
+ "concepts": concepts,
404
+ "pdf_url": primary_loc.get("pdf_url"),
405
+ },
406
+ )
407
+
408
+ def _reconstruct_abstract(
409
+ self, inverted_index: dict[str, list[int]]
410
+ ) -> str:
411
+ """
412
+ Reconstruct abstract from OpenAlex inverted index format.
413
+
414
+ OpenAlex stores abstracts as {"word": [position1, position2, ...]}.
415
+ This rebuilds the original text.
416
+ """
417
+ if not inverted_index:
418
+ return ""
419
+
420
+ # Build position -> word mapping
421
+ position_word: dict[int, str] = {}
422
+ for word, positions in inverted_index.items():
423
+ for pos in positions:
424
+ position_word[pos] = word
425
+
426
+ # Reconstruct in order
427
+ if not position_word:
428
+ return ""
429
+
430
+ max_pos = max(position_word.keys())
431
+ words = [position_word.get(i, "") for i in range(max_pos + 1)]
432
+ return " ".join(w for w in words if w)
433
+ ```
434
+
435
+ ---
436
+
437
+ ### Step 3: Register in Search Handler
438
+
439
+ **File**: `src/tools/search_handler.py` (add to imports and tool list)
440
+
441
+ ```python
442
+ # Add import
443
+ from src.tools.openalex import OpenAlexTool
444
+
445
+ # Add to _create_tools method
446
+ def _create_tools(self) -> list[SearchTool]:
447
+ return [
448
+ PubMedTool(),
449
+ ClinicalTrialsTool(),
450
+ EuropePMCTool(),
451
+ OpenAlexTool(), # NEW
452
+ ]
453
+ ```
454
+
455
+ ---
456
+
457
+ ### Step 4: Update `__init__.py`
458
+
459
+ **File**: `src/tools/__init__.py`
460
+
461
+ ```python
462
+ from src.tools.openalex import OpenAlexTool
463
+
464
+ __all__ = [
465
+ "PubMedTool",
466
+ "ClinicalTrialsTool",
467
+ "EuropePMCTool",
468
+ "OpenAlexTool", # NEW
469
+ # ...
470
+ ]
471
+ ```
472
+
473
+ ---
474
+
475
+ ## Demo Script
476
+
477
+ **File**: `examples/openalex_demo.py`
478
+
479
+ ```python
480
+ #!/usr/bin/env python3
481
+ """Demo script to verify OpenAlex integration."""
482
+
483
+ import asyncio
484
+ from src.tools.openalex import OpenAlexTool
485
+
486
+
487
+ async def main():
488
+ """Run OpenAlex search demo."""
489
+ tool = OpenAlexTool()
490
+
491
+ print("=" * 60)
492
+ print("OpenAlex Integration Demo")
493
+ print("=" * 60)
494
+
495
+ # Test 1: Basic drug repurposing search
496
+ print("\n[Test 1] Searching for 'metformin cancer drug repurposing'...")
497
+ results = await tool.search("metformin cancer drug repurposing", max_results=5)
498
+
499
+ for i, evidence in enumerate(results, 1):
500
+ print(f"\n--- Result {i} ---")
501
+ print(f"Title: {evidence.citation.title}")
502
+ print(f"Year: {evidence.citation.date}")
503
+ print(f"Citations: {evidence.metadata.get('cited_by_count', 'N/A')}")
504
+ print(f"Concepts: {', '.join(evidence.metadata.get('concepts', []))}")
505
+ print(f"Open Access: {evidence.metadata.get('is_open_access', False)}")
506
+ print(f"URL: {evidence.citation.url}")
507
+ if evidence.content:
508
+ print(f"Abstract: {evidence.content[:200]}...")
509
+
510
+ # Test 2: High-impact papers
511
+ print("\n" + "=" * 60)
512
+ print("[Test 2] Finding highly-cited papers on 'long COVID treatment'...")
513
+ results = await tool.search("long COVID treatment", max_results=3)
514
+
515
+ for evidence in results:
516
+ print(f"\n- {evidence.citation.title}")
517
+ print(f" Citations: {evidence.metadata.get('cited_by_count', 0)}")
518
+
519
+ print("\n" + "=" * 60)
520
+ print("Demo complete!")
521
+
522
+
523
+ if __name__ == "__main__":
524
+ asyncio.run(main())
525
+ ```
526
+
527
+ ---
528
+
529
+ ## Verification Checklist
530
+
531
+ ### Unit Tests
532
+ ```bash
533
+ # Run just OpenAlex tests
534
+ uv run pytest tests/unit/tools/test_openalex.py -v
535
+
536
+ # Expected: All tests pass
537
+ ```
538
+
539
+ ### Integration Test (Manual)
540
+ ```bash
541
+ # Run demo script with real API
542
+ uv run python examples/openalex_demo.py
543
+
544
+ # Expected: Real results from OpenAlex API
545
+ ```
546
+
547
+ ### Full Test Suite
548
+ ```bash
549
+ # Ensure nothing broke
550
+ make check
551
+
552
+ # Expected: All 110+ tests pass, mypy clean
553
+ ```
554
+
555
+ ---
556
+
557
+ ## Success Criteria
558
+
559
+ 1. **Unit tests pass**: All mocked tests in `test_openalex.py` pass
560
+ 2. **Integration works**: Demo script returns real results
561
+ 3. **No regressions**: `make check` passes completely
562
+ 4. **SearchHandler integration**: OpenAlex appears in search results alongside other sources
563
+ 5. **Citation metadata**: Results include `cited_by_count`, `concepts`, `is_open_access`
564
+
565
+ ---
566
+
567
+ ## Future Enhancements (P2)
568
+
569
+ Once basic integration works:
570
+
571
+ 1. **Citation Network Queries**
572
+ ```python
573
+ # Get papers citing a specific work
574
+ async def get_citing_works(self, work_id: str) -> list[Evidence]:
575
+ params = {"filter": f"cites:{work_id}"}
576
+ ...
577
+ ```
578
+
579
+ 2. **Concept-Based Search**
580
+ ```python
581
+ # Search by OpenAlex concept ID
582
+ async def search_by_concept(self, concept_id: str) -> list[Evidence]:
583
+ params = {"filter": f"concepts.id:{concept_id}"}
584
+ ...
585
+ ```
586
+
587
+ 3. **Author Tracking**
588
+ ```python
589
+ # Find all works by an author
590
+ async def search_by_author(self, author_id: str) -> list[Evidence]:
591
+ params = {"filter": f"authorships.author.id:{author_id}"}
592
+ ...
593
+ ```
594
+
595
+ ---
596
+
597
+ ## Notes
598
+
599
+ - OpenAlex is **very generous** with rate limits (no documented hard limit)
600
+ - Adding `mailto` parameter gives priority access (polite pool)
601
+ - Abstract is stored as inverted index - must reconstruct
602
+ - Citation count is a good proxy for paper quality/impact
603
+ - Consider caching responses for repeated queries
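+
+ A minimal sketch of the `mailto` note above, assuming the tool queries the `/works` endpoint with httpx (the email address is a placeholder):
+
+ ```python
+ params = {
+     "search": query,
+     "per-page": max_results,
+     "mailto": "contact@example.org",  # placeholder: opts into OpenAlex's polite pool
+ }
+ # async with httpx.AsyncClient(timeout=30.0) as client:
+ #     response = await client.get("https://api.openalex.org/works", params=params)
+ ```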
docs/brainstorming/implementation/16_PHASE_PUBMED_FULLTEXT.md ADDED
@@ -0,0 +1,586 @@
1
+ # Phase 16: PubMed Full-Text Retrieval
2
+
3
+ **Priority**: MEDIUM - Enhances evidence quality
4
+ **Effort**: ~3 hours
5
+ **Dependencies**: None (existing PubMed tool sufficient)
6
+
7
+ ---
8
+
9
+ ## Prerequisites (COMPLETED)
10
+
11
+ The `Evidence.metadata` field has been added to `src/utils/models.py` to support:
12
+ ```python
13
+ metadata={"has_fulltext": True}
14
+ ```
15
+
16
+ ---
17
+
18
+ ## Architecture Decision: Constructor Parameter vs Method Parameter
19
+
20
+ **IMPORTANT**: The original spec proposed `include_fulltext` as a method parameter:
21
+ ```python
22
+ # WRONG - SearchHandler won't pass this parameter
23
+ async def search(self, query: str, max_results: int = 10, include_fulltext: bool = False):
24
+ ```
25
+
26
+ **Problem**: `SearchHandler` calls `tool.search(query, max_results)` uniformly across all tools.
27
+ It has no mechanism to pass tool-specific parameters like `include_fulltext`.
28
+
29
+ **Solution**: Use constructor parameter instead:
30
+ ```python
31
+ # CORRECT - Configured at instantiation time
32
+ class PubMedTool:
33
+ def __init__(self, api_key: str | None = None, include_fulltext: bool = False):
34
+ self.include_fulltext = include_fulltext
35
+ ...
36
+ ```
37
+
38
+ This way, you can create a full-text-enabled PubMed tool:
39
+ ```python
40
+ # In orchestrator or wherever tools are created
41
+ tools = [
42
+ PubMedTool(include_fulltext=True), # Full-text enabled
43
+ ClinicalTrialsTool(),
44
+ EuropePMCTool(),
45
+ ]
46
+ ```
47
+
48
+ ---
49
+
50
+ ## Overview
51
+
52
+ Add full-text retrieval for PubMed papers via the BioC API, enabling:
53
+ - Complete paper text for open-access PMC papers
54
+ - Structured sections (intro, methods, results, discussion)
55
+ - Better evidence for LLM synthesis
56
+
57
+ **Why Full-Text?**
58
+ - Abstracts only give ~200-300 words
59
+ - Full text provides detailed methods, results, figures
60
+ - Reference repo already has this implemented
61
+ - Makes LLM judgments more accurate
62
+
63
+ ---
64
+
65
+ ## TDD Implementation Plan
66
+
67
+ ### Step 1: Write the Tests First
68
+
69
+ **File**: `tests/unit/tools/test_pubmed_fulltext.py`
70
+
71
+ ```python
72
+ """Tests for PubMed full-text retrieval."""
73
+
74
+ import pytest
75
+ import respx
76
+ from httpx import Response
77
+
78
+ from src.tools.pubmed import PubMedTool
79
+
80
+
81
+ class TestPubMedFullText:
82
+ """Test suite for PubMed full-text functionality."""
83
+
84
+ @pytest.fixture
85
+ def tool(self) -> PubMedTool:
86
+ return PubMedTool()
87
+
88
+ @respx.mock
89
+ @pytest.mark.asyncio
90
+ async def test_get_pmc_id_success(self, tool: PubMedTool) -> None:
91
+ """Should convert PMID to PMCID for full-text access."""
92
+ mock_response = {
93
+ "records": [
94
+ {
95
+ "pmid": "12345678",
96
+ "pmcid": "PMC1234567",
97
+ }
98
+ ]
99
+ }
100
+
101
+ respx.get("https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/").mock(
102
+ return_value=Response(200, json=mock_response)
103
+ )
104
+
105
+ pmcid = await tool.get_pmc_id("12345678")
106
+ assert pmcid == "PMC1234567"
107
+
108
+ @respx.mock
109
+ @pytest.mark.asyncio
110
+ async def test_get_pmc_id_not_in_pmc(self, tool: PubMedTool) -> None:
111
+ """Should return None if paper not in PMC."""
112
+ mock_response = {
113
+ "records": [
114
+ {
115
+ "pmid": "12345678",
116
+ # No pmcid means not in PMC
117
+ }
118
+ ]
119
+ }
120
+
121
+ respx.get("https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/").mock(
122
+ return_value=Response(200, json=mock_response)
123
+ )
124
+
125
+ pmcid = await tool.get_pmc_id("12345678")
126
+ assert pmcid is None
127
+
128
+ @respx.mock
129
+ @pytest.mark.asyncio
130
+ async def test_get_fulltext_success(self, tool: PubMedTool) -> None:
131
+ """Should retrieve full text for PMC papers."""
132
+ # Mock BioC API response
133
+ mock_bioc = {
134
+ "documents": [
135
+ {
136
+ "passages": [
137
+ {
138
+ "infons": {"section_type": "INTRO"},
139
+ "text": "Introduction text here.",
140
+ },
141
+ {
142
+ "infons": {"section_type": "METHODS"},
143
+ "text": "Methods description here.",
144
+ },
145
+ {
146
+ "infons": {"section_type": "RESULTS"},
147
+ "text": "Results summary here.",
148
+ },
149
+ {
150
+ "infons": {"section_type": "DISCUSS"},
151
+ "text": "Discussion and conclusions.",
152
+ },
153
+ ]
154
+ }
155
+ ]
156
+ }
157
+
158
+ respx.get(
159
+ "https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/12345678/unicode"
160
+ ).mock(return_value=Response(200, json=mock_bioc))
161
+
162
+ fulltext = await tool.get_fulltext("12345678")
163
+
164
+ assert fulltext is not None
165
+ assert "Introduction text here" in fulltext
166
+ assert "Methods description here" in fulltext
167
+ assert "Results summary here" in fulltext
168
+
169
+ @respx.mock
170
+ @pytest.mark.asyncio
171
+ async def test_get_fulltext_not_available(self, tool: PubMedTool) -> None:
172
+ """Should return None if full text not available."""
173
+ respx.get(
174
+ "https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/99999999/unicode"
175
+ ).mock(return_value=Response(404))
176
+
177
+ fulltext = await tool.get_fulltext("99999999")
178
+ assert fulltext is None
179
+
180
+ @respx.mock
181
+ @pytest.mark.asyncio
182
+ async def test_get_fulltext_structured(self, tool: PubMedTool) -> None:
183
+ """Should return structured sections dict."""
184
+ mock_bioc = {
185
+ "documents": [
186
+ {
187
+ "passages": [
188
+ {"infons": {"section_type": "INTRO"}, "text": "Intro..."},
189
+ {"infons": {"section_type": "METHODS"}, "text": "Methods..."},
190
+ {"infons": {"section_type": "RESULTS"}, "text": "Results..."},
191
+ {"infons": {"section_type": "DISCUSS"}, "text": "Discussion..."},
192
+ ]
193
+ }
194
+ ]
195
+ }
196
+
197
+ respx.get(
198
+ "https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/12345678/unicode"
199
+ ).mock(return_value=Response(200, json=mock_bioc))
200
+
201
+ sections = await tool.get_fulltext_structured("12345678")
202
+
203
+ assert sections is not None
204
+ assert "introduction" in sections
205
+ assert "methods" in sections
206
+ assert "results" in sections
207
+ assert "discussion" in sections
208
+
209
+ @respx.mock
210
+ @pytest.mark.asyncio
211
+ async def test_search_with_fulltext_enabled(self) -> None:
212
+ """Search should include full text when tool is configured for it."""
213
+ # Create tool WITH full-text enabled via constructor
214
+ tool = PubMedTool(include_fulltext=True)
215
+
216
+ # Mock esearch
217
+ respx.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi").mock(
218
+ return_value=Response(
219
+ 200, json={"esearchresult": {"idlist": ["12345678"]}}
220
+ )
221
+ )
222
+
223
+ # Mock efetch (abstract)
224
+ mock_xml = """
225
+ <PubmedArticleSet>
226
+ <PubmedArticle>
227
+ <MedlineCitation>
228
+ <PMID>12345678</PMID>
229
+ <Article>
230
+ <ArticleTitle>Test Paper</ArticleTitle>
231
+ <Abstract><AbstractText>Short abstract.</AbstractText></Abstract>
232
+ <AuthorList><Author><LastName>Smith</LastName></Author></AuthorList>
233
+ </Article>
234
+ </MedlineCitation>
235
+ </PubmedArticle>
236
+ </PubmedArticleSet>
237
+ """
238
+ respx.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi").mock(
239
+ return_value=Response(200, text=mock_xml)
240
+ )
241
+
242
+ # Mock ID converter
243
+ respx.get("https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/").mock(
244
+ return_value=Response(
245
+ 200, json={"records": [{"pmid": "12345678", "pmcid": "PMC1234567"}]}
246
+ )
247
+ )
248
+
249
+ # Mock BioC full text
250
+ mock_bioc = {
251
+ "documents": [
252
+ {
253
+ "passages": [
254
+ {"infons": {"section_type": "INTRO"}, "text": "Full intro..."},
255
+ ]
256
+ }
257
+ ]
258
+ }
259
+ respx.get(
260
+ "https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/12345678/unicode"
261
+ ).mock(return_value=Response(200, json=mock_bioc))
262
+
263
+ # NOTE: No include_fulltext param - it's set via constructor
264
+ results = await tool.search("test", max_results=1)
265
+
266
+ assert len(results) == 1
267
+ # Full text should be appended or replace abstract
268
+ assert "Full intro" in results[0].content or "Short abstract" in results[0].content
269
+ ```
270
+
271
+ ---
272
+
273
+ ### Step 2: Implement Full-Text Methods
274
+
275
+ **File**: `src/tools/pubmed.py` (additions to existing class)
276
+
277
+ ```python
278
+ # Add these methods to PubMedTool class
279
+
280
+ async def get_pmc_id(self, pmid: str) -> str | None:
281
+ """
282
+ Convert PMID to PMCID for full-text access.
283
+
284
+ Args:
285
+ pmid: PubMed ID
286
+
287
+ Returns:
288
+ PMCID if paper is in PMC, None otherwise
289
+ """
290
+ url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
291
+ params = {"ids": pmid, "format": "json"}
292
+
293
+ async with httpx.AsyncClient(timeout=30.0) as client:
294
+ try:
295
+ response = await client.get(url, params=params)
296
+ response.raise_for_status()
297
+ data = response.json()
298
+
299
+ records = data.get("records", [])
300
+ if records and records[0].get("pmcid"):
301
+ return records[0]["pmcid"]
302
+ return None
303
+
304
+ except httpx.HTTPError:
305
+ return None
306
+
307
+
308
+ async def get_fulltext(self, pmid: str) -> str | None:
309
+ """
310
+ Get full text for a PubMed paper via BioC API.
311
+
312
+ Only works for open-access papers in PubMed Central.
313
+
314
+ Args:
315
+ pmid: PubMed ID
316
+
317
+ Returns:
318
+ Full text as string, or None if not available
319
+ """
320
+ url = f"https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{pmid}/unicode"
321
+
322
+ async with httpx.AsyncClient(timeout=60.0) as client:
323
+ try:
324
+ response = await client.get(url)
325
+ if response.status_code == 404:
326
+ return None
327
+ response.raise_for_status()
328
+ data = response.json()
329
+
330
+ # Extract text from all passages
331
+ documents = data.get("documents", [])
332
+ if not documents:
333
+ return None
334
+
335
+ passages = documents[0].get("passages", [])
336
+ text_parts = [p.get("text", "") for p in passages if p.get("text")]
337
+
338
+ return "\n\n".join(text_parts) if text_parts else None
339
+
340
+ except httpx.HTTPError:
341
+ return None
342
+
343
+
344
+ async def get_fulltext_structured(self, pmid: str) -> dict[str, str] | None:
345
+ """
346
+ Get structured full text with sections.
347
+
348
+ Args:
349
+ pmid: PubMed ID
350
+
351
+ Returns:
352
+ Dict mapping section names to text, or None if not available
353
+ """
354
+ url = f"https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{pmid}/unicode"
355
+
356
+ async with httpx.AsyncClient(timeout=60.0) as client:
357
+ try:
358
+ response = await client.get(url)
359
+ if response.status_code == 404:
360
+ return None
361
+ response.raise_for_status()
362
+ data = response.json()
363
+
364
+ documents = data.get("documents", [])
365
+ if not documents:
366
+ return None
367
+
368
+ # Map section types to readable names
369
+ section_map = {
370
+ "INTRO": "introduction",
371
+ "METHODS": "methods",
372
+ "RESULTS": "results",
373
+ "DISCUSS": "discussion",
374
+ "CONCL": "conclusion",
375
+ "ABSTRACT": "abstract",
376
+ }
377
+
378
+ sections: dict[str, list[str]] = {}
379
+ for passage in documents[0].get("passages", []):
380
+ section_type = passage.get("infons", {}).get("section_type", "other")
381
+ section_name = section_map.get(section_type, "other")
382
+ text = passage.get("text", "")
383
+
384
+ if text:
385
+ if section_name not in sections:
386
+ sections[section_name] = []
387
+ sections[section_name].append(text)
388
+
389
+ # Join multiple passages per section
390
+ return {k: "\n\n".join(v) for k, v in sections.items()}
391
+
392
+ except httpx.HTTPError:
393
+ return None
394
+ ```
395
+
396
+ ---
397
+
398
+ ### Step 3: Update Constructor and Search Method
399
+
400
+ Add full-text flag to constructor and update search to use it:
401
+
402
+ ```python
403
+ class PubMedTool:
404
+ """Search tool for PubMed/NCBI."""
405
+
406
+ def __init__(
407
+ self,
408
+ api_key: str | None = None,
409
+ include_fulltext: bool = False, # NEW CONSTRUCTOR PARAM
410
+ ) -> None:
411
+ self.api_key = api_key or settings.ncbi_api_key
412
+ if self.api_key == "your-ncbi-key-here":
413
+ self.api_key = None
414
+ self._last_request_time = 0.0
415
+ self.include_fulltext = include_fulltext # Store for use in search()
416
+
417
+ async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
418
+ """
419
+ Search PubMed and return evidence.
420
+
421
+ Note: Full-text enrichment is controlled by constructor parameter,
422
+ not method parameter, because SearchHandler doesn't pass extra args.
423
+ """
424
+ # ... existing search logic ...
425
+
426
+ evidence_list = self._parse_pubmed_xml(fetch_resp.text)
427
+
428
+ # Optionally enrich with full text (if configured at construction)
429
+ if self.include_fulltext:
430
+ evidence_list = await self._enrich_with_fulltext(evidence_list)
431
+
432
+ return evidence_list
433
+
434
+
435
+ async def _enrich_with_fulltext(
436
+ self, evidence_list: list[Evidence]
437
+ ) -> list[Evidence]:
438
+ """Attempt to add full text to evidence items."""
439
+ enriched = []
440
+
441
+ for evidence in evidence_list:
442
+ # Extract PMID from URL
443
+ url = evidence.citation.url
444
+ pmid = url.rstrip("/").split("/")[-1] if url else None
445
+
446
+ if pmid:
447
+ fulltext = await self.get_fulltext(pmid)
448
+ if fulltext:
449
+ # Replace abstract with full text (truncated)
450
+ evidence = Evidence(
451
+ content=fulltext[:8000], # Larger limit for full text
452
+ citation=evidence.citation,
453
+ relevance=evidence.relevance,
454
+ metadata={
455
+ **evidence.metadata,
456
+ "has_fulltext": True,
457
+ },
458
+ )
459
+
460
+ enriched.append(evidence)
461
+
462
+ return enriched
463
+ ```
464
+
465
+ ---
466
+
467
+ ## Demo Script
468
+
469
+ **File**: `examples/pubmed_fulltext_demo.py`
470
+
471
+ ```python
472
+ #!/usr/bin/env python3
473
+ """Demo script to verify PubMed full-text retrieval."""
474
+
475
+ import asyncio
476
+ from src.tools.pubmed import PubMedTool
477
+
478
+
479
+ async def main():
480
+ """Run PubMed full-text demo."""
481
+ tool = PubMedTool()
482
+
483
+ print("=" * 60)
484
+ print("PubMed Full-Text Demo")
485
+ print("=" * 60)
486
+
487
+ # Test 1: Convert PMID to PMCID
488
+ print("\n[Test 1] Converting PMID to PMCID...")
489
+ # Use a known open-access paper
490
+ test_pmid = "34450029" # Example: COVID-related open-access paper
491
+ pmcid = await tool.get_pmc_id(test_pmid)
492
+ print(f"PMID {test_pmid} -> PMCID: {pmcid or 'Not in PMC'}")
493
+
494
+ # Test 2: Get full text
495
+ print("\n[Test 2] Fetching full text...")
496
+ if pmcid:
497
+ fulltext = await tool.get_fulltext(test_pmid)
498
+ if fulltext:
499
+ print(f"Full text length: {len(fulltext)} characters")
500
+ print(f"Preview: {fulltext[:500]}...")
501
+ else:
502
+ print("Full text not available")
503
+
504
+ # Test 3: Get structured sections
505
+ print("\n[Test 3] Fetching structured sections...")
506
+ if pmcid:
507
+ sections = await tool.get_fulltext_structured(test_pmid)
508
+ if sections:
509
+ print("Available sections:")
510
+ for section, text in sections.items():
511
+ print(f" - {section}: {len(text)} chars")
512
+ else:
513
+ print("Structured text not available")
514
+
515
+ # Test 4: Search with full-text enrichment
+ # Per the architecture decision above, include_fulltext is a constructor
+ # parameter, not a search() argument.
+ print("\n[Test 4] Search with full-text enrichment...")
+ ft_tool = PubMedTool(include_fulltext=True)
+ results = await ft_tool.search(
+ "metformin cancer open access",
+ max_results=3,
+ )
522
+
523
+ for i, evidence in enumerate(results, 1):
524
+ has_ft = evidence.metadata.get("has_fulltext", False)
525
+ print(f"\n--- Result {i} ---")
526
+ print(f"Title: {evidence.citation.title}")
527
+ print(f"Has Full Text: {has_ft}")
528
+ print(f"Content Length: {len(evidence.content)} chars")
529
+
530
+ print("\n" + "=" * 60)
531
+ print("Demo complete!")
532
+
533
+
534
+ if __name__ == "__main__":
535
+ asyncio.run(main())
536
+ ```
537
+
538
+ ---
539
+
540
+ ## Verification Checklist
541
+
542
+ ### Unit Tests
543
+ ```bash
544
+ # Run full-text tests
545
+ uv run pytest tests/unit/tools/test_pubmed_fulltext.py -v
546
+
547
+ # Run all PubMed tests
548
+ uv run pytest tests/unit/tools/test_pubmed.py -v
549
+
550
+ # Expected: All tests pass
551
+ ```
552
+
553
+ ### Integration Test (Manual)
554
+ ```bash
555
+ # Run demo with real API
556
+ uv run python examples/pubmed_fulltext_demo.py
557
+
558
+ # Expected: Real full text from PMC papers
559
+ ```
560
+
561
+ ### Full Test Suite
562
+ ```bash
563
+ make check
564
+ # Expected: All tests pass, mypy clean
565
+ ```
566
+
567
+ ---
568
+
569
+ ## Success Criteria
570
+
571
+ 1. **ID Conversion works**: PMID -> PMCID conversion successful
572
+ 2. **Full text retrieval works**: BioC API returns paper text
573
+ 3. **Structured sections work**: Can get intro/methods/results/discussion separately
574
+ 4. **Search integration works**: `PubMedTool(include_fulltext=True)` enriches results
575
+ 5. **No regressions**: Existing tests still pass
576
+ 6. **Graceful degradation**: Non-PMC papers still return abstracts
577
+
578
+ ---
579
+
580
+ ## Notes
581
+
582
+ - Only ~30% of PubMed papers have full text in PMC
583
+ - BioC API has no documented rate limit, but be respectful
584
+ - Full text can be very long - truncate appropriately
585
+ - Consider caching full text responses (they don't change)
586
+ - Timeout should be longer for full text (60s vs 30s)
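+
+ A minimal caching sketch for the note above, assuming an in-process dict keyed by PMID (a production version might want an LRU with a size bound):
+
+ ```python
+ class FullTextCache:
+     """Tiny in-memory cache - published PMC full text is effectively immutable."""
+
+     def __init__(self) -> None:
+         self._cache: dict[str, str | None] = {}
+
+     async def get_or_fetch(self, tool: "PubMedTool", pmid: str) -> str | None:
+         if pmid not in self._cache:
+             self._cache[pmid] = await tool.get_fulltext(pmid)
+         return self._cache[pmid]
+ ```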
docs/brainstorming/implementation/17_PHASE_RATE_LIMITING.md ADDED
@@ -0,0 +1,540 @@
1
+ # Phase 17: Rate Limiting with `limits` Library
2
+
3
+ **Priority**: P0 CRITICAL - Prevents API blocks
4
+ **Effort**: ~1 hour
5
+ **Dependencies**: None
6
+
7
+ ---
8
+
9
+ ## CRITICAL: Async Safety Requirements
10
+
11
+ **WARNING**: The rate limiter MUST be async-safe. Blocking the event loop will freeze:
12
+ - The Gradio UI
13
+ - All parallel searches
14
+ - The orchestrator
15
+
16
+ **Rules**:
17
+ 1. **NEVER use `time.sleep()`** - Always use `await asyncio.sleep()`
18
+ 2. **NEVER use blocking while loops** - Use async-aware polling
19
+ 3. **The `limits` library check is synchronous** - Wrap it carefully
20
+
21
+ The implementation below uses a polling pattern that:
22
+ - Checks the limit (synchronous, fast)
23
+ - Sleeps via `await asyncio.sleep()` when the limit is exceeded (non-blocking)
24
+ - Retries the check
25
+
26
+ **Alternative**: If `limits` proves problematic, use `aiolimiter` which is pure-async.
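+
+ For reference, a sketch of that alternative, assuming `aiolimiter` is installed (`AsyncLimiter(3, 1)` allows 3 acquisitions per 1-second window):
+
+ ```python
+ from aiolimiter import AsyncLimiter
+
+ # 3 requests/second, enforced without ever blocking the event loop
+ ncbi_limiter = AsyncLimiter(3, 1)
+
+ async def fetch(url: str) -> None:
+     async with ncbi_limiter:  # suspends via asyncio, never time.sleep()
+         ...  # perform the HTTP call here
+ ```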
27
+
28
+ ---
29
+
30
+ ## Overview
31
+
32
+ Replace naive `asyncio.sleep` rate limiting with a proper rate limiter built on the `limits` library, which provides:
33
+ - Moving window rate limiting
34
+ - Per-API configurable limits
35
+ - Thread-safe storage
36
+ - Already used in reference repo
37
+
38
+ **Why This Matters**
39
+ - NCBI will block us without proper rate limiting (3/sec without key, 10/sec with)
40
+ - Current implementation only has simple sleep delay
41
+ - Need coordinated limits across all PubMed calls
42
+ - Professional-grade rate limiting prevents production issues
43
+
44
+ ---
45
+
46
+ ## Current State
47
+
48
+ ### What We Have (`src/tools/pubmed.py:20-21, 34-41`)
49
+
50
+ ```python
51
+ RATE_LIMIT_DELAY = 0.34 # ~3 requests/sec without API key
52
+
53
+ async def _rate_limit(self) -> None:
54
+ """Enforce NCBI rate limiting."""
55
+ loop = asyncio.get_running_loop()
56
+ now = loop.time()
57
+ elapsed = now - self._last_request_time
58
+ if elapsed < self.RATE_LIMIT_DELAY:
59
+ await asyncio.sleep(self.RATE_LIMIT_DELAY - elapsed)
60
+ self._last_request_time = loop.time()
61
+ ```
62
+
63
+ ### Problems
64
+
65
+ 1. **Not shared across instances**: Each `PubMedTool()` has its own counter
66
+ 2. **Simple delay vs moving window**: Doesn't handle bursts properly
67
+ 3. **Hardcoded rate**: Doesn't adapt to API key presence
68
+ 4. **No backoff on 429**: Just retries blindly
69
+
70
+ ---
71
+
72
+ ## TDD Implementation Plan
73
+
74
+ ### Step 1: Add Dependency
75
+
76
+ **File**: `pyproject.toml`
77
+
78
+ ```toml
79
+ dependencies = [
80
+ # ... existing deps ...
81
+ "limits>=3.0",
82
+ ]
83
+ ```
84
+
85
+ Then run:
86
+ ```bash
87
+ uv sync
88
+ ```
89
+
90
+ ---
91
+
92
+ ### Step 2: Write the Tests First
93
+
94
+ **File**: `tests/unit/tools/test_rate_limiting.py`
95
+
96
+ ```python
97
+ """Tests for rate limiting functionality."""
98
+
99
+ import asyncio
100
+ import time
101
+
102
+ import pytest
103
+
104
+ from src.tools.rate_limiter import RateLimiter, get_pubmed_limiter, reset_pubmed_limiter
105
+
106
+
107
+ class TestRateLimiter:
108
+ """Test suite for rate limiter."""
109
+
110
+ def test_create_limiter_without_api_key(self) -> None:
111
+ """Should create 3/sec limiter without API key."""
112
+ limiter = RateLimiter(rate="3/second")
113
+ assert limiter.rate == "3/second"
114
+
115
+ def test_create_limiter_with_api_key(self) -> None:
116
+ """Should create 10/sec limiter with API key."""
117
+ limiter = RateLimiter(rate="10/second")
118
+ assert limiter.rate == "10/second"
119
+
120
+ @pytest.mark.asyncio
121
+ async def test_limiter_allows_requests_under_limit(self) -> None:
122
+ """Should allow requests under the rate limit."""
123
+ limiter = RateLimiter(rate="10/second")
124
+
125
+ # 3 requests should all succeed immediately
126
+ for _ in range(3):
127
+ allowed = await limiter.acquire()
128
+ assert allowed is True
129
+
130
+ @pytest.mark.asyncio
131
+ async def test_limiter_blocks_when_exceeded(self) -> None:
132
+ """Should wait when rate limit exceeded."""
133
+ limiter = RateLimiter(rate="2/second")
134
+
135
+ # First 2 should be instant
136
+ await limiter.acquire()
137
+ await limiter.acquire()
138
+
139
+ # Third should block briefly
140
+ start = time.monotonic()
141
+ await limiter.acquire()
142
+ elapsed = time.monotonic() - start
143
+
144
+ # With a 2/second moving window, the third call waits until the first hit ages out (up to ~1s); assert conservatively
145
+ assert elapsed >= 0.3
146
+
147
+ @pytest.mark.asyncio
148
+ async def test_limiter_resets_after_window(self) -> None:
149
+ """Rate limit should reset after time window."""
150
+ limiter = RateLimiter(rate="5/second")
151
+
152
+ # Use up the limit
153
+ for _ in range(5):
154
+ await limiter.acquire()
155
+
156
+ # Wait for window to pass
157
+ await asyncio.sleep(1.1)
158
+
159
+ # Should be allowed again
160
+ start = time.monotonic()
161
+ await limiter.acquire()
162
+ elapsed = time.monotonic() - start
163
+
164
+ assert elapsed < 0.1 # Should be nearly instant
165
+
166
+
167
+ class TestGetPubmedLimiter:
+ """Test PubMed-specific limiter factory."""
+
+ def setup_method(self) -> None:
+ """Reset the singleton so each test starts from a clean state."""
+ reset_pubmed_limiter()
+
+ def test_limiter_without_api_key(self) -> None:
+ """Should return 3/sec limiter without key."""
+ limiter = get_pubmed_limiter(api_key=None)
+ assert "3" in limiter.rate
+
+ def test_limiter_with_api_key(self) -> None:
+ """Should return 10/sec limiter with key."""
+ limiter = get_pubmed_limiter(api_key="my-api-key")
+ assert "10" in limiter.rate
+
+ def test_limiter_is_singleton(self) -> None:
+ """Same API key should return same limiter instance."""
+ limiter1 = get_pubmed_limiter(api_key="key1")
+ limiter2 = get_pubmed_limiter(api_key="key1")
+ assert limiter1 is limiter2
+
+ def test_different_keys_share_limiter(self) -> None:
+ """Different API keys still share one limiter - we rate-limit against the same NCBI API."""
+ limiter1 = get_pubmed_limiter(api_key="key1")
+ limiter2 = get_pubmed_limiter(api_key="key2")
+ assert limiter1 is limiter2 # Shared NCBI rate limit
194
+ ```
195
+
196
+ ---
197
+
198
+ ### Step 3: Create Rate Limiter Module
199
+
200
+ **File**: `src/tools/rate_limiter.py`
201
+
202
+ ```python
203
+ """Rate limiting utilities using the limits library."""
204
+
205
+ import asyncio
206
+ from typing import ClassVar
207
+
208
+ from limits import RateLimitItem, parse
209
+ from limits.storage import MemoryStorage
210
+ from limits.strategies import MovingWindowRateLimiter
211
+
212
+
213
+ class RateLimiter:
214
+ """
215
+ Async-compatible rate limiter using limits library.
216
+
217
+ Uses moving window algorithm for smooth rate limiting.
218
+ """
219
+
220
+ def __init__(self, rate: str) -> None:
221
+ """
222
+ Initialize rate limiter.
223
+
224
+ Args:
225
+ rate: Rate string like "3/second" or "10/second"
226
+ """
227
+ self.rate = rate
228
+ self._storage = MemoryStorage()
229
+ self._limiter = MovingWindowRateLimiter(self._storage)
230
+ self._rate_limit: RateLimitItem = parse(rate)
231
+ self._identity = "default" # Single identity for shared limiting
232
+
233
+ async def acquire(self, wait: bool = True) -> bool:
234
+ """
235
+ Acquire permission to make a request.
236
+
237
+ ASYNC-SAFE: Uses asyncio.sleep(), never time.sleep().
238
+ The polling pattern allows other coroutines to run while waiting.
239
+
240
+ Args:
241
+ wait: If True, wait until allowed. If False, return immediately.
242
+
243
+ Returns:
244
+ True if allowed, False if not (only when wait=False)
245
+ """
246
+ while True:
247
+ # Check if we can proceed (synchronous, fast - ~microseconds)
248
+ if self._limiter.hit(self._rate_limit, self._identity):
249
+ return True
250
+
251
+ if not wait:
252
+ return False
253
+
254
+ # CRITICAL: Use asyncio.sleep(), NOT time.sleep()
255
+ # This yields control to the event loop, allowing other
256
+ # coroutines (UI, parallel searches) to run
257
+ await asyncio.sleep(0.1)
258
+
259
+ def reset(self) -> None:
260
+ """Reset the rate limiter (for testing)."""
261
+ self._storage.reset()
262
+
263
+
264
+ # Singleton limiter for PubMed/NCBI
265
+ _pubmed_limiter: RateLimiter | None = None
266
+
267
+
268
+ def get_pubmed_limiter(api_key: str | None = None) -> RateLimiter:
269
+ """
270
+ Get the shared PubMed rate limiter.
271
+
272
+ Rate depends on whether API key is provided:
273
+ - Without key: 3 requests/second
274
+ - With key: 10 requests/second
275
+
276
+ Args:
277
+ api_key: NCBI API key (optional)
278
+
279
+ Returns:
280
+ Shared RateLimiter instance
281
+ """
282
+ global _pubmed_limiter
283
+
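+ # Note: first call wins - the singleton's rate is fixed by whether an API
+ # key was present on the first call (reset_pubmed_limiter() clears it)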
284
+ if _pubmed_limiter is None:
285
+ rate = "10/second" if api_key else "3/second"
286
+ _pubmed_limiter = RateLimiter(rate)
287
+
288
+ return _pubmed_limiter
289
+
290
+
291
+ def reset_pubmed_limiter() -> None:
292
+ """Reset the PubMed limiter (for testing)."""
293
+ global _pubmed_limiter
294
+ _pubmed_limiter = None
295
+
296
+
297
+ # Factory for other APIs
298
+ class RateLimiterFactory:
299
+ """Factory for creating/getting rate limiters for different APIs."""
300
+
301
+ _limiters: ClassVar[dict[str, RateLimiter]] = {}
302
+
303
+ @classmethod
304
+ def get(cls, api_name: str, rate: str) -> RateLimiter:
305
+ """
306
+ Get or create a rate limiter for an API.
307
+
308
+ Args:
309
+ api_name: Unique identifier for the API
310
+ rate: Rate limit string (e.g., "10/second")
311
+
312
+ Returns:
313
+ RateLimiter instance (shared for same api_name)
314
+ """
315
+ if api_name not in cls._limiters:
316
+ cls._limiters[api_name] = RateLimiter(rate)
317
+ return cls._limiters[api_name]
318
+
319
+ @classmethod
320
+ def reset_all(cls) -> None:
321
+ """Reset all limiters (for testing)."""
322
+ cls._limiters.clear()
323
+ ```
324
+
325
+ ---
326
+
327
+ ### Step 4: Update PubMed Tool
328
+
329
+ **File**: `src/tools/pubmed.py` (replace rate limiting code)
330
+
331
+ ```python
332
+ # Replace imports and rate limiting
333
+
334
+ from src.tools.rate_limiter import get_pubmed_limiter
335
+
336
+
337
+ class PubMedTool:
338
+ """Search tool for PubMed/NCBI."""
339
+
340
+ BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
341
+ HTTP_TOO_MANY_REQUESTS = 429
342
+
343
+ def __init__(self, api_key: str | None = None) -> None:
344
+ self.api_key = api_key or settings.ncbi_api_key
345
+ if self.api_key == "your-ncbi-key-here":
346
+ self.api_key = None
347
+ # Use shared rate limiter
348
+ self._limiter = get_pubmed_limiter(self.api_key)
349
+
350
+ async def _rate_limit(self) -> None:
351
+ """Enforce NCBI rate limiting using shared limiter."""
352
+ await self._limiter.acquire()
353
+
354
+ # ... rest of class unchanged ...
355
+ ```
356
+
357
+ ---
358
+
359
+ ### Step 5: Add Rate Limiters for Other APIs
360
+
361
+ **File**: `src/tools/clinicaltrials.py` (optional)
362
+
363
+ ```python
364
+ from src.tools.rate_limiter import RateLimiterFactory
365
+
366
+
367
+ class ClinicalTrialsTool:
368
+ def __init__(self) -> None:
369
+ # ClinicalTrials.gov doesn't document limits, but be conservative
370
+ self._limiter = RateLimiterFactory.get("clinicaltrials", "5/second")
371
+
372
+ async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
373
+ await self._limiter.acquire()
374
+ # ... rest of method ...
375
+ ```
376
+
377
+ **File**: `src/tools/europepmc.py` (optional)
378
+
379
+ ```python
380
+ from src.tools.rate_limiter import RateLimiterFactory
381
+
382
+
383
+ class EuropePMCTool:
384
+ def __init__(self) -> None:
385
+ # Europe PMC is generous, but still be respectful
386
+ self._limiter = RateLimiterFactory.get("europepmc", "10/second")
387
+
388
+ async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
389
+ await self._limiter.acquire()
390
+ # ... rest of method ...
391
+ ```
392
+
393
+ ---
394
+
395
+ ## Demo Script
396
+
397
+ **File**: `examples/rate_limiting_demo.py`
398
+
399
+ ```python
400
+ #!/usr/bin/env python3
401
+ """Demo script to verify rate limiting works correctly."""
402
+
403
+ import asyncio
404
+ import time
405
+
406
+ from src.tools.rate_limiter import RateLimiter, get_pubmed_limiter, reset_pubmed_limiter
407
+ from src.tools.pubmed import PubMedTool
408
+
409
+
410
+ async def test_basic_limiter():
411
+ """Test basic rate limiter behavior."""
412
+ print("=" * 60)
413
+ print("Rate Limiting Demo")
414
+ print("=" * 60)
415
+
416
+ # Test 1: Basic limiter
417
+ print("\n[Test 1] Testing 3/second limiter...")
418
+ limiter = RateLimiter("3/second")
419
+
420
+ start = time.monotonic()
421
+ for i in range(6):
422
+ await limiter.acquire()
423
+ elapsed = time.monotonic() - start
424
+ print(f" Request {i+1} at {elapsed:.2f}s")
425
+
426
+ total = time.monotonic() - start
427
+ print(f" Total time for 6 requests: {total:.2f}s (expected ~2s)")
428
+
429
+
430
+ async def test_pubmed_limiter():
431
+ """Test PubMed-specific limiter."""
432
+ print("\n[Test 2] Testing PubMed limiter (shared)...")
433
+
434
+ reset_pubmed_limiter() # Clean state
435
+
436
+ # Without API key: 3/sec
437
+ limiter = get_pubmed_limiter(api_key=None)
438
+ print(f" Rate without key: {limiter.rate}")
439
+
440
+ # Multiple tools should share the same limiter
441
+ tool1 = PubMedTool()
442
+ tool2 = PubMedTool()
443
+
444
+ # Verify they share the limiter
445
+ print(f" Tools share limiter: {tool1._limiter is tool2._limiter}")
446
+
447
+
448
+ async def test_concurrent_requests():
449
+ """Test rate limiting under concurrent load."""
450
+ print("\n[Test 3] Testing concurrent request limiting...")
451
+
452
+ limiter = RateLimiter("5/second")
453
+
454
+ async def make_request(i: int):
455
+ await limiter.acquire()
456
+ return time.monotonic()
457
+
458
+ start = time.monotonic()
459
+ # Launch 10 concurrent requests
460
+ tasks = [make_request(i) for i in range(10)]
461
+ times = await asyncio.gather(*tasks)
462
+
463
+ # Calculate distribution
464
+ relative_times = [t - start for t in times]
465
+ print(f" Request times: {[f'{t:.2f}s' for t in sorted(relative_times)]}")
466
+
467
+ total = max(relative_times)
468
+ print(f" All 10 requests completed in {total:.2f}s (expected ~2s)")
469
+
470
+
471
+ async def main():
472
+ await test_basic_limiter()
473
+ await test_pubmed_limiter()
474
+ await test_concurrent_requests()
475
+
476
+ print("\n" + "=" * 60)
477
+ print("Demo complete!")
478
+
479
+
480
+ if __name__ == "__main__":
481
+ asyncio.run(main())
482
+ ```
483
+
484
+ ---
485
+
486
+ ## Verification Checklist
487
+
488
+ ### Unit Tests
489
+ ```bash
490
+ # Run rate limiting tests
491
+ uv run pytest tests/unit/tools/test_rate_limiting.py -v
492
+
493
+ # Expected: All tests pass
494
+ ```
495
+
496
+ ### Integration Test (Manual)
497
+ ```bash
498
+ # Run demo
499
+ uv run python examples/rate_limiting_demo.py
500
+
501
+ # Expected: Requests properly spaced
502
+ ```
503
+
504
+ ### Full Test Suite
505
+ ```bash
506
+ make check
507
+ # Expected: All tests pass, mypy clean
508
+ ```
509
+
510
+ ---
511
+
512
+ ## Success Criteria
513
+
514
+ 1. **`limits` library installed**: Dependency added to pyproject.toml
515
+ 2. **RateLimiter class works**: Can create and use limiters
516
+ 3. **PubMed uses new limiter**: Shared limiter across instances
517
+ 4. **Rate adapts to API key**: 3/sec without, 10/sec with
518
+ 5. **Concurrent requests handled**: Multiple async requests properly queued
519
+ 6. **No regressions**: All existing tests pass
520
+
521
+ ---
522
+
523
+ ## API Rate Limit Reference
524
+
525
+ | API | Without Key | With Key |
526
+ |-----|-------------|----------|
527
+ | PubMed/NCBI | 3/sec | 10/sec |
528
+ | ClinicalTrials.gov | Undocumented (~5/sec safe) | N/A |
529
+ | Europe PMC | ~10-20/sec (generous) | N/A |
530
+ | OpenAlex | ~100k/day (no per-sec limit) | Faster with `mailto` |
531
+
532
+ ---
533
+
534
+ ## Notes
535
+
536
+ - `limits` library uses moving window algorithm (fairer than fixed window)
537
+ - Singleton pattern ensures all PubMed calls share the limit
538
+ - The factory pattern allows easy extension to other APIs
539
+ - Consider adding 429 response detection + exponential backoff
540
+ - In production, consider Redis storage for distributed rate limiting
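+
+ A hedged sketch of the 429-backoff idea from the notes (not part of the current implementation):
+
+ ```python
+ import asyncio
+ import httpx
+
+ async def get_with_backoff(client: httpx.AsyncClient, url: str, retries: int = 3) -> httpx.Response:
+     delay = 1.0
+     for _ in range(retries):
+         response = await client.get(url)
+         if response.status_code != 429:
+             return response
+         await asyncio.sleep(delay)  # exponential backoff before retrying
+         delay *= 2
+     return response  # give up after the final retry
+ ```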
docs/brainstorming/implementation/README.md ADDED
@@ -0,0 +1,143 @@
1
+ # Implementation Plans
2
+
3
+ TDD implementation plans based on the brainstorming documents. Each phase is a self-contained vertical slice with tests, implementation, and demo scripts.
4
+
5
+ ---
6
+
7
+ ## Prerequisites (COMPLETED)
8
+
9
+ The following foundational changes have been implemented to support all three phases:
10
+
11
+ | Change | File | Status |
12
+ |--------|------|--------|
13
+ | Add `"openalex"` to `SourceName` | `src/utils/models.py:9` | ✅ Done |
14
+ | Add `metadata` field to `Evidence` | `src/utils/models.py:39-42` | ✅ Done |
15
+ | Export all tools from `__init__.py` | `src/tools/__init__.py` | ✅ Done |
16
+
17
+ All 110 tests pass after these changes.
18
+
19
+ ---
20
+
21
+ ## Priority Order
22
+
23
+ | Phase | Name | Priority | Effort | Value |
24
+ |-------|------|----------|--------|-------|
25
+ | **17** | Rate Limiting | P0 CRITICAL | 1 hour | Stability |
26
+ | **15** | OpenAlex | HIGH | 2-3 hours | Very High |
27
+ | **16** | PubMed Full-Text | MEDIUM | 3 hours | High |
28
+
29
+ **Recommended implementation order**: 17 → 15 → 16
30
+
31
+ ---
32
+
33
+ ## Phase 15: OpenAlex Integration
34
+
35
+ **File**: [15_PHASE_OPENALEX.md](./15_PHASE_OPENALEX.md)
36
+
37
+ Add OpenAlex as 4th data source for:
38
+ - Citation networks (who cites whom)
39
+ - Concept tagging (semantic discovery)
40
+ - 209M+ scholarly works
41
+ - Free, no API key required
42
+
43
+ **Quick Start**:
44
+ ```bash
45
+ # Create the tool
46
+ touch src/tools/openalex.py
47
+ touch tests/unit/tools/test_openalex.py
48
+
49
+ # Run tests first (TDD)
50
+ uv run pytest tests/unit/tools/test_openalex.py -v
51
+
52
+ # Demo
53
+ uv run python examples/openalex_demo.py
54
+ ```
55
+
56
+ ---
57
+
58
+ ## Phase 16: PubMed Full-Text
59
+
60
+ **File**: [16_PHASE_PUBMED_FULLTEXT.md](./16_PHASE_PUBMED_FULLTEXT.md)
61
+
62
+ Add full-text retrieval via BioC API for:
63
+ - Complete paper text (not just abstracts)
64
+ - Structured sections (intro, methods, results)
65
+ - Better evidence for LLM synthesis
66
+
67
+ **Quick Start**:
68
+ ```bash
69
+ # Add methods to existing pubmed.py
70
+ # Tests in test_pubmed_fulltext.py
71
+
72
+ # Run tests
73
+ uv run pytest tests/unit/tools/test_pubmed_fulltext.py -v
74
+
75
+ # Demo
76
+ uv run python examples/pubmed_fulltext_demo.py
77
+ ```
78
+
79
+ ---
80
+
81
+ ## Phase 17: Rate Limiting
82
+
83
+ **File**: [17_PHASE_RATE_LIMITING.md](./17_PHASE_RATE_LIMITING.md)
84
+
85
+ Replace naive sleep-based rate limiting with `limits` library for:
86
+ - Moving window algorithm
87
+ - Shared limits across instances
88
+ - Configurable per-API rates
89
+ - Production-grade stability
90
+
91
+ **Quick Start**:
92
+ ```bash
93
+ # Add dependency
94
+ uv add limits
95
+
96
+ # Create module
97
+ touch src/tools/rate_limiter.py
98
+ touch tests/unit/tools/test_rate_limiting.py
99
+
100
+ # Run tests
101
+ uv run pytest tests/unit/tools/test_rate_limiting.py -v
102
+
103
+ # Demo
104
+ uv run python examples/rate_limiting_demo.py
105
+ ```
106
+
107
+ ---
108
+
109
+ ## TDD Workflow
110
+
111
+ Each implementation doc follows this pattern:
112
+
113
+ 1. **Write tests first** - Define expected behavior
114
+ 2. **Run tests** - Verify they fail (red)
115
+ 3. **Implement** - Write minimal code to pass
116
+ 4. **Run tests** - Verify they pass (green)
117
+ 5. **Refactor** - Clean up if needed
118
+ 6. **Demo** - Verify end-to-end with real APIs
119
+ 7. **`make check`** - Ensure no regressions
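+
+ In shell form, one red-green cycle for a new tool looks roughly like this (paths follow the phase docs above):
+
+ ```bash
+ uv run pytest tests/unit/tools/test_openalex.py -v   # red: tests fail first
+ # ...implement src/tools/openalex.py...
+ uv run pytest tests/unit/tools/test_openalex.py -v   # green: tests pass
+ make check                                           # no regressions
+ ```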
120
+
121
+ ---
122
+
123
+ ## Related Brainstorming Docs
124
+
125
+ These implementation plans are derived from:
126
+
127
+ - [00_ROADMAP_SUMMARY.md](../00_ROADMAP_SUMMARY.md) - Priority overview
128
+ - [01_PUBMED_IMPROVEMENTS.md](../01_PUBMED_IMPROVEMENTS.md) - PubMed details
129
+ - [02_CLINICALTRIALS_IMPROVEMENTS.md](../02_CLINICALTRIALS_IMPROVEMENTS.md) - CT.gov details
130
+ - [03_EUROPEPMC_IMPROVEMENTS.md](../03_EUROPEPMC_IMPROVEMENTS.md) - Europe PMC details
131
+ - [04_OPENALEX_INTEGRATION.md](../04_OPENALEX_INTEGRATION.md) - OpenAlex integration
132
+
133
+ ---
134
+
135
+ ## Future Phases (Not Yet Documented)
136
+
137
+ Based on brainstorming, these could be added later:
138
+
139
+ - **Phase 18**: ClinicalTrials.gov Results Retrieval
140
+ - **Phase 19**: Europe PMC Annotations API
141
+ - **Phase 20**: Drug Name Normalization (RxNorm)
142
+ - **Phase 21**: Citation Network Queries (OpenAlex)
143
+ - **Phase 22**: Semantic Search with Embeddings
docs/brainstorming/magentic-pydantic/00_SITUATION_AND_PLAN.md ADDED
@@ -0,0 +1,189 @@
1
+ # Situation Analysis: Pydantic-AI + Microsoft Agent Framework Integration
2
+
3
+ **Date:** November 27, 2025
4
+ **Status:** ACTIVE DECISION REQUIRED
5
+ **Risk Level:** HIGH - DO NOT MERGE PR #41 UNTIL RESOLVED
6
+
7
+ ---
8
+
9
+ ## 1. The Problem
10
+
11
+ We almost merged a refactor that would have **deleted** multi-agent orchestration capability from the codebase, mistakenly believing pydantic-ai and Microsoft Agent Framework were mutually exclusive.
12
+
13
+ **They are not.** They are complementary:
14
+ - **pydantic-ai** (Library): Ensures LLM outputs match Pydantic schemas
15
+ - **Microsoft Agent Framework** (Framework): Orchestrates multi-agent workflows
16
+
17
+ ---
18
+
19
+ ## 2. Current Branch State
20
+
21
+ | Branch | Location | Has Agent Framework? | Has Pydantic-AI Improvements? | Status |
22
+ |--------|----------|---------------------|------------------------------|--------|
23
+ | `origin/dev` | GitHub | YES | NO | **SAFE - Source of Truth** |
24
+ | `huggingface-upstream/dev` | HF Spaces | YES | NO | **SAFE - Same as GitHub** |
25
+ | `origin/main` | GitHub | YES | NO | **SAFE** |
26
+ | `feat/pubmed-fulltext` | GitHub | NO (deleted) | YES | **DANGER - Has destructive refactor** |
27
+ | `refactor/pydantic-unification` | Local | NO (deleted) | YES | **DANGER - Redundant, delete** |
28
+ | Local `dev` | Local only | NO (deleted) | YES | **DANGER - NOT PUSHED (thankfully)** |
29
+
30
+ ### Key Files at Risk
31
+
32
+ **On `origin/dev` (PRESERVED):**
33
+ ```text
34
+ src/agents/
35
+ ├── analysis_agent.py # StatisticalAnalyzer wrapper
36
+ ├── hypothesis_agent.py # Hypothesis generation
37
+ ├── judge_agent.py # JudgeHandler wrapper
38
+ ├── magentic_agents.py # Multi-agent definitions
39
+ ├── report_agent.py # Report synthesis
40
+ ├── search_agent.py # SearchHandler wrapper
41
+ ├── state.py # Thread-safe state management
42
+ └── tools.py # @ai_function decorated tools
43
+
44
+ src/orchestrator_magentic.py # Multi-agent orchestrator
45
+ src/utils/llm_factory.py # Centralized LLM client factory
46
+ ```
47
+
48
+ **Deleted in refactor branch (would be lost if merged):**
49
+ - All of the above
50
+
51
+ ---
52
+
53
+ ## 3. Target Architecture
54
+
55
+ ```text
56
+ ┌─────────────────────────────────────────────────────────────────┐
57
+ │ Microsoft Agent Framework (Orchestration Layer) │
58
+ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
59
+ │ │ SearchAgent │→ │ JudgeAgent │→ │ ReportAgent │ │
60
+ │ │ (BaseAgent) │ │ (BaseAgent) │ │ (BaseAgent) │ │
61
+ │ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │
62
+ │ │ │ │ │
63
+ │ ▼ ▼ ▼ │
64
+ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
65
+ │ │ pydantic-ai │ │ pydantic-ai │ │ pydantic-ai │ │
66
+ │ │ Agent() │ │ Agent() │ │ Agent() │ │
67
+ │ │ output_type= │ │ output_type= │ │ output_type= │ │
68
+ │ │ SearchResult │ │ JudgeAssess │ │ Report │ │
69
+ │ └──────────────┘ └──────────────┘ └──────────────┘ │
70
+ └─────────────────────────────────────────────────────────────────┘
71
+ ```
72
+
73
+ **Why this architecture:**
74
+ 1. **Agent Framework** handles: workflow coordination, state passing, middleware, observability
75
+ 2. **pydantic-ai** handles: type-safe LLM calls within each agent
76
+
77
+ ---
78
+
79
+ ## 4. CRITICAL: Naming Confusion Clarification
80
+
81
+ > **Senior Agent Review Finding:** The codebase uses "magentic" in file names (e.g., `orchestrator_magentic.py`, `magentic_agents.py`) but this is **NOT** the `magentic` PyPI package by Jacky Liang. It's Microsoft Agent Framework (`agent-framework-core`).
82
+
83
+ **The naming confusion:**
84
+ - `magentic` (PyPI package): A different library for structured LLM outputs
85
+ - "Magentic" (in our codebase): Our internal name for Microsoft Agent Framework integration
86
+ - `agent-framework-core` (PyPI package): Microsoft's actual multi-agent orchestration framework
87
+
88
+ **Recommended future action:** Rename `orchestrator_magentic.py` → `orchestrator_advanced.py` to eliminate confusion.
89
+
90
+ ---
91
+
92
+ ## 5. What the Refactor DID Get Right
93
+
94
+ The refactor branch (`feat/pubmed-fulltext`) has some valuable improvements:
95
+
96
+ 1. **`judges.py` unified `get_model()`** - Supports OpenAI, Anthropic, AND HuggingFace via pydantic-ai
97
+ 2. **HuggingFace free tier support** - `HuggingFaceModel` integration
98
+ 3. **Test fix** - Properly mocks `HuggingFaceModel` class
99
+ 4. **Removed broken magentic optional dependency** from pyproject.toml (this was correct - the old `magentic` package is different from Microsoft Agent Framework)
100
+
101
+ **What it got WRONG:**
102
+ 1. Deleted `src/agents/` entirely instead of refactoring them
103
+ 2. Deleted `src/orchestrator_magentic.py` instead of fixing it
104
+ 3. Conflated "magentic" (old package) with "Microsoft Agent Framework" (current framework)
105
+
106
+ ---
107
+
108
+ ## 6. Options for Path Forward
109
+
110
+ ### Option A: Abandon Refactor, Start Fresh
111
+ - Close PR #41
112
+ - Delete `feat/pubmed-fulltext` and `refactor/pydantic-unification` branches
113
+ - Reset local `dev` to match `origin/dev`
114
+ - Cherry-pick ONLY the good parts (judges.py improvements, HF support)
115
+ - **Pros:** Clean, safe
116
+ - **Cons:** Lose some work, need to redo carefully
117
+
118
+ ### Option B: Cherry-Pick Good Parts to origin/dev
119
+ - Do NOT merge PR #41
120
+ - Create new branch from `origin/dev`
121
+ - Cherry-pick specific commits/changes that improve pydantic-ai usage
122
+ - Keep agent framework code intact
123
+ - **Pros:** Preserves both, surgical
124
+ - **Cons:** Requires careful file-by-file review
125
+
126
+ ### Option C: Revert Deletions in Refactor Branch
127
+ - On `feat/pubmed-fulltext`, restore deleted agent files from `origin/dev`
128
+ - Keep the pydantic-ai improvements
129
+ - Merge THAT to dev
130
+ - **Pros:** Gets both
131
+ - **Cons:** Complex git operations, risk of conflicts
132
+
133
+ ---
134
+
135
+ ## 7. Recommended Action: Option B (Cherry-Pick)
136
+
137
+ **Step-by-step:**
138
+
139
+ 1. **Close PR #41** (do not merge)
140
+ 2. **Delete redundant branches:**
141
+ - `refactor/pydantic-unification` (local)
142
+ - Reset local `dev` to `origin/dev`
143
+ 3. **Create new branch from origin/dev:**
144
+ ```bash
145
+ git checkout -b feat/pydantic-ai-improvements origin/dev
146
+ ```
147
+ 4. **Cherry-pick or manually port these improvements** (see the sketch after this list):
148
+ - `src/agent_factory/judges.py` - the unified `get_model()` function
149
+ - `examples/free_tier_demo.py` - HuggingFace demo
150
+ - Test improvements
151
+ 5. **Do NOT delete any agent framework files**
152
+ 6. **Create PR for review**
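+
+ For step 4, porting at the file level avoids pulling in the destructive commits; a sketch using the paths listed above:
+
+ ```bash
+ # On feat/pydantic-ai-improvements: copy specific files from the refactor branch
+ git checkout feat/pubmed-fulltext -- src/agent_factory/judges.py
+ git checkout feat/pubmed-fulltext -- examples/free_tier_demo.py
+ git checkout feat/pubmed-fulltext -- tests/unit/agent_factory/test_judges.py
+ git diff --stat   # confirm only the intended files changed
+ ```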
153
+
154
+ ---
155
+
156
+ ## 8. Files to Cherry-Pick (Safe Improvements)
157
+
158
+ | File | What Changed | Safe to Port? |
159
+ |------|-------------|---------------|
160
+ | `src/agent_factory/judges.py` | Added `HuggingFaceModel` support in `get_model()` | YES |
161
+ | `examples/free_tier_demo.py` | New demo for HF inference | YES |
162
+ | `tests/unit/agent_factory/test_judges.py` | Fixed HF model mocking | YES |
163
+ | `pyproject.toml` | Removed old `magentic` optional dep | MAYBE (review carefully) |
164
+
165
+ ---
166
+
167
+ ## 9. Questions to Answer Before Proceeding
168
+
169
+ 1. **For the hackathon**: Do we need full multi-agent orchestration, or is single-agent sufficient?
170
+ 2. **For DeepCritical mainline**: Is the plan to use Microsoft Agent Framework for orchestration?
171
+ 3. **Timeline**: How much time do we have to get this right?
172
+
173
+ ---
174
+
175
+ ## 10. Immediate Actions (DO NOW)
176
+
177
+ - [ ] **DO NOT merge PR #41**
178
+ - [ ] Close PR #41 with comment explaining the situation
179
+ - [ ] Do not push local `dev` branch anywhere
180
+ - [ ] Confirm HuggingFace Spaces is untouched (it is - verified)
181
+
182
+ ---
183
+
184
+ ## 11. Decision Log
185
+
186
+ | Date | Decision | Rationale |
187
+ |------|----------|-----------|
188
+ | 2025-11-27 | Pause refactor merge | Discovered agent framework and pydantic-ai are complementary, not exclusive |
189
+ | TBD | ? | Awaiting decision on path forward |
docs/brainstorming/magentic-pydantic/01_ARCHITECTURE_SPEC.md ADDED
@@ -0,0 +1,289 @@
1
+ # Architecture Specification: Dual-Mode Agent System
2
+
3
+ **Date:** November 27, 2025
4
+ **Status:** SPECIFICATION
5
+ **Goal:** Graceful degradation from full multi-agent orchestration to simple single-agent mode
6
+
7
+ ---
8
+
9
+ ## 1. Core Concept: Two Operating Modes
10
+
11
+ ```text
12
+ ┌─────────────────────────────────────────────────────────────────────┐
13
+ │ USER REQUEST │
14
+ │ │ │
15
+ │ ▼ │
16
+ │ ┌─────────────────┐ │
17
+ │ │ Mode Selection │ │
18
+ │ │ (Auto-detect) │ │
19
+ │ └────────┬────────┘ │
20
+ │ │ │
21
+ │ ┌───────────────┴───────────────┐ │
22
+ │ │ │ │
23
+ │ ▼ ▼ │
24
+ │ ┌─────────────────┐ ┌─────────────────┐ │
25
+ │ │ SIMPLE MODE │ │ ADVANCED MODE │ │
26
+ │ │ (Free Tier) │ │ (Paid Tier) │ │
27
+ │ │ │ │ │ │
28
+ │ │ pydantic-ai │ │ MS Agent Fwk │ │
29
+ │ │ single-agent │ │ + pydantic-ai │ │
30
+ │ │ loop │ │ multi-agent │ │
31
+ │ └─────────────────┘ └─────────────────┘ │
32
+ │ │ │ │
33
+ │ └───────────────┬───────────────┘ │
34
+ │ ▼ │
35
+ │ ┌─────────────────┐ │
36
+ │ │ Research Report │ │
37
+ │ │ with Citations │ │
38
+ │ └─────────────────┘ │
39
+ └─────────────────────────────────────────────────────────────────────┘
40
+ ```
41
+
42
+ ---
43
+
44
+ ## 2. Mode Comparison
45
+
46
+ | Aspect | Simple Mode | Advanced Mode |
47
+ |--------|-------------|---------------|
48
+ | **Trigger** | No API key OR `LLM_PROVIDER=huggingface` | OpenAI API key present (currently OpenAI only) |
49
+ | **Framework** | pydantic-ai only | Microsoft Agent Framework + pydantic-ai |
50
+ | **Architecture** | Single orchestrator loop | Multi-agent coordination |
51
+ | **Agents** | One agent does Search→Judge→Report | SearchAgent, JudgeAgent, ReportAgent, AnalysisAgent |
52
+ | **State Management** | Simple dict | Thread-safe `MagenticState` with context vars |
53
+ | **Quality** | Good (functional) | Better (specialized agents, coordination) |
54
+ | **Cost** | Free (HuggingFace Inference) | Paid (OpenAI/Anthropic) |
55
+ | **Use Case** | Demos, hackathon, budget-constrained | Production, research quality |
56
+
57
+ ---
58
+
59
+ ## 3. Simple Mode Architecture (pydantic-ai Only)
60
+
61
+ ```text
62
+ ┌─────────────────────────────────────────────────────┐
63
+ │ Orchestrator │
64
+ │ │
65
+ │ while not sufficient and iteration < max: │
66
+ │ 1. SearchHandler.execute(query) │
67
+ │ 2. JudgeHandler.assess(evidence) ◄── pydantic-ai Agent │
68
+ │ 3. if sufficient: break │
69
+ │ 4. query = judge.next_queries │
70
+ │ │
71
+ │ return ReportGenerator.generate(evidence) │
72
+ └─────────────────────────────────────────────────────┘
73
+ ```
74
+
75
+ **Components:**
76
+ - `src/orchestrator.py` - Simple loop orchestrator
77
+ - `src/agent_factory/judges.py` - JudgeHandler with pydantic-ai
78
+ - `src/tools/search_handler.py` - Scatter-gather search
79
+ - `src/tools/pubmed.py`, `clinicaltrials.py`, `europepmc.py` - Search tools
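+
+ The loop in the diagram above, as a hedged Python sketch (names mirror the components listed; exact signatures may differ):
+
+ ```python
+ async def run(question: str, max_iterations: int = 5) -> str:
+     query = question
+     evidence: list[Evidence] = []
+     for _ in range(max_iterations):
+         evidence += await search_handler.execute(query)
+         assessment = await judge_handler.assess(question, evidence)
+         if assessment.sufficient:
+             break
+         query = assessment.next_queries[0]  # judge proposes the next query
+     return await report_generator.generate(question, evidence)
+ ```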
80
+
81
+ ---
82
+
83
+ ## 4. Advanced Mode Architecture (MS Agent Framework + pydantic-ai)
84
+
85
+ ```text
86
+ ┌─────────────────────────────────────────────────────────────────────┐
87
+ │ Microsoft Agent Framework Orchestrator │
88
+ │ │
89
+ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
90
+ │ │ SearchAgent │───▶│ JudgeAgent │───▶│ ReportAgent │ │
91
+ │ │ (BaseAgent) │ │ (BaseAgent) │ │ (BaseAgent) │ │
92
+ │ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │
93
+ │ │ │ │ │
94
+ │ ▼ ▼ ▼ │
95
+ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
96
+ │ │ pydantic-ai │ │ pydantic-ai │ │ pydantic-ai │ │
97
+ │ │ Agent() │ │ Agent() │ │ Agent() │ │
98
+ │ │ output_type=│ │ output_type=│ │ output_type=│ │
99
+ │ │ SearchResult│ │ JudgeAssess │ │ Report │ │
100
+ │ └─────────────┘ └─────────────┘ └─────────────┘ │
101
+ │ │
102
+ │ Shared State: MagenticState (thread-safe via contextvars) │
103
+ │ - evidence: list[Evidence] │
104
+ │ - embedding_service: EmbeddingService │
105
+ └─────────────────────────────────────────────────────────────────────┘
106
+ ```
107
+
108
+ **Components:**
109
+ - `src/orchestrator_magentic.py` - Multi-agent orchestrator
110
+ - `src/agents/search_agent.py` - SearchAgent (BaseAgent)
111
+ - `src/agents/judge_agent.py` - JudgeAgent (BaseAgent)
112
+ - `src/agents/report_agent.py` - ReportAgent (BaseAgent)
113
+ - `src/agents/analysis_agent.py` - AnalysisAgent (BaseAgent)
114
+ - `src/agents/state.py` - Thread-safe state management
115
+ - `src/agents/tools.py` - @ai_function decorated tools
116
+
117
+ ---
118
+
119
+ ## 5. Mode Selection Logic
120
+
121
+ ```python
122
+ # src/orchestrator_factory.py (actual implementation)
123
+
124
+ def create_orchestrator(
125
+ search_handler: SearchHandlerProtocol | None = None,
126
+ judge_handler: JudgeHandlerProtocol | None = None,
127
+ config: OrchestratorConfig | None = None,
128
+ mode: Literal["simple", "magentic", "advanced"] | None = None,
129
+ ) -> Any:
130
+ """
131
+ Auto-select orchestrator based on available credentials.
132
+
133
+ Priority:
134
+ 1. If mode explicitly set, use that
135
+ 2. If OpenAI key available -> Advanced Mode (currently OpenAI only)
136
+ 3. Otherwise -> Simple Mode (HuggingFace free tier)
137
+ """
138
+ effective_mode = _determine_mode(mode)
139
+
140
+ if effective_mode == "advanced":
141
+ orchestrator_cls = _get_magentic_orchestrator_class()
142
+ return orchestrator_cls(max_rounds=config.max_iterations if config else 10)
143
+
144
+ # Simple mode requires handlers
145
+ if search_handler is None or judge_handler is None:
146
+ raise ValueError("Simple mode requires search_handler and judge_handler")
147
+
148
+ return Orchestrator(
149
+ search_handler=search_handler,
150
+ judge_handler=judge_handler,
151
+ config=config,
152
+ )
153
+ ```
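+
+ A minimal call site, for illustration (a sketch assuming the `SearchHandler` and `JudgeHandler` constructors take no required arguments, which may not hold in the actual code):
+
+ ```python
+ # Hypothetical call site: the factory picks the mode from the environment.
+ from src.agent_factory.judges import JudgeHandler
+ from src.orchestrator_factory import create_orchestrator
+ from src.tools.search_handler import SearchHandler
+
+ # With an OpenAI key set this returns the advanced (Magentic) orchestrator;
+ # without one it falls back to the simple pydantic-ai loop.
+ orchestrator = create_orchestrator(
+     search_handler=SearchHandler(),
+     judge_handler=JudgeHandler(),
+ )
+ ```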
154
+
155
+ ---
156
+
157
+ ## 6. Shared Components (Both Modes Use)
158
+
159
+ These components work in both modes:
160
+
161
+ | Component | Purpose |
162
+ |-----------|---------|
163
+ | `src/tools/pubmed.py` | PubMed search |
164
+ | `src/tools/clinicaltrials.py` | ClinicalTrials.gov search |
165
+ | `src/tools/europepmc.py` | Europe PMC search |
166
+ | `src/tools/search_handler.py` | Scatter-gather orchestration |
167
+ | `src/tools/rate_limiter.py` | Rate limiting |
168
+ | `src/utils/models.py` | Evidence, Citation, JudgeAssessment |
169
+ | `src/utils/config.py` | Settings |
170
+ | `src/services/embeddings.py` | Vector search (optional) |
171
+
172
+ ---
173
+
174
+ ## 7. pydantic-ai Integration Points
175
+
176
+ Both modes use pydantic-ai for structured LLM outputs:
177
+
178
+ ```python
179
+ # In JudgeHandler (both modes)
180
+ from pydantic_ai import Agent
181
+ from pydantic_ai.models.huggingface import HuggingFaceModel
182
+ from pydantic_ai.models.openai import OpenAIModel
183
+ from pydantic_ai.models.anthropic import AnthropicModel  # get_model() may return any of these
184
+
185
+ class JudgeHandler:
186
+ def __init__(self, model: Any = None):
187
+ self.model = model or get_model() # Auto-selects based on config
188
+ self.agent = Agent(
189
+ model=self.model,
190
+ output_type=JudgeAssessment, # Structured output!
191
+ system_prompt=SYSTEM_PROMPT,
192
+ )
193
+
194
+ async def assess(self, question: str, evidence: list[Evidence]) -> JudgeAssessment:
195
+ result = await self.agent.run(format_prompt(question, evidence))
196
+ return result.output # Guaranteed to be JudgeAssessment
197
+ ```
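+
+ Calling it is then a one-liner. A sketch (the `sufficient` / `next_queries` fields follow the Simple Mode loop in Section 3):
+
+ ```python
+ # Hypothetical call site: the assessment has already been validated.
+ assessment = await JudgeHandler().assess(question, evidence)
+ if not assessment.sufficient:
+     next_query = " ".join(assessment.next_queries)
+ ```
+
+ If the model's reply fails schema validation, pydantic-ai re-prompts up to its configured retry limit before raising, which is what makes `result.output` safe to use without manual JSON parsing.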
198
+
199
+ ---
200
+
201
+ ## 8. Microsoft Agent Framework Integration Points
202
+
203
+ Advanced mode wraps pydantic-ai agents in BaseAgent:
204
+
205
+ ```python
206
+ # In JudgeAgent (advanced mode only)
207
+ from agent_framework import BaseAgent, AgentRunResponse, ChatMessage, Role
208
+
209
+ class JudgeAgent(BaseAgent):
210
+ def __init__(self, judge_handler: JudgeHandlerProtocol):
211
+ super().__init__(
212
+ name="JudgeAgent",
213
+ description="Evaluates evidence quality",
214
+ )
215
+ self._handler = judge_handler # Uses pydantic-ai internally
216
+
217
+ async def run(self, messages, **kwargs) -> AgentRunResponse:
218
+ question = extract_question(messages)
219
+ evidence = self._evidence_store.get("current", [])
220
+
221
+ # Delegate to pydantic-ai powered handler
222
+ assessment = await self._handler.assess(question, evidence)
223
+
224
+ return AgentRunResponse(
225
+ messages=[ChatMessage(role=Role.ASSISTANT, text=format_response(assessment))],
226
+ additional_properties={"assessment": assessment.model_dump()},
227
+ )
228
+ ```
229
+
230
+ ---
231
+
232
+ ## 9. Benefits of This Architecture
233
+
234
+ 1. **Graceful Degradation**: Works without API keys (free tier)
235
+ 2. **Progressive Enhancement**: Better with API keys (orchestration)
236
+ 3. **Code Reuse**: pydantic-ai handlers shared between modes
237
+ 4. **Hackathon Ready**: Demo works without requiring paid keys
238
+ 5. **Production Ready**: Full orchestration available when needed
239
+ 6. **Future Proof**: Can add more agents to advanced mode
240
+ 7. **Testable**: Simple mode is easier to unit test
241
+
242
+ ---
243
+
244
+ ## 10. Known Risks and Mitigations
245
+
246
+ > **From Senior Agent Review**
247
+
248
+ ### 10.1 Bridge Complexity (MEDIUM)
249
+
250
+ **Risk:** In Advanced Mode, agents (Agent Framework) wrap handlers (pydantic-ai). Both are async. Context variables (`MagenticState`) must propagate correctly through the pydantic-ai call stack.
251
+
252
+ **Mitigation:**
253
+ - pydantic-ai uses standard Python `contextvars`, which naturally propagate through `await` chains (demonstrated in the sketch below)
254
+ - Test context propagation explicitly in integration tests
255
+ - If issues arise, pass state explicitly rather than via context vars
256
+
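+ The first mitigation point is plain stdlib behavior; a runnable sketch, with no project code involved:
+
+ ```python
+ # contextvars set in a caller are visible in awaited coroutines on the same task.
+ import asyncio
+ import contextvars
+
+ evidence_var: contextvars.ContextVar[list[str]] = contextvars.ContextVar("evidence")
+
+ async def handler() -> None:
+     # Sees the list set by the orchestrator, across the await boundary.
+     evidence_var.get().append("pubmed:12345")
+
+ async def orchestrator() -> None:
+     evidence_var.set([])
+     await handler()
+     assert evidence_var.get() == ["pubmed:12345"]
+
+ asyncio.run(orchestrator())
+ ```
+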
257
+ ### 10.2 Integration Drift (MEDIUM)
258
+
259
+ **Risk:** Simple Mode and Advanced Mode might diverge in behavior over time (e.g., Simple Mode uses logic A, Advanced Mode uses logic B).
260
+
261
+ **Mitigation:**
262
+ - Both modes MUST call the exact same underlying Tools (`src/tools/*`) and Handlers (`src/agent_factory/*`)
263
+ - Handlers are the single source of truth for business logic
264
+ - Agents are thin wrappers that delegate to handlers
265
+
266
+ ### 10.3 Testing Burden (LOW-MEDIUM)
267
+
268
+ **Risk:** Two distinct orchestrators (`src/orchestrator.py` and `src/orchestrator_magentic.py`) doubles integration testing surface area.
269
+
270
+ **Mitigation:**
271
+ - Unit test handlers independently (shared code)
272
+ - Integration tests for each mode separately
273
+ - End-to-end tests verify same output for same input (determinism permitting)
274
+
275
+ ### 10.4 Dependency Conflicts (LOW)
276
+
277
+ **Risk:** `agent-framework-core` might conflict with `pydantic-ai`'s dependencies (e.g., different pydantic versions).
278
+
279
+ **Status:** Both depend on `pydantic>=2.x`, so they should be compatible; verify at install time with `uv sync --all-extras`.
280
+
281
+ ---
282
+
283
+ ## 11. Naming Clarification
284
+
285
+ > See `00_SITUATION_AND_PLAN.md` Section 4 for full details.
286
+
287
+ **Important:** The codebase uses "magentic" in file names (`orchestrator_magentic.py`, `magentic_agents.py`) but this refers to our internal naming for Microsoft Agent Framework integration, **NOT** the `magentic` PyPI package.
288
+
289
+ **Future action:** Rename to `orchestrator_advanced.py` to eliminate confusion.
docs/brainstorming/magentic-pydantic/02_IMPLEMENTATION_PHASES.md ADDED
@@ -0,0 +1,112 @@
1
+ # Implementation Phases: Dual-Mode Agent System
2
+
3
+ **Date:** November 27, 2025
4
+ **Status:** IMPLEMENTATION PLAN (REVISED)
5
+ **Strategy:** TDD (Test-Driven Development), SOLID Principles
6
+ **Dependency Strategy:** PyPI (agent-framework-core)
7
+
8
+ ---
9
+
10
+ ## Phase 0: Environment Validation & Cleanup
11
+
12
+ **Goal:** Ensure clean state and dependencies are correctly installed.
13
+
14
+ ### Step 0.1: Verify PyPI Package
15
+ The `agent-framework-core` package is published on PyPI by Microsoft. Verify installation:
16
+
17
+ ```bash
18
+ uv sync --all-extras
19
+ python -c "from agent_framework import ChatAgent; print('OK')"
20
+ ```
21
+
22
+ ### Step 0.2: Branch State
23
+ We are on `feat/dual-mode-architecture`. Ensure it is up to date with `origin/dev` before starting.
24
+
25
+ **Note:** The `reference_repos/agent-framework` folder is kept for reference/documentation only.
26
+ The production dependency uses the official PyPI release.
27
+
28
+ ---
29
+
30
+ ## Phase 1: Pydantic-AI Improvements (Simple Mode)
31
+
32
+ **Goal:** Implement `HuggingFaceModel` support in `JudgeHandler` using strict TDD.
33
+
34
+ ### Step 1.1: Test First (Red)
35
+ Create `tests/unit/agent_factory/test_judges_factory.py`:
36
+ - Test `get_model()` returns `HuggingFaceModel` when `LLM_PROVIDER=huggingface`.
37
+ - Test `get_model()` respects `HF_TOKEN`.
38
+ - Test fallback to OpenAI (one of these tests is sketched below).
39
+
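+ One of these tests, sketched (the env var names are assumptions to be confirmed against `src/utils/config.py`):
+
+ ```python
+ # tests/unit/agent_factory/test_judges_factory.py (illustrative sketch)
+ import pytest
+ from pydantic_ai.models.huggingface import HuggingFaceModel
+
+ from src.agent_factory.judges import get_model
+
+ def test_get_model_returns_hf_model(monkeypatch: pytest.MonkeyPatch) -> None:
+     monkeypatch.setenv("LLM_PROVIDER", "huggingface")
+     monkeypatch.setenv("HF_TOKEN", "hf_dummy_token")
+     # NOTE: if settings are cached at import time, they must be reloaded here.
+     assert isinstance(get_model(), HuggingFaceModel)
+ ```
+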
40
+ ### Step 1.2: Implementation (Green)
41
+ Update `src/utils/config.py`:
42
+ - Add `huggingface_model` and `hf_token` fields.
43
+
44
+ Update `src/agent_factory/judges.py`:
45
+ - Implement `get_model` with the logic derived from the tests.
46
+ - Use dependency injection for the model where possible (rough shape sketched below).
47
+
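+ A possible shape for the selection logic (a sketch assuming `settings` exposes `llm_provider`, `openai_api_key`, `huggingface_model`, and `openai_model`; the Step 1.1 tests define the real contract):
+
+ ```python
+ from pydantic_ai.models.huggingface import HuggingFaceModel
+ from pydantic_ai.models.openai import OpenAIModel
+
+ from src.utils.config import settings
+
+ def get_model():
+     # Free tier: explicit opt-in, or no paid key available.
+     if settings.llm_provider == "huggingface" or not settings.openai_api_key:
+         return HuggingFaceModel(settings.huggingface_model)
+     return OpenAIModel(settings.openai_model)
+ ```
+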
48
+ ### Step 1.3: Refactor
49
+ Ensure `JudgeHandler` is loosely coupled from the specific model provider.
50
+
51
+ ---
52
+
53
+ ## Phase 2: Orchestrator Factory (The Switch)
54
+
55
+ **Goal:** Implement the factory pattern to switch between Simple and Advanced modes.
56
+
57
+ ### Step 2.1: Test First (Red)
58
+ Create `tests/unit/test_orchestrator_factory.py`:
59
+ - Test `create_orchestrator` returns `Orchestrator` (simple) when API keys are missing.
60
+ - Test `create_orchestrator` returns `MagenticOrchestrator` (advanced) when OpenAI key exists.
61
+ - Test explicit mode override (first case sketched below).
62
+
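+ The first case, sketched (the `object()` stand-ins are hypothetical; a real test would use protocol stubs, and cached settings may need resetting after the env change):
+
+ ```python
+ # tests/unit/test_orchestrator_factory.py (illustrative sketch)
+ from src.orchestrator import Orchestrator
+ from src.orchestrator_factory import create_orchestrator
+
+ def test_simple_mode_when_no_api_key(monkeypatch) -> None:
+     monkeypatch.delenv("OPENAI_API_KEY", raising=False)
+     orch = create_orchestrator(
+         search_handler=object(),  # stand-in; replace with a protocol stub
+         judge_handler=object(),
+     )
+     assert isinstance(orch, Orchestrator)
+ ```
+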
63
+ ### Step 2.2: Implementation (Green)
64
+ Update `src/orchestrator_factory.py` to implement the selection logic.
65
+
66
+ ---
67
+
68
+ ## Phase 3: Agent Framework Integration (Advanced Mode)
69
+
70
+ **Goal:** Integrate Microsoft Agent Framework from PyPI.
71
+
72
+ ### Step 3.1: Dependency Management
73
+ The `agent-framework-core` package is installed from PyPI:
74
+ ```toml
75
+ [project.optional-dependencies]
76
+ magentic = [
77
+ "agent-framework-core>=1.0.0b251120,<2.0.0", # Microsoft Agent Framework (PyPI)
78
+ ]
79
+ ```
80
+ Install with: `uv sync --all-extras`
81
+
82
+ ### Step 3.2: Verify Imports (Test First)
83
+ Create `tests/unit/agents/test_agent_imports.py`:
84
+ - Verify `from agent_framework import ChatAgent` works.
85
+ - Verify instantiation of `ChatAgent` with a mock client (import smoke test sketched below).
86
+
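+ The import check can be a trivial smoke test; instantiation needs a mock chat client, whose exact interface should be taken from the agent-framework docs rather than guessed:
+
+ ```python
+ # tests/unit/agents/test_agent_imports.py (illustrative sketch)
+ def test_chat_agent_importable() -> None:
+     from agent_framework import ChatAgent  # noqa: F401  # PyPI: agent-framework-core
+ ```
+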
87
+ ### Step 3.3: Update Agents
88
+ Refactor `src/agents/*.py` to ensure they match the exact signature of the `ChatAgent` class shipped in `agent-framework-core`.
89
+ - **SOLID:** Ensure agents have single responsibilities.
90
+ - **DRY:** Share tool definitions between Pydantic-AI simple mode and Agent Framework advanced mode.
91
+
92
+ ---
93
+
94
+ ## Phase 4: UI & End-to-End Verification
95
+
96
+ **Goal:** Update Gradio to reflect the active mode.
97
+
98
+ ### Step 4.1: UI Updates
99
+ Update `src/app.py` to display "Simple Mode" vs "Advanced Mode".
100
+
101
+ ### Step 4.2: End-to-End Test
102
+ Run the full loop:
103
+ 1. Simple Mode (No Keys) -> Search -> Judge (HF) -> Report.
104
+ 2. Advanced Mode (OpenAI Key) -> SearchAgent -> JudgeAgent -> ReportAgent.
105
+
106
+ ---
107
+
108
+ ## Phase 5: Cleanup & Documentation
109
+
110
+ - Remove unused code.
111
+ - Update main README.md.
112
+ - Final `make check`.
docs/brainstorming/magentic-pydantic/03_IMMEDIATE_ACTIONS.md ADDED
@@ -0,0 +1,112 @@
1
+ # Immediate Actions Checklist
2
+
3
+ **Date:** November 27, 2025
4
+ **Priority:** Execute in order
5
+
6
+ ---
7
+
8
+ ## Before Starting Implementation
9
+
10
+ ### 1. Close PR #41 (CRITICAL)
11
+
12
+ ```bash
13
+ gh pr close 41 --comment "Architecture decision changed. Cherry-picking improvements to preserve both pydantic-ai and Agent Framework capabilities."
14
+ ```
15
+
16
+ ### 2. Verify HuggingFace Spaces is Safe
17
+
18
+ ```bash
19
+ # Should show agent framework files exist
20
+ git ls-tree --name-only huggingface-upstream/dev -- src/agents/
21
+ git ls-tree --name-only huggingface-upstream/dev -- src/orchestrator_magentic.py
22
+ ```
23
+
24
+ Expected output: Files should exist (they do as of this writing).
25
+
26
+ ### 3. Clean Local Environment
27
+
28
+ ```bash
29
+ # Switch to main first
30
+ git checkout main
31
+
32
+ # Delete problematic branches
33
+ git branch -D refactor/pydantic-unification 2>/dev/null || true
34
+ git branch -D feat/pubmed-fulltext 2>/dev/null || true
35
+
36
+ # Reset local dev to origin/dev
37
+ git branch -D dev 2>/dev/null || true
38
+ git checkout -b dev origin/dev
39
+
40
+ # Verify agent framework code exists
41
+ ls src/agents/
42
+ # Expected: __init__.py, analysis_agent.py, hypothesis_agent.py, judge_agent.py,
43
+ # magentic_agents.py, report_agent.py, search_agent.py, state.py, tools.py
44
+
45
+ ls src/orchestrator_magentic.py
46
+ # Expected: file exists
47
+ ```
48
+
49
+ ### 4. Create Fresh Feature Branch
50
+
51
+ ```bash
52
+ git checkout -b feat/dual-mode-architecture origin/dev
53
+ ```
54
+
55
+ ---
56
+
57
+ ## Decision Points
58
+
59
+ Before proceeding, confirm:
60
+
61
+ 1. **For hackathon**: Do we need advanced mode, or is simple mode sufficient?
62
+ - Simple mode = faster to implement, works today
63
+ - Advanced mode = better quality, more work
64
+
65
+ 2. **Timeline**: How much time do we have?
66
+ - If < 1 day: Focus on simple mode only
67
+ - If > 1 day: Implement dual-mode
68
+
69
+ 3. **Dependencies**: Is `agent-framework-core` available?
70
+ - Check: `pip index versions agent-framework-core`
71
+ - If not on PyPI, may need to install from GitHub
72
+
73
+ ---
74
+
75
+ ## Quick Start (Simple Mode Only)
76
+
77
+ If time is limited, implement only simple mode improvements:
78
+
79
+ ```bash
80
+ # On feat/dual-mode-architecture branch
81
+
82
+ # 1. Update judges.py to add HuggingFace support
83
+ # 2. Update config.py to add HF settings
84
+ # 3. Create free_tier_demo.py
85
+ # 4. Run make check
86
+ # 5. Create PR to dev
87
+ ```
88
+
89
+ This gives you free-tier capability without touching agent framework code.
90
+
91
+ ---
92
+
93
+ ## Quick Start (Full Dual-Mode)
94
+
95
+ If time permits, implement full dual-mode:
96
+
97
+ Follow phases 1-6 in `02_IMPLEMENTATION_PHASES.md`
98
+
99
+ ---
100
+
101
+ ## Emergency Rollback
102
+
103
+ If anything goes wrong:
104
+
105
+ ```bash
106
+ # Reset to safe state
107
+ git checkout main
108
+ git branch -D feat/dual-mode-architecture
109
+ git checkout -b feat/dual-mode-architecture origin/dev
110
+ ```
111
+
112
+ `origin/dev` is the safe fallback - it has the agent framework code intact.
docs/brainstorming/magentic-pydantic/04_FOLLOWUP_REVIEW_REQUEST.md ADDED
@@ -0,0 +1,158 @@
1
+ # Follow-Up Review Request: Did We Implement Your Feedback?
2
+
3
+ **Date:** November 27, 2025
4
+ **Context:** You previously reviewed our dual-mode architecture plan and provided feedback. We have updated the documentation. Please verify we correctly implemented your recommendations.
5
+
6
+ ---
7
+
8
+ ## Your Original Feedback vs Our Changes
9
+
10
+ ### 1. Naming Confusion Clarification
11
+
12
+ **Your feedback:** "You are using Microsoft Agent Framework, but you've named your integration 'Magentic'. This caused the confusion."
13
+
14
+ **Our change:** Added Section 4 in `00_SITUATION_AND_PLAN.md`:
15
+ ```markdown
16
+ ## 4. CRITICAL: Naming Confusion Clarification
17
+
18
+ > **Senior Agent Review Finding:** The codebase uses "magentic" in file names
19
+ > (e.g., `orchestrator_magentic.py`, `magentic_agents.py`) but this is **NOT**
20
+ > the `magentic` PyPI package by Jacky Liang. It's Microsoft Agent Framework.
21
+
22
+ **The naming confusion:**
23
+ - `magentic` (PyPI package): A different library for structured LLM outputs
24
+ - "Magentic" (in our codebase): Our internal name for Microsoft Agent Framework integration
25
+ - `agent-framework-core` (PyPI package): Microsoft's actual multi-agent orchestration framework
26
+
27
+ **Recommended future action:** Rename `orchestrator_magentic.py` → `orchestrator_advanced.py`
28
+ ```
29
+
30
+ **Status:** ✅ IMPLEMENTED
31
+
32
+ ---
33
+
34
+ ### 2. Bridge Complexity Warning
35
+
36
+ **Your feedback:** "You must ensure MagenticState (context vars) propagates correctly through the pydantic-ai call stack."
37
+
38
+ **Our change:** Added Section 10.1 in `01_ARCHITECTURE_SPEC.md`:
39
+ ```markdown
40
+ ### 10.1 Bridge Complexity (MEDIUM)
41
+
42
+ **Risk:** In Advanced Mode, agents (Agent Framework) wrap handlers (pydantic-ai).
43
+ Both are async. Context variables (`MagenticState`) must propagate correctly.
44
+
45
+ **Mitigation:**
46
+ - pydantic-ai uses standard Python `contextvars`, which naturally propagate through `await` chains
47
+ - Test context propagation explicitly in integration tests
48
+ - If issues arise, pass state explicitly rather than via context vars
49
+ ```
50
+
51
+ **Status:** ✅ IMPLEMENTED
52
+
53
+ ---
54
+
55
+ ### 3. Integration Drift Warning
56
+
57
+ **Your feedback:** "Simple Mode and Advanced Mode might diverge in behavior."
58
+
59
+ **Our change:** Added Section 10.2 in `01_ARCHITECTURE_SPEC.md`:
60
+ ```markdown
61
+ ### 10.2 Integration Drift (MEDIUM)
62
+
63
+ **Risk:** Simple Mode and Advanced Mode might diverge in behavior over time.
64
+
65
+ **Mitigation:**
66
+ - Both modes MUST call the exact same underlying Tools (`src/tools/*`) and Handlers (`src/agent_factory/*`)
67
+ - Handlers are the single source of truth for business logic
68
+ - Agents are thin wrappers that delegate to handlers
69
+ ```
70
+
71
+ **Status:** ✅ IMPLEMENTED
72
+
73
+ ---
74
+
75
+ ### 4. Testing Burden Warning
76
+
77
+ **Your feedback:** "You now have two distinct orchestrators to maintain. This doubles your integration testing surface area."
78
+
79
+ **Our change:** Added Section 10.3 in `01_ARCHITECTURE_SPEC.md`:
80
+ ```markdown
81
+ ### 10.3 Testing Burden (LOW-MEDIUM)
82
+
83
+ **Risk:** Two distinct orchestrators doubles integration testing surface area.
84
+
85
+ **Mitigation:**
86
+ - Unit test handlers independently (shared code)
87
+ - Integration tests for each mode separately
88
+ - End-to-end tests verify same output for same input
89
+ ```
90
+
91
+ **Status:** ✅ IMPLEMENTED
92
+
93
+ ---
94
+
95
+ ### 5. Rename Recommendation
96
+
97
+ **Your feedback:** "Rename `src/orchestrator_magentic.py` to `src/orchestrator_advanced.py`"
98
+
99
+ **Our change:** Added Step 3.4 in `02_IMPLEMENTATION_PHASES.md`:
100
+ ```markdown
101
+ ### Step 3.4: (OPTIONAL) Rename "Magentic" to "Advanced"
102
+
103
+ > **Senior Agent Recommendation:** Rename files to eliminate confusion.
104
+
105
+ git mv src/orchestrator_magentic.py src/orchestrator_advanced.py
106
+ git mv src/agents/magentic_agents.py src/agents/advanced_agents.py
107
+
108
+ **Note:** This is optional for the hackathon. Can be done in a follow-up PR.
109
+ ```
110
+
111
+ **Status:** ✅ DOCUMENTED (marked as optional for hackathon)
112
+
113
+ ---
114
+
115
+ ### 6. Standardize Wrapper Recommendation
116
+
117
+ **Your feedback:** "Create a generic `PydanticAiAgentWrapper(BaseAgent)` class instead of manually wrapping each handler."
118
+
119
+ **Our change:** NOT YET DOCUMENTED
120
+
121
+ **Status:** ⚠️ NOT IMPLEMENTED - Should we add this?
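+
+ To make the question concrete for your review, this is the rough shape we understand you to mean (entirely hypothetical, not in the codebase; `extract_text` is an assumed helper, mirroring `extract_question` in the JudgeAgent example):
+
+ ```python
+ # Hypothetical generic wrapper: not in the codebase yet.
+ from agent_framework import AgentRunResponse, BaseAgent, ChatMessage, Role
+
+ class PydanticAiAgentWrapper(BaseAgent):
+     """Wrap any pydantic-ai powered handler behind the BaseAgent interface."""
+
+     def __init__(self, name: str, description: str, handler) -> None:
+         super().__init__(name=name, description=description)
+         self._handler = handler
+
+     async def run(self, messages, **kwargs) -> AgentRunResponse:
+         result = await self._handler.run(extract_text(messages))  # extract_text: assumed helper
+         return AgentRunResponse(
+             messages=[ChatMessage(role=Role.ASSISTANT, text=str(result))],
+         )
+ ```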
122
+
123
+ ---
124
+
125
+ ## Questions for Your Review
126
+
127
+ 1. **Did we correctly implement your feedback?** Are there any misunderstandings in how we interpreted your recommendations?
128
+
129
+ 2. **Is the "Standardize Wrapper" recommendation critical?** Should we add it to the implementation phases, or is it a nice-to-have for later?
130
+
131
+ 3. **Dependency versioning:** You noted `agent-framework-core>=1.0.0b251120` might be ephemeral. Should we:
132
+ - Pin to a specific version?
133
+ - Use a version range?
134
+ - Install from GitHub source?
135
+
136
+ 4. **Anything else we missed?**
137
+
138
+ ---
139
+
140
+ ## Files to Re-Review
141
+
142
+ 1. `00_SITUATION_AND_PLAN.md` - Added Section 4 (Naming Clarification)
143
+ 2. `01_ARCHITECTURE_SPEC.md` - Added Sections 10-11 (Risks, Naming)
144
+ 3. `02_IMPLEMENTATION_PHASES.md` - Added Step 3.4 (Optional Rename)
145
+
146
+ ---
147
+
148
+ ## Current Branch State
149
+
150
+ We are now on `feat/dual-mode-architecture` branched from `origin/dev`:
151
+ - ✅ Agent framework code intact (`src/agents/`, `src/orchestrator_magentic.py`)
152
+ - ✅ Documentation committed
153
+ - ❌ PR #41 still open (need to close it)
154
+ - ❌ Cherry-pick of pydantic-ai improvements not yet done
155
+
156
+ ---
157
+
158
+ Please confirm: **GO / NO-GO** to proceed with Phase 1 (cherry-picking pydantic-ai improvements)?
docs/brainstorming/magentic-pydantic/REVIEW_PROMPT_FOR_SENIOR_AGENT.md ADDED
@@ -0,0 +1,113 @@
1
+ # Senior Agent Review Prompt
2
+
3
+ Copy and paste everything below this line to a fresh Claude/AI session:
4
+
5
+ ---
6
+
7
+ ## Context
8
+
9
+ I am a junior developer working on a HuggingFace hackathon project called DeepCritical. We made a significant architectural mistake and are now trying to course-correct. I need you to act as a **senior staff engineer** and critically review our proposed solution.
10
+
11
+ ## The Situation
12
+
13
+ We almost merged a refactor that would have **deleted** our multi-agent orchestration capability, mistakenly believing that `pydantic-ai` (a library for structured LLM outputs) and Microsoft's `agent-framework` (a framework for multi-agent orchestration) were mutually exclusive alternatives.
14
+
15
+ **They are not.** They are complementary:
16
+ - `pydantic-ai` ensures LLM responses match Pydantic schemas (type-safe outputs)
17
+ - `agent-framework` orchestrates multiple agents working together (coordination layer)
18
+
19
+ We now want to implement a **dual-mode architecture** where:
20
+ - **Simple Mode (No API key):** Uses only pydantic-ai with HuggingFace free tier
21
+ - **Advanced Mode (With API key):** Uses Microsoft Agent Framework for orchestration, with pydantic-ai inside each agent for structured outputs
22
+
23
+ ## Your Task
24
+
25
+ Please perform a **deep, critical review** of:
26
+
27
+ 1. **The architecture diagram** (image attached: `assets/magentic-pydantic.png`)
28
+ 2. **Our documentation** (4 files listed below)
29
+ 3. **The actual codebase** to verify our claims
30
+
31
+ ## Specific Questions to Answer
32
+
33
+ ### Architecture Validation
34
+ 1. Is our understanding correct that pydantic-ai and agent-framework are complementary, not competing?
35
+ 2. Does the dual-mode architecture diagram accurately represent how these should integrate?
36
+ 3. Are there any architectural flaws or anti-patterns in our proposed design?
37
+
38
+ ### Documentation Accuracy
39
+ 4. Are the branch states we documented accurate? (Check `git log`, `git ls-tree`)
40
+ 5. Is our understanding of what code exists where correct?
41
+ 6. Are the implementation phases realistic and in the correct order?
42
+ 7. Are there any missing steps or dependencies we overlooked?
43
+
44
+ ### Codebase Reality Check
45
+ 8. Does `origin/dev` actually have the agent framework code intact? Verify by checking:
46
+ - `git ls-tree origin/dev -- src/agents/`
47
+ - `git ls-tree origin/dev -- src/orchestrator_magentic.py`
48
+ 9. What does the current `src/agents/` code actually import? Does it use `agent_framework` or `agent-framework-core`?
49
+ 10. Is the `agent-framework-core` package actually available on PyPI, or do we need to install from source?
50
+
51
+ ### Implementation Feasibility
52
+ 11. Can the cherry-pick strategy we outlined actually work, or are there merge conflicts we're not seeing?
53
+ 12. Is the mode auto-detection logic sound?
54
+ 13. What are the risks we haven't identified?
55
+
56
+ ### Critical Errors Check
57
+ 14. Did we miss anything critical in our analysis?
58
+ 15. Are there any factual errors in our documentation?
59
+ 16. Would a Google/DeepMind senior engineer approve this plan, or would they flag issues?
60
+
61
+ ## Files to Review
62
+
63
+ Please read these files in order:
64
+
65
+ 1. `/Users/ray/Desktop/CLARITY-DIGITAL-TWIN/DeepCritical-1/docs/brainstorming/magentic-pydantic/00_SITUATION_AND_PLAN.md`
66
+ 2. `/Users/ray/Desktop/CLARITY-DIGITAL-TWIN/DeepCritical-1/docs/brainstorming/magentic-pydantic/01_ARCHITECTURE_SPEC.md`
67
+ 3. `/Users/ray/Desktop/CLARITY-DIGITAL-TWIN/DeepCritical-1/docs/brainstorming/magentic-pydantic/02_IMPLEMENTATION_PHASES.md`
68
+ 4. `/Users/ray/Desktop/CLARITY-DIGITAL-TWIN/DeepCritical-1/docs/brainstorming/magentic-pydantic/03_IMMEDIATE_ACTIONS.md`
69
+
70
+ And the architecture diagram:
71
+ 5. `/Users/ray/Desktop/CLARITY-DIGITAL-TWIN/DeepCritical-1/assets/magentic-pydantic.png`
72
+
73
+ ## Reference Repositories to Consult
74
+
75
+ We have local clones of the source-of-truth repositories:
76
+
77
+ - **Original DeepCritical:** `/Users/ray/Desktop/CLARITY-DIGITAL-TWIN/DeepCritical-1/reference_repos/DeepCritical/`
78
+ - **Microsoft Agent Framework:** `/Users/ray/Desktop/CLARITY-DIGITAL-TWIN/DeepCritical-1/reference_repos/agent-framework/`
79
+ - **Microsoft AutoGen:** `/Users/ray/Desktop/CLARITY-DIGITAL-TWIN/DeepCritical-1/reference_repos/autogen-microsoft/`
80
+
81
+ Please cross-reference our hackathon fork against these to verify architectural alignment.
82
+
83
+ ## Codebase to Analyze
84
+
85
+ Our hackathon fork is at:
86
+ `/Users/ray/Desktop/CLARITY-DIGITAL-TWIN/DeepCritical-1/`
87
+
88
+ Key files to examine:
89
+ - `src/agents/` - Agent framework integration
90
+ - `src/agent_factory/judges.py` - pydantic-ai integration
91
+ - `src/orchestrator.py` - Simple mode orchestrator
92
+ - `src/orchestrator_magentic.py` - Advanced mode orchestrator
93
+ - `src/orchestrator_factory.py` - Mode selection
94
+ - `pyproject.toml` - Dependencies
95
+
96
+ ## Expected Output
97
+
98
+ Please provide:
99
+
100
+ 1. **Validation Summary:** Is our plan sound? (YES/NO with explanation)
101
+ 2. **Errors Found:** List any factual errors in our documentation
102
+ 3. **Missing Items:** What did we overlook?
103
+ 4. **Risk Assessment:** What could go wrong?
104
+ 5. **Recommended Changes:** Specific edits to our documentation or plan
105
+ 6. **Go/No-Go Recommendation:** Should we proceed with this plan?
106
+
107
+ ## Tone
108
+
109
+ Be brutally honest. If our plan is flawed, say so directly. We would rather know now than after implementation. Don't soften criticism - we need accuracy.
110
+
111
+ ---
112
+
113
+ END OF PROMPT
docs/bugs/FIX_PLAN_MAGENTIC_MODE.md ADDED
@@ -0,0 +1,227 @@
1
+ # Fix Plan: Magentic Mode Report Generation
2
+
3
+ **Related Bug**: `P0_MAGENTIC_MODE_BROKEN.md`
4
+ **Approach**: Test-Driven Development (TDD)
5
+ **Estimated Scope**: 4 phases, ~2-3 hours
6
+
7
+ ---
8
+
9
+ ## Problem Summary
10
+
11
+ Magentic mode runs but fails to produce readable reports due to:
12
+
13
+ 1. **Primary Bug**: `MagenticFinalResultEvent.message` returns `ChatMessage` object, not text
14
+ 2. **Secondary Bug**: Max rounds (3) reached before ReportAgent completes
15
+ 3. **Tertiary Issues**: Stale "bioRxiv" references in prompts
16
+
17
+ ---
18
+
19
+ ## Fix Order (TDD)
20
+
21
+ ### Phase 1: Write Failing Tests
22
+
23
+ **Task 1.1**: Create test for ChatMessage text extraction
24
+
25
+ ```python
26
+ # tests/unit/test_orchestrator_magentic.py
27
+
28
+ def test_process_event_extracts_text_from_chat_message():
29
+ """Final result event should extract text from ChatMessage object."""
30
+ # Arrange: Mock ChatMessage with .content attribute
31
+ # Act: Call _process_event with MagenticFinalResultEvent
32
+ # Assert: Returned AgentEvent.message is a string, not object repr
33
+ ```
34
+
35
+ **Task 1.2**: Create test for max rounds configuration
36
+
37
+ ```python
38
+ def test_orchestrator_uses_configured_max_rounds():
39
+ """MagenticOrchestrator should use max_rounds from constructor."""
40
+ # Arrange: Create orchestrator with max_rounds=10
41
+ # Act: Build workflow
42
+ # Assert: Workflow has max_round_count=10
43
+ ```
44
+
45
+ **Task 1.3**: Create test for bioRxiv reference removal
46
+
47
+ ```python
48
+ def test_task_prompt_references_europe_pmc():
49
+ """Task prompt should reference Europe PMC, not bioRxiv."""
50
+ # Arrange: Create orchestrator
51
+ # Act: Check task string in run()
52
+ # Assert: Contains "Europe PMC", not "bioRxiv"
53
+ ```
54
+
55
+ ---
56
+
57
+ ### Phase 2: Fix ChatMessage Text Extraction
58
+
59
+ **File**: `src/orchestrator_magentic.py`
60
+ **Lines**: 192-199
61
+
62
+ **Current Code**:
63
+ ```python
64
+ elif isinstance(event, MagenticFinalResultEvent):
65
+ text = event.message.text if event.message else "No result"
66
+ ```
67
+
68
+ **Fixed Code**:
69
+ ```python
70
+ elif isinstance(event, MagenticFinalResultEvent):
71
+ if event.message:
72
+ # ChatMessage may have .content or .text depending on version
73
+ if hasattr(event.message, 'content') and event.message.content:
74
+ text = str(event.message.content)
75
+ elif hasattr(event.message, 'text') and event.message.text:
76
+ text = str(event.message.text)
77
+ else:
78
+ # Fallback: convert entire message to string
79
+ text = str(event.message)
80
+ else:
81
+ text = "No result generated"
82
+ ```
83
+
84
+ **Why**: The `agent_framework.ChatMessage` object structure may vary. We need defensive extraction.
85
+
86
+ ---
87
+
88
+ ### Phase 3: Fix Max Rounds Configuration
89
+
90
+ **File**: `src/orchestrator_magentic.py`
91
+ **Lines**: 97-99
92
+
93
+ **Current Code**:
94
+ ```python
95
+ .with_standard_manager(
96
+ chat_client=manager_client,
97
+ max_round_count=self._max_rounds, # Already uses config
98
+ max_stall_count=3,
99
+ max_reset_count=2,
100
+ )
101
+ ```
102
+
103
+ **Issue**: Default `max_rounds` in `__init__` is 10, but workflow may need more for complex queries.
104
+
105
+ **Fix**: Verify the value flows through correctly. Add logging.
106
+
107
+ ```python
108
+ logger.info(
109
+ "Building Magentic workflow",
110
+ max_rounds=self._max_rounds,
111
+ max_stall=3,
112
+ max_reset=2,
113
+ )
114
+ ```
115
+
116
+ **Also check**: `src/orchestrator_factory.py` passes config correctly:
117
+ ```python
118
+ return MagenticOrchestrator(
119
+ max_rounds=config.max_iterations if config else 10,
120
+ )
121
+ ```
122
+
123
+ ---
124
+
125
+ ### Phase 4: Fix Stale bioRxiv References
126
+
127
+ **Files to update**:
128
+
129
+ | File | Line | Change |
130
+ |------|------|--------|
131
+ | `src/orchestrator_magentic.py` | 131 | "bioRxiv" → "Europe PMC" |
132
+ | `src/agents/magentic_agents.py` | 32-33 | "bioRxiv" → "Europe PMC" |
133
+ | `src/app.py` | 202-203 | "bioRxiv" → "Europe PMC" |
134
+
135
+ **Search command to verify**:
136
+ ```bash
137
+ grep -rn "bioRxiv\|biorxiv" src/
138
+ ```
139
+
140
+ ---
141
+
142
+ ## Implementation Checklist
143
+
144
+ ```
145
+ [ ] Phase 1: Write failing tests
146
+ [ ] 1.1 Test ChatMessage text extraction
147
+ [ ] 1.2 Test max rounds configuration
148
+ [ ] 1.3 Test Europe PMC references
149
+
150
+ [ ] Phase 2: Fix ChatMessage extraction
151
+ [ ] Update _process_event() in orchestrator_magentic.py
152
+ [ ] Run test 1.1 - should pass
153
+
154
+ [ ] Phase 3: Fix max rounds
155
+ [ ] Add logging to _build_workflow()
156
+ [ ] Verify factory passes config correctly
157
+ [ ] Run test 1.2 - should pass
158
+
159
+ [ ] Phase 4: Fix bioRxiv references
160
+ [ ] Update orchestrator_magentic.py task prompt
161
+ [ ] Update magentic_agents.py descriptions
162
+ [ ] Update app.py UI text
163
+ [ ] Run test 1.3 - should pass
164
+ [ ] Run grep to verify no remaining refs
165
+
166
+ [ ] Final Verification
167
+ [ ] make check passes
168
+ [ ] All tests pass (108+)
169
+ [ ] Manual test: run_magentic.py produces readable report
170
+ ```
171
+
172
+ ---
173
+
174
+ ## Test Commands
175
+
176
+ ```bash
177
+ # Run specific test file
178
+ uv run pytest tests/unit/test_orchestrator_magentic.py -v
179
+
180
+ # Run all tests
181
+ uv run pytest tests/unit/ -v
182
+
183
+ # Full check
184
+ make check
185
+
186
+ # Manual integration test
187
+ set -a && source .env && set +a
188
+ uv run python examples/orchestrator_demo/run_magentic.py "metformin alzheimer"
189
+ ```
190
+
191
+ ---
192
+
193
+ ## Success Criteria
194
+
195
+ 1. `run_magentic.py` outputs a readable research report (not `<ChatMessage object>`)
196
+ 2. Report includes: Executive Summary, Key Findings, Drug Candidates, References
197
+ 3. No "Max round count reached" error with default settings
198
+ 4. No "bioRxiv" references anywhere in codebase
199
+ 5. All 108+ tests pass
200
+ 6. `make check` passes
201
+
202
+ ---
203
+
204
+ ## Files Modified
205
+
206
+ ```
207
+ src/
208
+ ├── orchestrator_magentic.py # ChatMessage fix, logging
209
+ ├── agents/magentic_agents.py # bioRxiv → Europe PMC
210
+ └── app.py # bioRxiv → Europe PMC
211
+
212
+ tests/unit/
213
+ └── test_orchestrator_magentic.py # NEW: 3 tests
214
+ ```
215
+
216
+ ---
217
+
218
+ ## Notes for AI Agent
219
+
220
+ When implementing this fix plan:
221
+
222
+ 1. **DO NOT** create mock data or fake responses
223
+ 2. **DO** write real tests that verify actual behavior
224
+ 3. **DO** run `make check` after each phase
225
+ 4. **DO** test with real OpenAI API key via `.env`
226
+ 5. **DO** preserve existing functionality - simple mode must still work
227
+ 6. **DO NOT** over-engineer - minimal changes to fix the specific bugs
docs/bugs/P0_ACTIONABLE_FIXES.md DELETED
@@ -1,281 +0,0 @@
1
- # P0 Actionable Fixes - What to Do
2
-
3
- **Date:** November 27, 2025
4
- **Status:** ACTIONABLE
5
-
6
- ---
7
-
8
- ## Summary: What's Broken and What's Fixable
9
-
10
- | Tool | Problem | Fixable? | How |
11
- |------|---------|----------|-----|
12
- | BioRxiv | API has NO search endpoint | **NO** | Replace with Europe PMC |
13
- | PubMed | No query preprocessing | **YES** | Add query cleaner |
14
- | ClinicalTrials | No filters applied | **YES** | Add filter params |
15
- | Magentic Framework | Nothing wrong | N/A | Already working |
16
-
17
- ---
18
-
19
- ## FIX 1: Replace BioRxiv with Europe PMC (30 min)
20
-
21
- ### Why BioRxiv Can't Be Fixed
22
-
23
- The bioRxiv API only has this endpoint:
24
- ```
25
- https://api.biorxiv.org/details/{server}/{date-range}/{cursor}/json
26
- ```
27
-
28
- This returns papers **by date**, not by keyword. There is NO search endpoint.
29
-
30
- **Proof:** I queried `medrxiv/2024-01-01/2024-01-02` and got:
31
- - "Global risk of Plasmodium falciparum" (malaria)
32
- - "Multiple Endocrine Neoplasia in India"
33
- - "Acupuncture for Acute Musculoskeletal Pain"
34
-
35
- **None of these are about Long COVID** because the API doesn't search.
36
-
37
- ### Europe PMC Has Search + Preprints
38
-
39
- ```bash
40
- curl "https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=long+covid+treatment&resultType=core&pageSize=3&format=json"
41
- ```
42
-
43
- Returns 283,058 results including:
44
- - "Long COVID Treatment No Silver Bullets, Only a Few Bronze BBs" ✅
45
-
46
- ### The Fix
47
-
48
- Replace `src/tools/biorxiv.py` with `src/tools/europepmc.py`:
49
-
50
- ```python
51
- """Europe PMC preprint and paper search tool."""
52
-
53
- import httpx
54
- from src.utils.models import Citation, Evidence
55
-
56
- class EuropePMCTool:
57
- """Search Europe PMC for papers and preprints."""
58
-
59
- BASE_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
60
-
61
- @property
62
- def name(self) -> str:
63
- return "europepmc"
64
-
65
- async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
66
- """Search Europe PMC (includes preprints from bioRxiv/medRxiv)."""
67
- params = {
68
- "query": query,
69
- "resultType": "core",
70
- "pageSize": max_results,
71
- "format": "json",
72
- }
73
-
74
- async with httpx.AsyncClient(timeout=30.0) as client:
75
- response = await client.get(self.BASE_URL, params=params)
76
- response.raise_for_status()
77
-
78
- data = response.json()
79
- results = data.get("resultList", {}).get("result", [])
80
-
81
- return [self._to_evidence(r) for r in results]
82
-
83
- def _to_evidence(self, result: dict) -> Evidence:
84
- """Convert Europe PMC result to Evidence."""
85
- title = result.get("title", "Untitled")
86
- abstract = result.get("abstractText", "No abstract")
87
- doi = result.get("doi", "")
88
- pub_year = result.get("pubYear", "Unknown")
89
- source = result.get("source", "europepmc")
90
-
91
- # Mark preprints
92
- pub_type = result.get("pubTypeList", {}).get("pubType", [])
93
- is_preprint = "Preprint" in pub_type
94
-
95
- content = f"{'[PREPRINT] ' if is_preprint else ''}{abstract[:1800]}"
96
-
97
- return Evidence(
98
- content=content,
99
- citation=Citation(
100
- source="europepmc" if not is_preprint else "preprint",
101
- title=title[:500],
102
- url=f"https://doi.org/{doi}" if doi else "",
103
- date=str(pub_year),
104
- ),
105
- relevance=0.75 if is_preprint else 0.9,
106
- )
107
- ```
108
-
109
- ---
110
-
111
- ## FIX 2: Add PubMed Query Preprocessing (1 hour)
112
-
113
- ### Current Problem
114
-
115
- User enters: `What medications show promise for Long COVID?`
116
- PubMed receives: `What medications show promise for Long COVID?`
117
-
118
- The question words pollute the search.
119
-
120
- ### The Fix
121
-
122
- Add `src/tools/query_utils.py`:
123
-
124
- ```python
125
- """Query preprocessing utilities."""
126
-
127
- import re
128
-
129
- # Question words to remove
130
- QUESTION_WORDS = {
131
- "what", "which", "how", "why", "when", "where", "who",
132
- "is", "are", "can", "could", "would", "should", "do", "does",
133
- "show", "promise", "help", "treat", "cure",
134
- }
135
-
136
- # Medical synonyms to expand
137
- SYNONYMS = {
138
- "long covid": ["long COVID", "PASC", "post-COVID syndrome", "post-acute sequelae"],
139
- "alzheimer": ["Alzheimer's disease", "AD", "Alzheimer dementia"],
140
- "cancer": ["neoplasm", "tumor", "malignancy", "carcinoma"],
141
- }
142
-
143
- def preprocess_pubmed_query(raw_query: str) -> str:
144
- """Convert natural language to cleaner PubMed query."""
145
- # Lowercase
146
- query = raw_query.lower()
147
-
148
- # Remove question marks
149
- query = query.replace("?", "")
150
-
151
- # Remove question words
152
- words = query.split()
153
- words = [w for w in words if w not in QUESTION_WORDS]
154
- query = " ".join(words)
155
-
156
- # Expand synonyms
157
- for term, expansions in SYNONYMS.items():
158
- if term in query:
159
- # Add OR clause
160
- expansion = " OR ".join([f'"{e}"' for e in expansions])
161
- query = query.replace(term, f"({expansion})")
162
-
163
- return query.strip()
164
- ```
165
-
166
- Then update `src/tools/pubmed.py`:
167
-
168
- ```python
169
- from src.tools.query_utils import preprocess_pubmed_query
170
-
171
- async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
172
- # Preprocess query
173
- clean_query = preprocess_pubmed_query(query)
174
-
175
- search_params = self._build_params(
176
- db="pubmed",
177
- term=clean_query, # Use cleaned query
178
- retmax=max_results,
179
- sort="relevance",
180
- )
181
- # ... rest unchanged
182
- ```
183
-
184
- ---
185
-
186
- ## FIX 3: Add ClinicalTrials.gov Filters (30 min)
187
-
188
- ### Current Problem
189
-
190
- Returns ALL trials including withdrawn, terminated, observational studies.
191
-
192
- ### The Fix
193
-
194
- The API supports `filter.overallStatus` and other filters. Update `src/tools/clinicaltrials.py`:
195
-
196
- ```python
197
- async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
198
- params: dict[str, str | int] = {
199
- "query.term": query,
200
- "pageSize": min(max_results, 100),
201
- "fields": "|".join(self.FIELDS),
202
- # ADD THESE FILTERS:
203
- "filter.overallStatus": "COMPLETED|RECRUITING|ACTIVE_NOT_RECRUITING",
204
- # Only interventional studies (not observational)
205
- "aggFilters": "studyType:int",
206
- }
207
- # ... rest unchanged
208
- ```
209
-
210
- **Note:** I tested the API - it supports filtering but with slightly different syntax. Check the [API docs](https://clinicaltrials.gov/data-api/api).
211
-
212
- ---
213
-
214
- ## What NOT to Change
215
-
216
- ### Microsoft Agent Framework - WORKING
217
-
218
- I verified:
219
- ```python
220
- from agent_framework import MagenticBuilder, ChatAgent
221
- from agent_framework.openai import OpenAIChatClient
222
- # All imports OK
223
-
224
- orchestrator = MagenticOrchestrator(max_rounds=2)
225
- workflow = orchestrator._build_workflow()
226
- # Workflow built successfully
227
- ```
228
-
229
- The Magentic agents are correctly wired:
230
- - SearchAgent → GPT-5.1 ✅
231
- - JudgeAgent → GPT-5.1 ✅
232
- - HypothesisAgent → GPT-5.1 ✅
233
- - ReportAgent → GPT-5.1 ✅
234
-
235
- **The framework is fine. The tools it calls are broken.**
236
-
237
- ---
238
-
239
- ## Priority Order
240
-
241
- 1. **Replace BioRxiv** → Immediate, fundamental
242
- 2. **Add PubMed preprocessing** → High impact, easy
243
- 3. **Add ClinicalTrials filters** → Medium impact, easy
244
-
245
- ---
246
-
247
- ## Test After Fixes
248
-
249
- ```bash
250
- # Test Europe PMC
251
- uv run python -c "
252
- import asyncio
253
- from src.tools.europepmc import EuropePMCTool
254
- tool = EuropePMCTool()
255
- results = asyncio.run(tool.search('long covid treatment', 3))
256
- for r in results:
257
- print(r.citation.title)
258
- "
259
-
260
- # Test PubMed with preprocessing
261
- uv run python -c "
262
- from src.tools.query_utils import preprocess_pubmed_query
263
- q = 'What medications show promise for Long COVID?'
264
- print(preprocess_pubmed_query(q))
265
- # Should output: (\"long COVID\" OR \"PASC\" OR \"post-COVID syndrome\") medications
266
- "
267
- ```
268
-
269
- ---
270
-
271
- ## After These Fixes
272
-
273
- The Magentic workflow will:
274
- 1. SearchAgent calls `search_pubmed("long COVID treatment")` → Gets RELEVANT papers
275
- 2. SearchAgent calls `search_preprints("long COVID treatment")` → Gets RELEVANT preprints via Europe PMC
276
- 3. SearchAgent calls `search_clinical_trials("long COVID")` → Gets INTERVENTIONAL trials only
277
- 4. JudgeAgent evaluates GOOD evidence
278
- 5. HypothesisAgent generates hypotheses from GOOD evidence
279
- 6. ReportAgent synthesizes GOOD report
280
-
281
- **The framework will work once we feed it good data.**
docs/bugs/P0_CRITICAL_BUGS.md DELETED
@@ -1,298 +0,0 @@
1
- # P0 CRITICAL BUGS - Why DeepCritical Produces Garbage Results
2
-
3
- **Date:** November 27, 2025
4
- **Status:** CRITICAL - App is functionally useless
5
- **Severity:** P0 (Blocker)
6
-
7
- ## TL;DR
8
-
9
- The app produces garbage because:
10
- 1. **BioRxiv search doesn't work** - returns random papers
11
- 2. **Free tier LLM is too dumb** - can't identify drugs
12
- 3. **Query construction is naive** - no optimization for PubMed/CT.gov syntax
13
- 4. **Loop terminates too early** - 5 iterations isn't enough
14
-
15
- ---
16
-
17
- ## P0-001: BioRxiv Search is Fundamentally Broken
18
-
19
- **File:** `src/tools/biorxiv.py:248-286`
20
-
21
- **The Problem:**
22
- The bioRxiv API **DOES NOT SUPPORT KEYWORD SEARCH**.
23
-
24
- The code does this:
25
- ```python
26
- # Fetch recent papers (last 90 days, first 100 papers)
27
- url = f"{self.BASE_URL}/{self.server}/{interval}/0/json"
28
- # Then filter client-side for keywords
29
- ```
30
-
31
- **What Actually Happens:**
32
- 1. Fetches the first 100 papers from medRxiv in the last 90 days (chronological order)
33
- 2. Filters those 100 random papers for query keywords
34
- 3. Returns whatever garbage matches
35
-
36
- **Result:** For "Long COVID medications", you get random papers like:
37
- - "Calf muscle structure-function adaptations"
38
- - "Work-Life Balance of Ophthalmologists During COVID"
39
-
40
- These papers contain "COVID" somewhere but have NOTHING to do with Long COVID treatments.
41
-
42
- **Root Cause:** The `/0/json` pagination only returns 100 papers. You'd need to paginate through ALL papers (thousands) to do proper keyword filtering.
43
-
44
- **Fix Options:**
45
- 1. **Remove BioRxiv entirely** - It's unusable without proper search API
46
- 2. **Use a different preprint aggregator** - Europe PMC has preprints WITH search
47
- 3. **Add pagination** - Fetch all papers (slow, expensive)
48
- 4. **Use Semantic Scholar API** - Has preprints and proper search
49
-
50
- ---
51
-
52
- ## P0-002: Free Tier LLM Cannot Perform Drug Identification
53
-
54
- **File:** `src/agent_factory/judges.py:153-211`
55
-
56
- **The Problem:**
57
- Without an API key, the app uses `HFInferenceJudgeHandler` with:
58
- - Llama 3.1 8B Instruct
59
- - Mistral 7B Instruct
60
-
61
- These are **7-8 billion parameter models**. They cannot:
62
- - Reliably parse complex biomedical abstracts
63
- - Identify drug candidates from scientific text
64
- - Generate structured JSON output consistently
65
- - Reason about mechanism of action
66
-
67
- **Evidence of Failure:**
68
- ```python
69
- # From MockJudgeHandler - the honest fallback when LLM fails
70
- drug_candidates=[
71
- "Drug identification requires AI analysis",
72
- "Enter API key above for full results",
73
- ]
74
- ```
75
-
76
- The team KNEW the free tier can't identify drugs and added this message.
77
-
78
- **Root Cause:** Drug repurposing requires understanding:
79
- - Drug mechanisms
80
- - Disease pathophysiology
81
- - Clinical trial phases
82
- - Statistical significance
83
-
84
- This requires GPT-4 / Claude Sonnet class models (100B+ parameters).
85
-
86
- **Fix Options:**
87
- 1. **Require API key** - No free tier, be honest
88
- 2. **Use larger HF models** - Llama 70B or Mixtral 8x7B (expensive on free tier)
89
- 3. **Hybrid approach** - Use free tier for search, require paid for synthesis
90
-
91
- ---
92
-
93
- ## P0-003: PubMed Query Not Optimized
94
-
95
- **File:** `src/tools/pubmed.py:54-71`
96
-
97
- **The Problem:**
98
- The query is passed directly to PubMed without optimization:
99
- ```python
100
- search_params = self._build_params(
101
- db="pubmed",
102
- term=query, # Raw user query!
103
- retmax=max_results,
104
- sort="relevance",
105
- )
106
- ```
107
-
108
- **What User Enters:** "What medications show promise for Long COVID?"
109
-
110
- **What PubMed Receives:** `What medications show promise for Long COVID?`
111
-
112
- **What PubMed Should Receive:**
113
- ```
114
- ("long covid"[Title/Abstract] OR "post-COVID"[Title/Abstract] OR "PASC"[Title/Abstract])
115
- AND (drug[Title/Abstract] OR treatment[Title/Abstract] OR medication[Title/Abstract] OR therapy[Title/Abstract])
116
- AND (clinical trial[Publication Type] OR randomized[Title/Abstract])
117
- ```
118
-
119
- **Root Cause:** No query preprocessing or medical term expansion.
120
-
121
- **Fix Options:**
122
- 1. **Add query preprocessor** - Extract medical entities, expand synonyms
123
- 2. **Use MeSH terms** - PubMed's controlled vocabulary for better recall
124
- 3. **LLM query generation** - Use LLM to generate optimized PubMed query
125
-
126
- ---
127
-
128
- ## P0-004: Loop Terminates Too Early
129
-
130
- **File:** `src/app.py:42-45` and `src/utils/models.py`
131
-
132
- **The Problem:**
133
- ```python
134
- config = OrchestratorConfig(
135
- max_iterations=5,
136
- max_results_per_tool=10,
137
- )
138
- ```
139
-
140
- 5 iterations is not enough to:
141
- 1. Search multiple variations of the query
142
- 2. Gather enough evidence for the Judge to synthesize
143
- 3. Refine queries based on initial results
144
-
145
- **Evidence:** The user's output shows "Max Iterations Reached" with only 6 sources.
146
-
147
- **Root Cause:** Conservative defaults to avoid API costs, but makes app useless.
148
-
149
- **Fix Options:**
150
- 1. **Increase default to 10-15** - More iterations = better results
151
- 2. **Dynamic termination** - Stop when confidence > threshold, not iteration count
152
- 3. **Parallel query expansion** - Run more queries per iteration
153
-
154
- ---
155
-
156
- ## P0-005: No Query Understanding Layer
157
-
158
- **Files:** `src/orchestrator.py`, `src/tools/search_handler.py`
159
-
160
- **The Problem:**
161
- There's no NLU (Natural Language Understanding) layer. The system:
162
- 1. Takes raw user query
163
- 2. Passes directly to search tools
164
- 3. No entity extraction
165
- 4. No intent classification
166
- 5. No query expansion
167
-
168
- For drug repurposing, you need to extract:
169
- - **Disease:** "Long COVID" → [Long COVID, PASC, Post-COVID syndrome, chronic COVID]
170
- - **Drug intent:** "medications" → [drugs, treatments, therapeutics, interventions]
171
- - **Evidence type:** "show promise" → [clinical trials, efficacy, RCT]
172
-
173
- **Root Cause:** No preprocessing pipeline between user input and search execution.
174
-
175
- **Fix Options:**
176
- 1. **Add entity extraction** - Use BioBERT or PubMedBERT for medical NER
177
- 2. **Add query expansion** - Use medical ontologies (UMLS, MeSH)
178
- 3. **LLM preprocessing** - Use LLM to generate search strategy before searching
179
-
180
- ---
181
-
182
- ## P0-006: ClinicalTrials.gov Results Not Filtered
183
-
184
- **File:** `src/tools/clinicaltrials.py`
185
-
186
- **The Problem:**
187
- ClinicalTrials.gov returns ALL matching trials including:
188
- - Withdrawn trials
189
- - Terminated trials
190
- - Not yet recruiting
191
- - Observational studies (not interventional)
192
-
193
- For drug repurposing, you want:
194
- - Interventional studies
195
- - Phase 2+ (has safety/efficacy data)
196
- - Completed or with results
197
-
198
- **Root Cause:** No filtering of trial metadata.
199
-
200
- ---
201
-
202
- ## Summary: Why This App Produces Garbage
203
-
204
- ```
205
- User Query: "What medications show promise for Long COVID?"
206
-
207
-
208
- ┌─────────────────────────────────────────────────────────────┐
209
- │ NO QUERY PREPROCESSING │
210
- │ - No entity extraction │
211
- │ - No synonym expansion │
212
- │ - No medical term normalization │
213
- └─────────────────────────────────────────────────────────────┘
214
-
215
-
216
- ┌─────────────────────────────────────────────────────────────┐
217
- │ BROKEN SEARCH LAYER │
218
- │ - PubMed: Raw query, no MeSH, gets 1 result │
219
- │ - BioRxiv: Returns random papers (API doesn't support search)│
220
- │ - ClinicalTrials: Returns all trials, no filtering │
221
- └─────────────────────────────────────────────────────────────┘
222
-
223
-
224
- ┌─────────────────────────────────────────────────────────────┐
225
- │ GARBAGE EVIDENCE │
226
- │ - 6 papers, most irrelevant │
227
- │ - "Calf muscle adaptations" (mentions COVID once) │
228
- │ - "Ophthalmologist work-life balance" │
229
- └─────────────────────────────────────────────────────────────┘
230
-
231
-
232
- ┌─────────────────────────────────────────────────────────────┐
233
- │ DUMB JUDGE (Free Tier) │
234
- │ - Llama 8B can't identify drugs from garbage │
235
- │ - JSON parsing fails │
236
- │ - Falls back to "Drug identification requires AI analysis" │
237
- └─────────────────────────────────────────────────────────────┘
238
-
239
-
240
- ┌─────────────────────────────────────────────────────────────┐
241
- │ LOOP HITS MAX (5 iterations) │
242
- │ - Never finds enough good evidence │
243
- │ - Never synthesizes anything useful │
244
- └─────────────────────────────────────────────────────────────┘
245
-
246
-
247
- GARBAGE OUTPUT
248
- ```
249
-
250
- ---
251
-
252
- ## What Would Make This Actually Work
253
-
254
- ### Minimum Viable Fix (1-2 days)
255
-
256
- 1. **Remove BioRxiv** - It doesn't work
257
- 2. **Require API key** - Be honest that free tier is useless
258
- 3. **Add basic query preprocessing** - Strip question words, expand COVID synonyms
259
- 4. **Increase iterations to 10**
260
-
261
- ### Proper Fix (1-2 weeks)
262
-
263
- 1. **Query Understanding Layer**
264
- - Medical NER (BioBERT/SciBERT)
265
- - Query expansion with MeSH/UMLS
266
- - Intent classification (drug discovery vs mechanism vs safety)
267
-
268
- 2. **Optimized Search**
269
- - PubMed: Proper query syntax with MeSH terms
270
- - ClinicalTrials: Filter by phase, status, intervention type
271
- - Replace BioRxiv with Europe PMC (has preprints + search)
272
-
273
- 3. **Evidence Ranking**
274
- - Score by publication type (RCT > cohort > case report)
275
- - Score by journal impact factor
276
- - Score by recency
277
- - Score by citation count
278
-
279
- 4. **Proper LLM Pipeline**
280
- - Use GPT-4 / Claude for synthesis
281
- - Structured extraction of: drug, mechanism, evidence level, effect size
282
- - Multi-step reasoning: identify → validate → rank → synthesize
283
-
284
- ---
285
-
286
- ## The Hard Truth
287
-
288
- Building a drug repurposing agent that works is HARD. The state of the art is:
289
-
290
- - **Drug2Disease (IBM)** - Uses knowledge graphs + ML
291
- - **COVID-KG (Stanford)** - Dedicated COVID knowledge graph
292
- - **Literature Mining at scale (PubMed)** - Millions of papers, not 10
293
-
294
- This hackathon project is fundamentally a **search wrapper with an LLM prompt**. That's not enough.
295
-
296
- To make it useful:
297
- 1. Either scope it down (e.g., "find clinical trials for X disease")
298
- 2. Or invest serious engineering in the NLU + search + ranking pipeline
docs/bugs/P0_MAGENTIC_AND_SEARCH_AUDIT.md DELETED
@@ -1,249 +0,0 @@
- # P0 Audit: Microsoft Agent Framework (Magentic) & Search Tools
-
- **Date:** November 27, 2025
- **Auditor:** Claude Code
- **Status:** VERIFIED
-
- ---
-
- ## TL;DR
-
- | Component | Status | Verdict |
- |-----------|--------|---------|
- | Microsoft Agent Framework | ✅ WORKING | Correctly wired, no bugs |
- | GPT-5.1 Model Config | ✅ CORRECT | Using `gpt-5.1` as configured |
- | Search Tools | ❌ BROKEN | Root cause of garbage results |
-
- **The orchestration framework is fine. The search layer is garbage.**
-
- ---
-
- ## Microsoft Agent Framework Verification
-
- ### Import Test: PASSED
- ```python
- from agent_framework import MagenticBuilder, ChatAgent
- from agent_framework.openai import OpenAIChatClient
- # All imports successful
- ```
-
- ### Agent Creation Test: PASSED
- ```python
- from src.agents.magentic_agents import create_search_agent
- search_agent = create_search_agent()
- # SearchAgent created: SearchAgent
- # Description: Searches biomedical databases (PubMed, ClinicalTrials.gov, bioRxiv)
- ```
-
- ### Workflow Build Test: PASSED
- ```python
- from src.orchestrator_magentic import MagenticOrchestrator
- orchestrator = MagenticOrchestrator(max_rounds=2)
- workflow = orchestrator._build_workflow()
- # Workflow built successfully: <class 'agent_framework._workflows._workflow.Workflow'>
- ```
-
- ### Model Configuration: CORRECT
- ```python
- settings.openai_model = "gpt-5.1"  # ✅ Using GPT-5.1, not GPT-4o
- settings.openai_api_key = True     # ✅ API key is set
- ```
-
- ---
-
- ## What Magentic Provides (Working)
-
- 1. **Multi-Agent Coordination**
-    - Manager agent orchestrates SearchAgent, JudgeAgent, HypothesisAgent, ReportAgent
-    - Uses `MagenticBuilder().with_standard_manager()` for coordination
-
- 2. **ChatAgent Pattern**
-    - Each agent has an internal LLM (GPT-5.1)
-    - Can call tools via the `@ai_function` decorator
-    - Has proper instructions for domain-specific tasks
-
- 3. **Workflow Streaming**
-    - Events: `MagenticAgentMessageEvent`, `MagenticFinalResultEvent`, etc.
-    - Real-time UI updates via `workflow.run_stream(task)`
-
- 4. **State Management**
-    - `MagenticState` persists evidence across agents
-    - `get_bibliography()` tool for ReportAgent
-
- ---
-
- ## What's Actually Broken: The Search Tools
-
- ### File: `src/agents/tools.py`
-
- The Magentic agents call these tools:
- - `search_pubmed` → uses `PubMedTool`
- - `search_clinical_trials` → uses `ClinicalTrialsTool`
- - `search_preprints` → uses `BioRxivTool`
-
- **These tools are the problem, not the framework.**
-
- ---
-
- ## Search Tool Bugs (Detailed)
-
- ### BUG 1: BioRxiv API Does Not Support Search
-
- **File:** `src/tools/biorxiv.py:248-286`
-
- ```python
- # This fetches the FIRST 100 papers from the last 90 days.
- # It does NOT search by keyword - the API doesn't support that.
- url = f"{self.BASE_URL}/{self.server}/{interval}/0/json"
-
- # Then filters client-side for keywords
- matching = self._filter_by_keywords(papers, query_terms, max_results)
- ```
-
- **Problem:**
- - Fetches 100 random chronological papers
- - Filters for ANY keyword match in title/abstract
- - "Long COVID medications" returns papers about "calf muscles" because they mention "COVID" once
-
- **Fix:** Remove BioRxiv or use Europe PMC (which has actual search)
-
- ---
-
- ### BUG 2: PubMed Query Not Optimized
-
- **File:** `src/tools/pubmed.py:54-71`
-
- ```python
- search_params = self._build_params(
-     db="pubmed",
-     term=query,  # RAW USER QUERY - no preprocessing!
-     retmax=max_results,
-     sort="relevance",
- )
- ```
-
- **Problem:**
- - User enters: "What medications show promise for Long COVID?"
- - PubMed receives: `What medications show promise for Long COVID?`
- - Should receive: `("long covid"[Title/Abstract] OR "PASC"[Title/Abstract]) AND (treatment[Title/Abstract] OR drug[Title/Abstract])`
-
- **Fix:** Add query preprocessing:
- 1. Strip question words (what, which, how, etc.)
- 2. Expand medical synonyms (Long COVID → PASC, Post-COVID)
- 3. Use MeSH terms for better recall
-
- ---
-
- ### BUG 3: ClinicalTrials.gov No Filtering
-
- **File:** `src/tools/clinicaltrials.py`
-
- Returns ALL trials, including:
- - Withdrawn trials
- - Terminated trials
- - Observational studies (not drug interventions)
- - Phase 1 (no efficacy data)
-
- **Fix:** Filter by:
- - `studyType=INTERVENTIONAL`
- - `phase=PHASE2,PHASE3,PHASE4`
- - `status=COMPLETED,ACTIVE_NOT_RECRUITING,RECRUITING`
-
- ---
-
- ## Evidence: Garbage In → Garbage Out
-
- When the Magentic SearchAgent calls these tools:
-
- ```
- SearchAgent: "Find evidence for Long COVID medications"
-   ↓
- search_pubmed("Long COVID medications")
-   → Returns 1 semi-relevant paper (raw query hits)
-
- search_preprints("Long COVID medications")
-   → Returns garbage (BioRxiv API doesn't search)
-   → "Calf muscle adaptations" (has "COVID" somewhere)
-   → "Ophthalmologist work-life balance" (mentions COVID)
-
- search_clinical_trials("Long COVID medications")
-   → Returns all trials, no filtering
-   ↓
- JudgeAgent receives garbage evidence
-   ↓
- HypothesisAgent can't generate good hypotheses from garbage
-   ↓
- ReportAgent produces garbage report
- ```
-
- **The framework is doing its job. It's orchestrating agents correctly. But the agents are being fed garbage data.**
-
- ---
-
- ## Recommended Fixes
-
- ### Priority 1: Delete or Fix BioRxiv (30 min)
-
- **Option A: Delete it**
- ```python
- # In src/agents/tools.py, remove:
- # from src.tools.biorxiv import BioRxivTool
- # _biorxiv = BioRxivTool()
- # @ai_function search_preprints(...)
- ```
-
- **Option B: Replace with Europe PMC**
- Europe PMC has preprints AND a proper search API:
- ```
- https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=long+covid+treatment&format=json
- ```
-
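- For example, a minimal async client against that endpoint might look like this (sketch only; the JSON field names follow the Europe PMC response format, and error handling is omitted):
-
- ```python
- import httpx
-
- async def search_europepmc(query: str, page_size: int = 10) -> list[dict]:
-     """Query the Europe PMC REST search endpoint and return raw result dicts."""
-     url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
-     params = {"query": query, "format": "json", "pageSize": page_size}
-     async with httpx.AsyncClient(timeout=30.0) as client:
-         response = await client.get(url, params=params)
-         response.raise_for_status()
-         return response.json().get("resultList", {}).get("result", [])
- ```
-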
- ### Priority 2: Fix PubMed Query (1 hour)
-
- Add a query preprocessor (minimal runnable sketch; synonym expansion and MeSH tagging are still TODO):
- ```python
- QUESTION_WORDS = {"what", "which", "how", "is", "are", "can", "does"}
-
- def preprocess_query(raw_query: str) -> str:
-     """Convert natural language to PubMed query syntax."""
-     # Strip question marks and question words
-     words = raw_query.replace("?", "").lower().split()
-     terms = [w for w in words if w not in QUESTION_WORDS]
-     # TODO: expand medical synonyms and add [Title/Abstract] field tags
-     return " ".join(terms)
- ```
-
- ### Priority 3: Filter ClinicalTrials (30 min)
-
- Add parameters to the API call:
- ```python
- params = {
-     "query.term": query,
-     "filter.overallStatus": "COMPLETED,RECRUITING",
-     "filter.studyType": "INTERVENTIONAL",
-     "pageSize": max_results,
- }
- ```
-
- ---
-
- ## Conclusion
-
- **Microsoft Agent Framework: NO BUGS FOUND**
- - Imports work ✅
- - Agent creation works ✅
- - Workflow building works ✅
- - Model config correct (GPT-5.1) ✅
- - Streaming events work ✅
-
- **Search Tools: CRITICALLY BROKEN**
- - BioRxiv: API doesn't support search (fundamental)
- - PubMed: no query optimization (fixable)
- - ClinicalTrials: no filtering (fixable)
-
- **Recommendation:**
- 1. Delete BioRxiv immediately (unusable)
- 2. Add PubMed query preprocessing
- 3. Add ClinicalTrials filtering
- 4. Then the Magentic multi-agent system will work as designed
docs/bugs/P0_MAGENTIC_MODE_BROKEN.md ADDED
@@ -0,0 +1,116 @@
+ # P0 Bug: Magentic Mode Returns ChatMessage Object Instead of Report Text
+
+ **Status**: OPEN
+ **Priority**: P0 (Critical)
+ **Date**: 2025-11-27
+
+ ---
+
+ ## Actual Bug Found (Not What We Thought)
+
+ **The OpenAI key works fine.** The real bug is different:
+
+ ### The Problem
+
+ When Magentic mode completes, the final report returns a `ChatMessage` object instead of the actual text:
+
+ ```
+ FINAL REPORT:
+ <agent_framework._types.ChatMessage object at 0x11db70310>
+ ```
+
+ ### Evidence
+
+ Full test output shows:
+ 1. Magentic orchestrator starts correctly
+ 2. SearchAgent finds evidence
+ 3. HypothesisAgent generates hypotheses
+ 4. JudgeAgent evaluates
+ 5. **BUT**: the final output is a `ChatMessage` object, not text
+
+ ### Root Cause
+
+ In `src/orchestrator_magentic.py` line 193:
+
+ ```python
+ elif isinstance(event, MagenticFinalResultEvent):
+     text = event.message.text if event.message else "No result"
+ ```
+
+ The `event.message` is a `ChatMessage` object, and `.text` may not extract the content correctly, or the message structure changed in the agent-framework library.
+
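+ A quick way to confirm which attribute actually carries the text is to inspect the object at the point of failure (temporary diagnostic sketch, not part of the fix):
+
+ ```python
+ # Drop this into _process_event just before the failing line:
+ msg = event.message
+ print(type(msg).__name__)                              # e.g. ChatMessage
+ print([a for a in dir(msg) if not a.startswith("_")])  # candidate fields: text, content, ...
+ ```
+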
+ ---
+
+ ## Secondary Issue: Max Rounds Reached
+
+ The orchestrator hits max rounds before producing a report:
+
+ ```
+ [ERROR] Magentic Orchestrator: Max round count reached
+ ```
+
+ This means the workflow times out before the ReportAgent synthesizes the final output.
+
+ ---
+
+ ## What Works
+
+ - OpenAI API key: **Works** (loaded from .env)
+ - SearchAgent: **Works** (finds evidence from PubMed, ClinicalTrials, Europe PMC)
+ - HypothesisAgent: **Works** (generates Drug -> Target -> Pathway chains)
+ - JudgeAgent: **Partial** (evaluates but sometimes loses context)
+
+ ---
+
+ ## Files to Fix
+
+ | File | Line | Issue |
+ |------|------|-------|
+ | `src/orchestrator_magentic.py` | 193 | `event.message.text` returns object, not string |
+ | `src/orchestrator_magentic.py` | 97-99 | `max_round_count=3` too low for full pipeline |
+
+ ---
+
+ ## Suggested Fix
+
+ ```python
+ # In _process_event, lines 192-199
+ elif isinstance(event, MagenticFinalResultEvent):
+     # Handle the ChatMessage object properly
+     if event.message:
+         if hasattr(event.message, 'content'):
+             text = event.message.content
+         elif hasattr(event.message, 'text'):
+             text = event.message.text
+         else:
+             text = str(event.message)
+     else:
+         text = "No result"
+ ```
+
+ And increase the round budget:
+
+ ```python
+ # In _build_workflow, line 97
+ max_round_count=self._max_rounds,  # Use the configured value, default 10
+ ```
+
+ ---
+
+ ## Test Command
+
+ ```bash
+ set -a && source .env && set +a && uv run python examples/orchestrator_demo/run_magentic.py "metformin alzheimer"
+ ```
+
+ ---
+
+ ## Simple Mode Works
+
+ For reference, simple mode produces full reports:
+
+ ```bash
+ uv run python examples/orchestrator_demo/run_agent.py "metformin alzheimer"
+ ```
+
+ Output includes a structured report with Drug Candidates, Key Findings, etc.
docs/bugs/P1_GRADIO_SETTINGS_CLEANUP.md ADDED
@@ -0,0 +1,81 @@
+ # P1 Bug: Gradio Settings Accordion Not Collapsing
+
+ **Priority**: P1 (UX Bug)
+ **Status**: OPEN
+ **Date**: 2025-11-27
+ **Target Component**: `src/app.py`
+
+ ---
+
+ ## 1. Problem Description
+
+ The "Settings" accordion in the Gradio UI (containing Orchestrator Mode, API Key, Provider) fails to collapse, even when configured with `open=False`. It remains permanently expanded, cluttering the interface and obscuring the chat history.
+
+ ### Symptoms
+ - Accordion arrow toggles visually, but content remains visible.
+ - Occurs in both local development (`uv run src/app.py`) and HuggingFace Spaces.
+
+ ---
+
+ ## 2. Root Cause Analysis
+
+ **Definitive Cause**: Nested `Blocks` Context Bug.
+ `gr.ChatInterface` is itself a high-level abstraction that creates a `gr.Blocks` context. Wrapping `gr.ChatInterface` inside an external `with gr.Blocks():` context causes event listener conflicts, specifically breaking the JavaScript state management for `additional_inputs_accordion`.
+
+ **Reference**: [Gradio Issue #8861](https://github.com/gradio-app/gradio/issues/8861) confirms that `additional_inputs_accordion` malfunctions when `ChatInterface` is not the top-level block.
+
+ ---
+
+ ## 3. Solution Strategy: "The Unwrap Fix"
+
+ We will remove the redundant `gr.Blocks` wrapper. This restores the native behavior of `ChatInterface`, ensuring the accordion respects `open=False`.
+
+ ### Implementation Plan
+
+ **Refactor `src/app.py` / `create_demo()`**:
+
+ 1. **Remove** the `with gr.Blocks() as demo:` context manager.
+ 2. **Instantiate** `gr.ChatInterface` directly as the `demo` object.
+ 3. **Migrate UI Elements**:
+    * **Header**: Move the H1/Title text into the `title` parameter of `ChatInterface`.
+    * **Footer**: Move the footer text ("MCP Server Active...") into the `description` parameter. `ChatInterface` supports Markdown in `description`, making it the ideal place for static info below the title but above the chat.
+
+ ### Before (Buggy)
+ ```python
+ def create_demo():
+     with gr.Blocks() as demo:  # <--- CAUSE OF BUG
+         gr.Markdown("# Title")
+         gr.ChatInterface(..., additional_inputs_accordion=gr.Accordion(open=False))
+         gr.Markdown("Footer")
+     return demo
+ ```
+
+ ### After (Correct)
+ ```python
+ def create_demo():
+     return gr.ChatInterface(  # <--- FIX: Top-level component
+         ...,
+         title="🧬 DeepCritical",
+         description="*AI-Powered Drug Repurposing Agent...*\n\n---\n**MCP Server Active**...",
+         additional_inputs_accordion=gr.Accordion(label="⚙️ Settings", open=False),
+     )
+ ```
+
+ ---
+
+ ## 4. Validation
+
+ 1. **Run**: `uv run python src/app.py`
+ 2. **Check**: Open `http://localhost:7860`
+ 3. **Verify**:
+    * Settings accordion starts **COLLAPSED**.
+    * Header title ("DeepCritical") is visible.
+    * Footer text ("MCP Server Active") is visible in the description area.
+    * Chat functionality works (Magentic/Simple modes).
+
+ ---
+
+ ## 5. Constraints & Notes
+
+ - **Layout**: We lose the ability to place arbitrary elements *below* the chat box (the footer will move to the top, under the title), but this is an acceptable trade-off for a working UI.
+ - **CSS**: `ChatInterface` handles its own CSS; any custom class styling from the previous footer will be standardized to the description text style.
docs/bugs/PHASE_00_IMPLEMENTATION_ORDER.md DELETED
@@ -1,156 +0,0 @@
- # Phase 00: Implementation Order & Summary
-
- **Total Effort:** 5-8 hours
- **Parallelizable:** Yes (all 3 phases are independent)
-
- ---
-
- ## Executive Summary
-
- The DeepCritical drug repurposing agent produces garbage results because the search tools are broken:
-
- | Tool | Problem | Fix |
- |------|---------|-----|
- | BioRxiv | API doesn't support search | Replace with Europe PMC |
- | PubMed | Raw queries, no preprocessing | Add query cleaner |
- | ClinicalTrials | No filtering | Add status/type filters |
-
- **The Microsoft Agent Framework (Magentic) is working correctly.** The orchestration layer is fine. The data layer is broken.
-
- ---
-
- ## Phase Specs
-
- | Phase | Title | Effort | Priority | Dependencies |
- |-------|-------|--------|----------|--------------|
- | **01** | [Replace BioRxiv with Europe PMC](./PHASE_01_REPLACE_BIORXIV.md) | 2-3 hrs | P0 | None |
- | **02** | [PubMed Query Preprocessing](./PHASE_02_PUBMED_QUERY_PREPROCESSING.md) | 2-3 hrs | P0 | None |
- | **03** | [ClinicalTrials Filtering](./PHASE_03_CLINICALTRIALS_FILTERING.md) | 1-2 hrs | P1 | None |
-
- ---
-
- ## Recommended Execution Order
-
- Since all phases are independent, they can be done in parallel by different developers.
-
- **If doing sequentially, order by impact:**
-
- 1. **Phase 01** - BioRxiv is completely broken (returns random papers)
- 2. **Phase 02** - PubMed is partially broken (returns suboptimal results)
- 3. **Phase 03** - ClinicalTrials returns too much noise
-
- ---
-
- ## TDD Workflow (Per Phase)
-
- ```
- 1. Write failing tests
- 2. Run tests (confirm they fail)
- 3. Implement fix
- 4. Run tests (confirm they pass)
- 5. Run ALL tests (confirm no regressions)
- 6. Manual verification
- 7. Commit
- ```
-
- ---
-
- ## Verification After All Phases
-
- After completing all 3 phases, run this integration test:
-
- ```bash
- # Full system test
- uv run python -c "
- import asyncio
- from src.tools.europepmc import EuropePMCTool
- from src.tools.pubmed import PubMedTool
- from src.tools.clinicaltrials import ClinicalTrialsTool
-
- async def test_all():
-     query = 'long covid treatment'
-
-     print('=== Europe PMC (Preprints) ===')
-     epmc = EuropePMCTool()
-     results = await epmc.search(query, 2)
-     for r in results:
-         print(f' - {r.citation.title[:60]}...')
-
-     print()
-     print('=== PubMed ===')
-     pm = PubMedTool()
-     results = await pm.search(query, 2)
-     for r in results:
-         print(f' - {r.citation.title[:60]}...')
-
-     print()
-     print('=== ClinicalTrials.gov ===')
-     ct = ClinicalTrialsTool()
-     results = await ct.search(query, 2)
-     for r in results:
-         print(f' - {r.citation.title[:60]}...')
-
- asyncio.run(test_all())
- "
- ```
-
- **Expected:** All results should be relevant to "long covid treatment"
-
- ---
-
- ## Test Magentic Integration
-
- After all phases are complete, test the full Magentic workflow:
-
- ```bash
- # Test Magentic mode (requires OPENAI_API_KEY)
- uv run python -c "
- import asyncio
- from src.orchestrator_magentic import MagenticOrchestrator
-
- async def test_magentic():
-     orchestrator = MagenticOrchestrator(max_rounds=3)
-
-     print('Running Magentic workflow...')
-     async for event in orchestrator.run('What drugs show promise for Long COVID?'):
-         print(f'[{event.type}] {event.message[:100]}...')
-
- asyncio.run(test_magentic())
- "
- ```
-
- ---
-
- ## Files Changed (All Phases)
-
- | File | Phase | Action |
- |------|-------|--------|
- | `src/tools/europepmc.py` | 01 | CREATE |
- | `tests/unit/tools/test_europepmc.py` | 01 | CREATE |
- | `src/agents/tools.py` | 01 | MODIFY |
- | `src/tools/search_handler.py` | 01 | MODIFY |
- | `src/tools/biorxiv.py` | 01 | DELETE |
- | `tests/unit/tools/test_biorxiv.py` | 01 | DELETE |
- | `src/tools/query_utils.py` | 02 | CREATE |
- | `tests/unit/tools/test_query_utils.py` | 02 | CREATE |
- | `src/tools/pubmed.py` | 02 | MODIFY |
- | `src/tools/clinicaltrials.py` | 03 | MODIFY |
- | `tests/unit/tools/test_clinicaltrials.py` | 03 | MODIFY |
-
- ---
-
- ## Success Criteria (Overall)
-
- - [ ] All unit tests pass
- - [ ] All integration tests pass (real APIs)
- - [ ] Query "What drugs show promise for Long COVID?" returns relevant results from all 3 sources
- - [ ] Magentic workflow produces a coherent research report
- - [ ] No regressions in existing functionality
-
- ---
-
- ## Related Documentation
-
- - [P0 Critical Bugs](./P0_CRITICAL_BUGS.md) - Root cause analysis
- - [P0 Magentic Audit](./P0_MAGENTIC_AND_SEARCH_AUDIT.md) - Framework verification
- - [P0 Actionable Fixes](./P0_ACTIONABLE_FIXES.md) - Fix summaries
docs/bugs/PHASE_01_REPLACE_BIORXIV.md DELETED
@@ -1,371 +0,0 @@
- # Phase 01: Replace BioRxiv with Europe PMC
-
- **Priority:** P0 - Critical
- **Effort:** 2-3 hours
- **Dependencies:** None
-
- ---
-
- ## Problem Statement
-
- The BioRxiv API does not support keyword search. It only returns papers by date range, resulting in completely irrelevant results for any query.
-
- ## Success Criteria
-
- - [ ] `search_preprints("long covid treatment")` returns papers actually about Long COVID
- - [ ] All existing tests pass
- - [ ] New tests cover the Europe PMC integration
-
- ---
-
- ## TDD Implementation Order
-
- ### Step 1: Write Failing Tests
-
- **File:** `tests/unit/tools/test_europepmc.py`
-
- ```python
- """Unit tests for the Europe PMC tool."""
-
- import pytest
- from unittest.mock import AsyncMock, patch
-
- from src.tools.europepmc import EuropePMCTool
- from src.utils.models import Evidence
-
-
- @pytest.mark.unit
- class TestEuropePMCTool:
-     """Tests for EuropePMCTool."""
-
-     @pytest.fixture
-     def tool(self):
-         return EuropePMCTool()
-
-     def test_tool_name(self, tool):
-         assert tool.name == "europepmc"
-
-     @pytest.mark.asyncio
-     async def test_search_returns_evidence(self, tool):
-         """Test that search returns Evidence objects."""
-         mock_response = {
-             "resultList": {
-                 "result": [
-                     {
-                         "id": "12345",
-                         "title": "Long COVID Treatment Study",
-                         "abstractText": "This study examines treatments for Long COVID.",
-                         "doi": "10.1234/test",
-                         "pubYear": "2024",
-                         "source": "MED",
-                         "pubTypeList": {"pubType": ["research-article"]},
-                     }
-                 ]
-             }
-         }
-
-         with patch("httpx.AsyncClient") as mock_client:
-             mock_instance = AsyncMock()
-             mock_client.return_value.__aenter__.return_value = mock_instance
-             mock_instance.get.return_value.json.return_value = mock_response
-             mock_instance.get.return_value.raise_for_status = lambda: None
-
-             results = await tool.search("long covid treatment", max_results=5)
-
-             assert len(results) == 1
-             assert isinstance(results[0], Evidence)
-             assert "Long COVID Treatment Study" in results[0].citation.title
-
-     @pytest.mark.asyncio
-     async def test_search_marks_preprints(self, tool):
-         """Test that preprints are marked correctly."""
-         mock_response = {
-             "resultList": {
-                 "result": [
-                     {
-                         "id": "PPR12345",
-                         "title": "Preprint Study",
-                         "abstractText": "Abstract text",
-                         "doi": "10.1234/preprint",
-                         "pubYear": "2024",
-                         "source": "PPR",
-                         "pubTypeList": {"pubType": ["Preprint"]},
-                     }
-                 ]
-             }
-         }
-
-         with patch("httpx.AsyncClient") as mock_client:
-             mock_instance = AsyncMock()
-             mock_client.return_value.__aenter__.return_value = mock_instance
-             mock_instance.get.return_value.json.return_value = mock_response
-             mock_instance.get.return_value.raise_for_status = lambda: None
-
-             results = await tool.search("test", max_results=5)
-
-             # Content marker is "[PREPRINT - Not peer-reviewed]", so match the open bracket + word
-             assert "PREPRINT" in results[0].content
-             assert results[0].citation.source == "preprint"
-
-     @pytest.mark.asyncio
-     async def test_search_empty_results(self, tool):
-         """Test handling of empty results."""
-         mock_response = {"resultList": {"result": []}}
-
-         with patch("httpx.AsyncClient") as mock_client:
-             mock_instance = AsyncMock()
-             mock_client.return_value.__aenter__.return_value = mock_instance
-             mock_instance.get.return_value.json.return_value = mock_response
-             mock_instance.get.return_value.raise_for_status = lambda: None
-
-             results = await tool.search("nonexistent query xyz", max_results=5)
-
-             assert results == []
-
-
- @pytest.mark.integration
- class TestEuropePMCIntegration:
-     """Integration tests with the real API."""
-
-     @pytest.mark.asyncio
-     async def test_real_api_call(self):
-         """Test that the actual API returns relevant results."""
-         tool = EuropePMCTool()
-         results = await tool.search("long covid treatment", max_results=3)
-
-         assert len(results) > 0
-         # At least one result should mention COVID
-         titles = " ".join([r.citation.title.lower() for r in results])
-         assert "covid" in titles or "sars" in titles
- ```
-
- ### Step 2: Implement the Europe PMC Tool
-
- **File:** `src/tools/europepmc.py`
-
- ```python
- """Europe PMC search tool - replaces BioRxiv."""
-
- from typing import Any
-
- import httpx
- from tenacity import retry, stop_after_attempt, wait_exponential
-
- from src.utils.exceptions import SearchError
- from src.utils.models import Citation, Evidence
-
-
- class EuropePMCTool:
-     """
-     Search Europe PMC for papers and preprints.
-
-     Europe PMC indexes:
-     - PubMed/MEDLINE articles
-     - PMC full-text articles
-     - Preprints from bioRxiv, medRxiv, ChemRxiv, etc.
-     - Patents and clinical guidelines
-
-     API Docs: https://europepmc.org/RestfulWebService
-     """
-
-     BASE_URL = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
-
-     @property
-     def name(self) -> str:
-         return "europepmc"
-
-     @retry(
-         stop=stop_after_attempt(3),
-         wait=wait_exponential(multiplier=1, min=1, max=10),
-         reraise=True,
-     )
-     async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
-         """
-         Search Europe PMC for papers matching the query.
-
-         Args:
-             query: Search keywords
-             max_results: Maximum results to return
-
-         Returns:
-             List of Evidence objects
-         """
-         params = {
-             "query": query,
-             "resultType": "core",
-             "pageSize": min(max_results, 100),
-             "format": "json",
-         }
-
-         async with httpx.AsyncClient(timeout=30.0) as client:
-             try:
-                 response = await client.get(self.BASE_URL, params=params)
-                 response.raise_for_status()
-
-                 data = response.json()
-                 results = data.get("resultList", {}).get("result", [])
-
-                 return [self._to_evidence(r) for r in results[:max_results]]
-
-             except httpx.HTTPStatusError as e:
-                 raise SearchError(f"Europe PMC API error: {e}") from e
-             except httpx.RequestError as e:
-                 raise SearchError(f"Europe PMC connection failed: {e}") from e
-
-     def _to_evidence(self, result: dict[str, Any]) -> Evidence:
-         """Convert a Europe PMC result to Evidence."""
-         title = result.get("title", "Untitled")
-         abstract = result.get("abstractText", "No abstract available.")
-         doi = result.get("doi", "")
-         pub_year = result.get("pubYear", "Unknown")
-
-         # Get authors
-         author_list = result.get("authorList", {}).get("author", [])
-         authors = [a.get("fullName", "") for a in author_list[:5] if a.get("fullName")]
-
-         # Check if preprint
-         pub_types = result.get("pubTypeList", {}).get("pubType", [])
-         is_preprint = "Preprint" in pub_types
-         source_db = result.get("source", "europepmc")
-
-         # Build content
-         preprint_marker = "[PREPRINT - Not peer-reviewed] " if is_preprint else ""
-         content = f"{preprint_marker}{abstract[:1800]}"
-
-         # Build URL
-         if doi:
-             url = f"https://doi.org/{doi}"
-         elif result.get("pmid"):
-             url = f"https://pubmed.ncbi.nlm.nih.gov/{result['pmid']}/"
-         else:
-             url = f"https://europepmc.org/article/{source_db}/{result.get('id', '')}"
-
-         return Evidence(
-             content=content[:2000],
-             citation=Citation(
-                 source="preprint" if is_preprint else "europepmc",
-                 title=title[:500],
-                 url=url,
-                 date=str(pub_year),
-                 authors=authors,
-             ),
-             relevance=0.75 if is_preprint else 0.9,
-         )
- ```
-
- ### Step 3: Update the Magentic Tools
-
- **File:** `src/agents/tools.py` - replace the biorxiv import:
-
- ```python
- # REMOVE:
- # from src.tools.biorxiv import BioRxivTool
- # _biorxiv = BioRxivTool()
-
- # ADD:
- from src.tools.europepmc import EuropePMCTool
- _europepmc = EuropePMCTool()
-
- # UPDATE the search_preprints function:
- @ai_function
- async def search_preprints(query: str, max_results: int = 10) -> str:
-     """Search Europe PMC for preprints and papers.
-
-     Use this tool to find the latest research including preprints
-     from bioRxiv, medRxiv, and peer-reviewed papers.
-
-     Args:
-         query: Search terms (e.g., "long covid treatment")
-         max_results: Maximum results to return (default 10)
-
-     Returns:
-         Formatted list of papers with abstracts and links
-     """
-     state = get_magentic_state()
-
-     results = await _europepmc.search(query, max_results)
-     if not results:
-         return f"No papers found for: {query}"
-
-     new_count = state.add_evidence(results)
-
-     output = [f"Found {len(results)} papers ({new_count} new stored):\n"]
-     for i, r in enumerate(results[:max_results], 1):
-         title = r.citation.title
-         date = r.citation.date
-         source = r.citation.source
-         content_clean = r.content[:300].replace("\n", " ")
-         url = r.citation.url
-
-         output.append(f"{i}. **{title}**")
-         output.append(f"   Source: {source} | Date: {date}")
-         output.append(f"   {content_clean}...")
-         output.append(f"   URL: {url}\n")
-
-     return "\n".join(output)
- ```
-
- ### Step 4: Update the Search Handler (Simple Mode)
-
- **File:** `src/tools/search_handler.py` - update the imports:
-
- ```python
- # REMOVE:
- # from src.tools.biorxiv import BioRxivTool
-
- # ADD:
- from src.tools.europepmc import EuropePMCTool
- ```
-
- ### Step 5: Delete the Old BioRxiv Tests
-
- ```bash
- # After all new tests pass:
- rm tests/unit/tools/test_biorxiv.py
- ```
-
- ---
-
- ## Verification
-
- ```bash
- # Run the new tests
- uv run pytest tests/unit/tools/test_europepmc.py -v
-
- # Run the integration test (real API)
- uv run pytest tests/unit/tools/test_europepmc.py::TestEuropePMCIntegration -v
-
- # Run all tests to ensure no regressions
- uv run pytest tests/unit/ -v
-
- # Manual verification
- uv run python -c "
- import asyncio
- from src.tools.europepmc import EuropePMCTool
- tool = EuropePMCTool()
- results = asyncio.run(tool.search('long covid treatment', 3))
- for r in results:
-     print(f'- {r.citation.title}')
- "
- ```
-
- ---
-
- ## Files Changed
-
- | File | Action |
- |------|--------|
- | `src/tools/europepmc.py` | CREATE |
- | `tests/unit/tools/test_europepmc.py` | CREATE |
- | `src/agents/tools.py` | MODIFY (replace biorxiv import) |
- | `src/tools/search_handler.py` | MODIFY (replace biorxiv import) |
- | `src/tools/biorxiv.py` | DELETE (after verification) |
- | `tests/unit/tools/test_biorxiv.py` | DELETE (after verification) |
-
- ---
-
- ## Rollback Plan
-
- If issues arise:
- 1. Revert `src/agents/tools.py` to use BioRxivTool
- 2. Revert `src/tools/search_handler.py`
- 3. Keep `europepmc.py` for future use
docs/bugs/PHASE_02_PUBMED_QUERY_PREPROCESSING.md DELETED
@@ -1,355 +0,0 @@
- # Phase 02: PubMed Query Preprocessing
-
- **Priority:** P0 - Critical
- **Effort:** 2-3 hours
- **Dependencies:** None (can run parallel with Phase 01)
-
- ---
-
- ## Problem Statement
-
- PubMed receives raw natural language queries like "What medications show promise for Long COVID?" which include question words that pollute search results.
-
- ## Success Criteria
-
- - [ ] Question words stripped from queries
- - [ ] Medical synonyms expanded (Long COVID → PASC, etc.)
- - [ ] Relevant results returned for natural language questions
- - [ ] All existing tests pass
- - [ ] New tests cover query preprocessing
-
- ---
-
- ## TDD Implementation Order
-
- ### Step 1: Write Failing Tests
-
- **File:** `tests/unit/tools/test_query_utils.py`
-
- ```python
- """Unit tests for query preprocessing utilities."""
-
- import pytest
-
- from src.tools.query_utils import preprocess_query, expand_synonyms, strip_question_words
-
-
- @pytest.mark.unit
- class TestQueryPreprocessing:
-     """Tests for query preprocessing."""
-
-     def test_strip_question_words(self):
-         """Test removal of question and filler words."""
-         assert strip_question_words("What drugs treat cancer") == "drugs treat cancer"
-         assert strip_question_words("Which medications help diabetes") == "medications diabetes"
-         assert strip_question_words("How can we cure alzheimer") == "cure alzheimer"
-         # "effective" is in the filler-word list, so it is stripped too
-         assert strip_question_words("Is metformin effective") == "metformin"
-
-     def test_strip_preserves_medical_terms(self):
-         """Test that medical terms are preserved."""
-         result = strip_question_words("What is the mechanism of metformin")
-         assert "metformin" in result
-         assert "mechanism" in result
-
-     def test_expand_synonyms_long_covid(self):
-         """Test Long COVID synonym expansion."""
-         result = expand_synonyms("long covid treatment")
-         assert "PASC" in result or "post-COVID" in result
-
-     def test_expand_synonyms_alzheimer(self):
-         """Test Alzheimer's synonym expansion."""
-         result = expand_synonyms("alzheimer drug")
-         assert "Alzheimer" in result
-
-     def test_expand_synonyms_preserves_unknown(self):
-         """Test that unknown terms are preserved."""
-         result = expand_synonyms("metformin diabetes")
-         assert "metformin" in result
-         assert "diabetes" in result
-
-     def test_preprocess_query_full_pipeline(self):
-         """Test the complete preprocessing pipeline."""
-         raw = "What medications show promise for Long COVID?"
-         result = preprocess_query(raw)
-
-         # Should not contain question words
-         assert "what" not in result.lower()
-         assert "show" not in result.lower()
-         assert "promise" not in result.lower()
-
-         # Should contain expanded terms
-         assert "PASC" in result or "post-COVID" in result or "long covid" in result.lower()
-         assert "medications" in result.lower() or "drug" in result.lower()
-
-     def test_preprocess_query_removes_punctuation(self):
-         """Test that question marks are removed."""
-         result = preprocess_query("Is metformin safe?")
-         assert "?" not in result
-
-     def test_preprocess_query_handles_empty(self):
-         """Test handling of empty/whitespace queries."""
-         assert preprocess_query("") == ""
-         assert preprocess_query("   ") == ""
-
-     def test_preprocess_query_already_clean(self):
-         """Test that clean queries pass through."""
-         clean = "metformin diabetes mechanism"
-         result = preprocess_query(clean)
-         assert "metformin" in result
-         assert "diabetes" in result
-         assert "mechanism" in result
- ```
-
- ### Step 2: Implement the Query Utils
-
- **File:** `src/tools/query_utils.py`
-
- ```python
- """Query preprocessing utilities for biomedical search."""
-
- import re
-
- # Question words and filler words to remove
- QUESTION_WORDS: set[str] = {
-     # Question starters
-     "what", "which", "how", "why", "when", "where", "who", "whom",
-     # Auxiliary verbs in questions
-     "is", "are", "was", "were", "do", "does", "did", "can", "could",
-     "would", "should", "will", "shall", "may", "might",
-     # Pronouns and filler words in natural questions
-     "we", "show", "promise", "help", "believe", "think", "suggest",
-     "possible", "potential", "effective", "useful", "good",
-     # Articles (remove but less aggressively)
-     "the", "a", "an",
- }
-
- # Medical synonym expansions
- SYNONYMS: dict[str, list[str]] = {
-     "long covid": [
-         "long COVID",
-         "PASC",
-         "post-acute sequelae of SARS-CoV-2",
-         "post-COVID syndrome",
-         "post-COVID-19 condition",
-     ],
-     "alzheimer": [
-         "Alzheimer's disease",
-         "Alzheimer disease",
-         "AD",
-         "Alzheimer dementia",
-     ],
-     "parkinson": [
-         "Parkinson's disease",
-         "Parkinson disease",
-         "PD",
-     ],
-     "diabetes": [
-         "diabetes mellitus",
-         "type 2 diabetes",
-         "T2DM",
-         "diabetic",
-     ],
-     "cancer": [
-         "cancer",
-         "neoplasm",
-         "tumor",
-         "malignancy",
-         "carcinoma",
-     ],
-     "heart disease": [
-         "cardiovascular disease",
-         "CVD",
-         "coronary artery disease",
-         "heart failure",
-     ],
- }
-
-
- def strip_question_words(query: str) -> str:
-     """
-     Remove question words and filler terms from a query.
-
-     Args:
-         query: Raw query string
-
-     Returns:
-         Query with question words removed
-     """
-     words = query.lower().split()
-     filtered = [w for w in words if w not in QUESTION_WORDS]
-     return " ".join(filtered)
-
-
- def expand_synonyms(query: str) -> str:
-     """
-     Expand medical terms to include synonyms.
-
-     Args:
-         query: Query string
-
-     Returns:
-         Query with synonym expansions in OR groups
-     """
-     result = query.lower()
-
-     for term, expansions in SYNONYMS.items():
-         if term in result:
-             # Create an OR group: ("term1" OR "term2" OR "term3")
-             or_group = " OR ".join([f'"{exp}"' for exp in expansions])
-             result = result.replace(term, f"({or_group})")
-
-     return result
-
-
- def preprocess_query(raw_query: str) -> str:
-     """
-     Full preprocessing pipeline for PubMed queries.
-
-     Pipeline:
-     1. Strip whitespace and punctuation
-     2. Remove question words
-     3. Expand medical synonyms
-
-     Args:
-         raw_query: Natural language query from the user
-
-     Returns:
-         Optimized query for PubMed
-     """
-     if not raw_query or not raw_query.strip():
-         return ""
-
-     # Remove question marks and extra whitespace
-     query = raw_query.replace("?", "").strip()
-     query = re.sub(r"\s+", " ", query)
-
-     # Strip question words
-     query = strip_question_words(query)
-
-     # Expand synonyms
-     query = expand_synonyms(query)
-
-     return query.strip()
- ```
-
- ### Step 3: Update the PubMed Tool
-
- **File:** `src/tools/pubmed.py` - add preprocessing:
-
- ```python
- # Add this import at the top:
- from src.tools.query_utils import preprocess_query
-
- # Update the search method:
- @retry(
-     stop=stop_after_attempt(3),
-     wait=wait_exponential(multiplier=1, min=1, max=10),
-     reraise=True,
- )
- async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
-     """
-     Search PubMed and return evidence.
-     """
-     await self._rate_limit()
-
-     # PREPROCESS QUERY
-     clean_query = preprocess_query(query)
-     if not clean_query:
-         clean_query = query  # Fall back to the original if preprocessing empties it
-
-     async with httpx.AsyncClient(timeout=30.0) as client:
-         search_params = self._build_params(
-             db="pubmed",
-             term=clean_query,  # Use the preprocessed query
-             retmax=max_results,
-             sort="relevance",
-         )
-         # ... rest unchanged
- ```
-
- ### Step 4: Update the PubMed Tests
-
- **File:** `tests/unit/tools/test_pubmed.py` - add a preprocessing test:
-
- ```python
- @pytest.mark.asyncio
- async def test_search_preprocesses_query(self, pubmed_tool, mock_httpx_client):
-     """Test that queries are preprocessed before search."""
-     # This test verifies the integration - the actual preprocessing
-     # is tested in test_query_utils.py
-     # (assumes `import httpx` at the top of the test module)
-
-     mock_httpx_client.get.return_value = httpx.Response(
-         200,
-         json={"esearchresult": {"idlist": []}},
-     )
-
-     # Natural language query
-     await pubmed_tool.search("What drugs help with Long COVID?")
-
-     # Verify the call was made (preprocessing happens internally)
-     assert mock_httpx_client.get.called
- ```
-
- ---
-
- ## Verification
-
- ```bash
- # Run the query utils tests
- uv run pytest tests/unit/tools/test_query_utils.py -v
-
- # Run the pubmed tests
- uv run pytest tests/unit/tools/test_pubmed.py -v
-
- # Run all tests
- uv run pytest tests/unit/ -v
-
- # Manual verification
- uv run python -c "
- from src.tools.query_utils import preprocess_query
-
- queries = [
-     'What medications show promise for Long COVID?',
-     'Is metformin effective for cancer treatment?',
-     'How can we treat Alzheimer with existing drugs?',
- ]
-
- for q in queries:
-     print(f'Input: {q}')
-     print(f'Output: {preprocess_query(q)}')
-     print()
- "
- ```
-
- Expected output:
- ```
- Input: What medications show promise for Long COVID?
- Output: medications for ("long COVID" OR "PASC" OR "post-acute sequelae of SARS-CoV-2" OR "post-COVID syndrome" OR "post-COVID-19 condition")
-
- Input: Is metformin effective for cancer treatment?
- Output: metformin for ("cancer" OR "neoplasm" OR "tumor" OR "malignancy" OR "carcinoma") treatment
-
- Input: How can we treat Alzheimer with existing drugs?
- Output: treat ("Alzheimer's disease" OR "Alzheimer disease" OR "AD" OR "Alzheimer dementia") with existing drugs
- ```
-
- ---
-
- ## Files Changed
-
- | File | Action |
- |------|--------|
- | `src/tools/query_utils.py` | CREATE |
- | `tests/unit/tools/test_query_utils.py` | CREATE |
- | `src/tools/pubmed.py` | MODIFY (add preprocessing) |
- | `tests/unit/tools/test_pubmed.py` | MODIFY (add integration test) |
-
- ---
-
- ## Future Enhancements (Out of Scope)
-
- - MeSH term lookup via the NCBI API
- - Drug name normalization (brand → generic)
- - Disease ontology integration (UMLS)
- - Query intent classification
docs/bugs/PHASE_03_CLINICALTRIALS_FILTERING.md DELETED
@@ -1,386 +0,0 @@
- # Phase 03: ClinicalTrials.gov Filtering
-
- **Priority:** P1 - High
- **Effort:** 1-2 hours
- **Dependencies:** None (can run parallel with Phase 01 & 02)
-
- ---
-
- ## Problem Statement
-
- ClinicalTrials.gov returns ALL matching trials including:
- - Withdrawn/Terminated trials (no useful data)
- - Observational studies (not drug interventions)
- - Phase 1 trials (safety only, no efficacy)
-
- For drug repurposing, we need interventional studies with efficacy data.
-
- ## Success Criteria
-
- - [ ] Only interventional studies returned
- - [ ] Withdrawn/terminated trials filtered out
- - [ ] Phase information included in results
- - [ ] All existing tests pass
- - [ ] New tests cover filtering
-
- ---
-
- ## TDD Implementation Order
-
- ### Step 1: Write Failing Tests
-
- **File:** `tests/unit/tools/test_clinicaltrials.py` - add filter tests:
-
- ```python
- """Unit tests for the ClinicalTrials.gov tool."""
-
- import pytest
- from unittest.mock import patch, MagicMock
-
- from src.tools.clinicaltrials import ClinicalTrialsTool
- from src.utils.models import Evidence
-
-
- @pytest.mark.unit
- class TestClinicalTrialsTool:
-     """Tests for ClinicalTrialsTool."""
-
-     @pytest.fixture
-     def tool(self):
-         return ClinicalTrialsTool()
-
-     def test_tool_name(self, tool):
-         assert tool.name == "clinicaltrials"
-
-     @pytest.mark.asyncio
-     async def test_search_uses_filters(self, tool):
-         """Test that search applies status and type filters."""
-         mock_response = MagicMock()
-         mock_response.json.return_value = {"studies": []}
-         mock_response.raise_for_status = MagicMock()
-
-         with patch("requests.get", return_value=mock_response) as mock_get:
-             await tool.search("test query", max_results=5)
-
-             # Verify the filters were applied
-             call_args = mock_get.call_args
-             params = call_args.kwargs.get("params", call_args[1].get("params", {}))
-
-             # Should filter for active/completed studies
-             assert "filter.overallStatus" in params
-             assert "COMPLETED" in params["filter.overallStatus"]
-             assert "RECRUITING" in params["filter.overallStatus"]
-
-             # Should filter for interventional studies
-             assert "filter.studyType" in params
-             assert "INTERVENTIONAL" in params["filter.studyType"]
-
-     @pytest.mark.asyncio
-     async def test_search_returns_evidence(self, tool):
-         """Test that search returns Evidence objects."""
-         mock_study = {
-             "protocolSection": {
-                 "identificationModule": {
-                     "nctId": "NCT12345678",
-                     "briefTitle": "Metformin for Long COVID Treatment",
-                 },
-                 "statusModule": {
-                     "overallStatus": "COMPLETED",
-                     "startDateStruct": {"date": "2023-01-01"},
-                 },
-                 "descriptionModule": {
-                     "briefSummary": "A study examining metformin for Long COVID symptoms.",
-                 },
-                 "designModule": {
-                     "phases": ["PHASE2", "PHASE3"],
-                 },
-                 "conditionsModule": {
-                     "conditions": ["Long COVID", "PASC"],
-                 },
-                 "armsInterventionsModule": {
-                     "interventions": [{"name": "Metformin"}],
-                 },
-             }
-         }
-
-         mock_response = MagicMock()
-         mock_response.json.return_value = {"studies": [mock_study]}
-         mock_response.raise_for_status = MagicMock()
-
-         with patch("requests.get", return_value=mock_response):
-             results = await tool.search("long covid metformin", max_results=5)
-
-         assert len(results) == 1
-         assert isinstance(results[0], Evidence)
-         assert "Metformin" in results[0].citation.title
-         assert "PHASE2" in results[0].content or "Phase" in results[0].content
-
-     @pytest.mark.asyncio
-     async def test_search_includes_phase_info(self, tool):
-         """Test that phase information is included in content."""
-         mock_study = {
-             "protocolSection": {
-                 "identificationModule": {
-                     "nctId": "NCT12345678",
-                     "briefTitle": "Test Study",
-                 },
-                 "statusModule": {
-                     "overallStatus": "RECRUITING",
-                     "startDateStruct": {"date": "2024-01-01"},
-                 },
-                 "descriptionModule": {
-                     "briefSummary": "Test summary.",
-                 },
-                 "designModule": {
-                     "phases": ["PHASE3"],
-                 },
-                 "conditionsModule": {"conditions": ["Test"]},
-                 "armsInterventionsModule": {"interventions": []},
-             }
-         }
-
-         mock_response = MagicMock()
-         mock_response.json.return_value = {"studies": [mock_study]}
-         mock_response.raise_for_status = MagicMock()
-
-         with patch("requests.get", return_value=mock_response):
-             results = await tool.search("test", max_results=5)
-
-         # Phase should be in the content
-         assert "PHASE3" in results[0].content or "Phase 3" in results[0].content
-
-     @pytest.mark.asyncio
-     async def test_search_empty_results(self, tool):
-         """Test handling of empty results."""
-         mock_response = MagicMock()
-         mock_response.json.return_value = {"studies": []}
-         mock_response.raise_for_status = MagicMock()
-
-         with patch("requests.get", return_value=mock_response):
-             results = await tool.search("nonexistent xyz 12345", max_results=5)
-             assert results == []
-
-
- @pytest.mark.integration
- class TestClinicalTrialsIntegration:
-     """Integration tests with the real API."""
-
-     @pytest.mark.asyncio
-     async def test_real_api_returns_interventional(self):
-         """Test that the real API returns interventional studies."""
-         tool = ClinicalTrialsTool()
-         results = await tool.search("long covid treatment", max_results=3)
-
-         # Should get results
-         assert len(results) > 0
-
-         # Results should mention interventions or treatments
-         all_content = " ".join([r.content.lower() for r in results])
-         has_intervention = (
-             "intervention" in all_content
-             or "treatment" in all_content
-             or "drug" in all_content
-             or "phase" in all_content
-         )
-         assert has_intervention
- ```
-
- ### Step 2: Update the ClinicalTrials Tool
-
- **File:** `src/tools/clinicaltrials.py` - add filters:
-
- ```python
- """ClinicalTrials.gov search tool using API v2."""
-
- import asyncio
- from typing import Any, ClassVar
-
- import requests
- from tenacity import retry, stop_after_attempt, wait_exponential
-
- from src.utils.exceptions import SearchError
- from src.utils.models import Citation, Evidence
-
-
- class ClinicalTrialsTool:
-     """Search tool for ClinicalTrials.gov.
-
-     Note: Uses the `requests` library instead of `httpx` because ClinicalTrials.gov's
-     WAF blocks httpx's TLS fingerprint. The `requests` library is not blocked.
-     See: https://clinicaltrials.gov/data-api/api
-     """
-
-     BASE_URL = "https://clinicaltrials.gov/api/v2/studies"
-
-     # Fields to retrieve
-     FIELDS: ClassVar[list[str]] = [
-         "NCTId",
-         "BriefTitle",
-         "Phase",
-         "OverallStatus",
-         "Condition",
-         "InterventionName",
-         "StartDate",
-         "BriefSummary",
-     ]
-
-     # Status filter: only active/completed studies with potential data
-     STATUS_FILTER = "COMPLETED|ACTIVE_NOT_RECRUITING|RECRUITING|ENROLLING_BY_INVITATION"
-
-     # Study type filter: only interventional (drug/treatment) studies
-     STUDY_TYPE_FILTER = "INTERVENTIONAL"
-
-     @property
-     def name(self) -> str:
-         return "clinicaltrials"
-
-     @retry(
-         stop=stop_after_attempt(3),
-         wait=wait_exponential(multiplier=1, min=1, max=10),
-         reraise=True,
-     )
-     async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
-         """Search ClinicalTrials.gov for interventional studies.
-
-         Args:
-             query: Search query (e.g., "metformin alzheimer")
-             max_results: Maximum results to return (max 100)
-
-         Returns:
-             List of Evidence objects from clinical trials
-         """
-         params: dict[str, str | int] = {
-             "query.term": query,
-             "pageSize": min(max_results, 100),
-             "fields": "|".join(self.FIELDS),
-             # FILTERS - only interventional, active/completed studies
-             "filter.overallStatus": self.STATUS_FILTER,
-             "filter.studyType": self.STUDY_TYPE_FILTER,
-         }
-
-         try:
-             # Run the blocking requests.get in a separate thread for async compatibility
-             response = await asyncio.to_thread(
-                 requests.get,
-                 self.BASE_URL,
-                 params=params,
-                 headers={"User-Agent": "DeepCritical-Research-Agent/1.0"},
-                 timeout=30,
-             )
-             response.raise_for_status()
-
-             data = response.json()
-             studies = data.get("studies", [])
-             return [self._study_to_evidence(study) for study in studies[:max_results]]
-
-         except requests.HTTPError as e:
-             raise SearchError(f"ClinicalTrials.gov API error: {e}") from e
-         except requests.RequestException as e:
-             raise SearchError(f"ClinicalTrials.gov request failed: {e}") from e
-
-     def _study_to_evidence(self, study: dict[str, Any]) -> Evidence:
-         """Convert a clinical trial study to Evidence."""
-         # Navigate the nested structure
-         protocol = study.get("protocolSection", {})
-         id_module = protocol.get("identificationModule", {})
-         status_module = protocol.get("statusModule", {})
-         desc_module = protocol.get("descriptionModule", {})
-         design_module = protocol.get("designModule", {})
-         conditions_module = protocol.get("conditionsModule", {})
-         arms_module = protocol.get("armsInterventionsModule", {})
-
-         nct_id = id_module.get("nctId", "Unknown")
-         title = id_module.get("briefTitle", "Untitled Study")
-         status = status_module.get("overallStatus", "Unknown")
-         start_date = status_module.get("startDateStruct", {}).get("date", "Unknown")
-
-         # Get the phase (might be a list)
-         phases = design_module.get("phases", [])
-         phase = phases[0] if phases else "Not Applicable"
-
-         # Get conditions
-         conditions = conditions_module.get("conditions", [])
-         conditions_str = ", ".join(conditions[:3]) if conditions else "Unknown"
-
-         # Get interventions
-         interventions = arms_module.get("interventions", [])
-         intervention_names = [i.get("name", "") for i in interventions[:3]]
-         interventions_str = ", ".join(intervention_names) if intervention_names else "Unknown"
-
-         # Get the summary
-         summary = desc_module.get("briefSummary", "No summary available.")
-
-         # Build content with the key trial info
-         content = (
-             f"{summary[:500]}... "
-             f"Trial Phase: {phase}. "
-             f"Status: {status}. "
-             f"Conditions: {conditions_str}. "
-             f"Interventions: {interventions_str}."
-         )
-
-         return Evidence(
-             content=content[:2000],
-             citation=Citation(
-                 source="clinicaltrials",
-                 title=title[:500],
-                 url=f"https://clinicaltrials.gov/study/{nct_id}",
-                 date=start_date,
-                 authors=[],  # Trials don't have traditional authors
-             ),
-             relevance=0.85,  # Trials are highly relevant for repurposing
-         )
- ```
-
- ---
-
- ## Verification
-
- ```bash
- # Run the clinicaltrials tests
- uv run pytest tests/unit/tools/test_clinicaltrials.py -v
-
- # Run the integration test (real API)
- uv run pytest tests/unit/tools/test_clinicaltrials.py::TestClinicalTrialsIntegration -v
-
- # Run all tests
- uv run pytest tests/unit/ -v
-
- # Manual verification
- uv run python -c "
- import asyncio
- from src.tools.clinicaltrials import ClinicalTrialsTool
-
- tool = ClinicalTrialsTool()
- results = asyncio.run(tool.search('long covid treatment', 3))
-
- for r in results:
-     print(f'Title: {r.citation.title}')
-     print(f'Content: {r.content[:200]}...')
-     print()
- "
- ```
-
- ---
-
- ## Files Changed
-
- | File | Action |
- |------|--------|
- | `src/tools/clinicaltrials.py` | MODIFY (add filters) |
- | `tests/unit/tools/test_clinicaltrials.py` | MODIFY (add filter tests) |
-
- ---
-
- ## API Filter Reference
-
- ClinicalTrials.gov API v2 supports these filters:
-
- | Parameter | Values | Purpose |
- |-----------|--------|---------|
- | `filter.overallStatus` | COMPLETED, RECRUITING, etc. | Trial status |
- | `filter.studyType` | INTERVENTIONAL, OBSERVATIONAL | Study design |
- | `filter.phase` | PHASE1, PHASE2, PHASE3, PHASE4 | Trial phase |
- | `filter.geo` | Country codes | Geographic filter |
-
- See: https://clinicaltrials.gov/data-api/api
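-
- A quick manual check of these filters (illustrative sketch using the same `requests` library as the tool; the query term is just an example):
-
- ```python
- import requests
-
- # Hit the v2 endpoint directly with the status/type filters from the table above.
- resp = requests.get(
-     "https://clinicaltrials.gov/api/v2/studies",
-     params={
-         "query.term": "long covid treatment",
-         "filter.overallStatus": "COMPLETED|RECRUITING",
-         "filter.studyType": "INTERVENTIONAL",
-         "pageSize": 3,
-     },
-     timeout=30,
- )
- resp.raise_for_status()
- for study in resp.json().get("studies", []):
-     print(study["protocolSection"]["identificationModule"]["briefTitle"])
- ```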
examples/rate_limiting_demo.py ADDED
@@ -0,0 +1,82 @@
+ #!/usr/bin/env python3
+ """Demo script to verify rate limiting works correctly."""
+ 
+ import asyncio
+ import time
+ 
+ from src.tools.pubmed import PubMedTool
+ from src.tools.rate_limiter import RateLimiter, get_pubmed_limiter, reset_pubmed_limiter
+ 
+ 
+ async def test_basic_limiter():
+     """Test basic rate limiter behavior."""
+     print("=" * 60)
+     print("Rate Limiting Demo")
+     print("=" * 60)
+ 
+     # Test 1: Basic limiter
+     print("\n[Test 1] Testing 3/second limiter...")
+     limiter = RateLimiter("3/second")
+ 
+     start = time.monotonic()
+     for i in range(6):
+         await limiter.acquire()
+         elapsed = time.monotonic() - start
+         print(f"  Request {i+1} at {elapsed:.2f}s")
+ 
+     total = time.monotonic() - start
+     print(f"  Total time for 6 requests: {total:.2f}s (expected ~2s)")
+ 
+ 
+ async def test_pubmed_limiter():
+     """Test PubMed-specific limiter."""
+     print("\n[Test 2] Testing PubMed limiter (shared)...")
+ 
+     reset_pubmed_limiter()  # Clean state
+ 
+     # Without API key: 3/sec
+     limiter = get_pubmed_limiter(api_key=None)
+     print(f"  Rate without key: {limiter.rate}")
+ 
+     # Multiple tools should share the same limiter
+     tool1 = PubMedTool()
+     tool2 = PubMedTool()
+ 
+     # Verify they share the limiter
+     print(f"  Tools share limiter: {tool1._limiter is tool2._limiter}")
+ 
+ 
+ async def test_concurrent_requests():
+     """Test rate limiting under concurrent load."""
+     print("\n[Test 3] Testing concurrent request limiting...")
+ 
+     limiter = RateLimiter("5/second")
+ 
+     async def make_request(i: int):
+         await limiter.acquire()
+         return time.monotonic()
+ 
+     start = time.monotonic()
+     # Launch 10 concurrent requests
+     tasks = [make_request(i) for i in range(10)]
+     times = await asyncio.gather(*tasks)
+ 
+     # Calculate distribution
+     relative_times = [t - start for t in times]
+     print(f"  Request times: {[f'{t:.2f}s' for t in sorted(relative_times)]}")
+ 
+     total = max(relative_times)
+     print(f"  All 10 requests completed in {total:.2f}s (expected ~2s)")
+ 
+ 
+ async def main():
+     await test_basic_limiter()
+     await test_pubmed_limiter()
+     await test_concurrent_requests()
+ 
+     print("\n" + "=" * 60)
+     print("Demo complete!")
+ 
+ 
+ if __name__ == "__main__":
+     asyncio.run(main())
pyproject.toml CHANGED
@@ -25,6 +25,8 @@ dependencies = [
    "structlog>=24.1",  # Structured logging
    "requests>=2.32.5",  # ClinicalTrials.gov (httpx blocked by WAF)
    "pydantic-graph>=1.22.0",
+     "limits>=3.0",  # Rate limiting
+     "duckduckgo-search>=5.0",  # Web search
]

[project.optional-dependencies]
@@ -44,7 +46,7 @@ dev = [
    "pre-commit>=3.7",
]
magentic = [
-     "agent-framework-core>=1.0.0b251120,<2.0.0",  # Pin to avoid breaking changes
+     "agent-framework-core>=1.0.0b251120,<2.0.0",  # Microsoft Agent Framework (PyPI)
]
embeddings = [
    "chromadb>=0.4.0",
requirements.txt CHANGED
@@ -7,6 +7,12 @@ pydantic-ai>=0.0.16
openai>=1.0.0
anthropic>=0.18.0

+ # Multi-agent orchestration (Advanced mode)
+ agent-framework-core>=1.0.0b251120
+ 
+ # Web search
+ duckduckgo-search>=5.0
+ 
# HTTP & Parsing
httpx>=0.27
beautifulsoup4>=4.12
@@ -20,6 +26,7 @@ python-dotenv>=1.0
tenacity>=8.2
structlog>=24.1
requests>=2.32.5
+ limits>=3.0  # Rate limiting (Phase 17)

# Optional: Modal for code execution
modal>=0.63.0
src/agent_factory/judges.py CHANGED
@@ -8,8 +8,10 @@ import structlog
from huggingface_hub import InferenceClient
from pydantic_ai import Agent
from pydantic_ai.models.anthropic import AnthropicModel
+ from pydantic_ai.models.huggingface import HuggingFaceModel
from pydantic_ai.models.openai import OpenAIModel
from pydantic_ai.providers.anthropic import AnthropicProvider
+ from pydantic_ai.providers.huggingface import HuggingFaceProvider
from pydantic_ai.providers.openai import OpenAIProvider
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

@@ -36,6 +38,12 @@ def get_model() -> Any:
        provider = AnthropicProvider(api_key=settings.anthropic_api_key)
        return AnthropicModel(settings.anthropic_model, provider=provider)

+     if llm_provider == "huggingface":
+         # Free tier - uses HF_TOKEN from environment if available
+         model_name = settings.huggingface_model or "meta-llama/Llama-3.1-70B-Instruct"
+         hf_provider = HuggingFaceProvider(api_key=settings.hf_token)
+         return HuggingFaceModel(model_name, provider=hf_provider)
+ 
    if llm_provider != "openai":
        logger.warning("Unknown LLM provider, defaulting to OpenAI", provider=llm_provider)

@@ -434,7 +442,7 @@ class MockJudgeHandler:
            clinical_evidence_score=clinical_score,
            clinical_reasoning=(
                f"Demo mode: {evidence_count} sources retrieved from PubMed, "
-                 "ClinicalTrials.gov, and bioRxiv. Full analysis requires LLM API key."
+                 "ClinicalTrials.gov, and Europe PMC. Full analysis requires LLM API key."
            ),
            drug_candidates=drug_candidates,
            key_findings=key_findings,
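A quick sketch of how the new branch is selected at runtime. It assumes a fresh interpreter in which `Settings` reads `LLM_PROVIDER` and `HF_TOKEN` from the environment before `src.agent_factory.judges` is first imported; the token value is a placeholder.

```python
# Hypothetical driver: select the free-tier HuggingFace judge model.
import os

os.environ["LLM_PROVIDER"] = "huggingface"
os.environ["HF_TOKEN"] = "hf_your_token_here"  # placeholder, not a real token

from src.agent_factory.judges import get_model

model = get_model()
print(type(model).__name__)  # expected: HuggingFaceModel
```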
src/agents/code_executor_agent.py ADDED
@@ -0,0 +1,69 @@
+ """Code execution agent using Modal."""
+ 
+ import asyncio
+ 
+ import structlog
+ from agent_framework import ChatAgent, ai_function
+ from agent_framework.openai import OpenAIChatClient
+ 
+ from src.tools.code_execution import get_code_executor
+ from src.utils.config import settings
+ 
+ logger = structlog.get_logger()
+ 
+ 
+ @ai_function  # type: ignore[arg-type, misc]
+ async def execute_python_code(code: str) -> str:
+     """Execute Python code in a secure sandbox.
+ 
+     Args:
+         code: The Python code to execute.
+ 
+     Returns:
+         The standard output and standard error of the execution.
+     """
+     logger.info("Code execution starting", code_length=len(code))
+     executor = get_code_executor()
+     loop = asyncio.get_running_loop()
+ 
+     # Run in executor to avoid blocking
+     try:
+         result = await loop.run_in_executor(None, lambda: executor.execute(code))
+         if result["success"]:
+             logger.info("Code execution succeeded")
+             return f"Stdout:\n{result['stdout']}"
+         else:
+             logger.warning("Code execution failed", error=result.get("error"))
+             return f"Error:\n{result['error']}\nStderr:\n{result['stderr']}"
+     except Exception as e:
+         logger.error("Code execution exception", error=str(e))
+         return f"Execution failed: {e}"
+ 
+ 
+ def create_code_executor_agent(chat_client: OpenAIChatClient | None = None) -> ChatAgent:
+     """Create a code executor agent.
+ 
+     Args:
+         chat_client: Optional custom chat client.
+ 
+     Returns:
+         ChatAgent configured for code execution.
+     """
+     client = chat_client or OpenAIChatClient(
+         model_id=settings.openai_model,
+         api_key=settings.openai_api_key,
+     )
+ 
+     return ChatAgent(
+         name="CodeExecutorAgent",
+         description="Executes Python code for data analysis, calculation, and simulation.",
+         instructions="""You are a code execution expert.
+ When asked to analyze data or perform calculations, write Python code and execute it.
+ Use libraries like pandas, numpy, scipy, matplotlib.
+ 
+ Always output the code you want to execute using the `execute_python_code` tool.
+ Check the output and interpret the results.""",
+         chat_client=client,
+         tools=[execute_python_code],
+         temperature=0.0,  # Strict code generation
+     )
src/agents/judge_agent_llm.py ADDED
@@ -0,0 +1,45 @@
+ """LLM Judge for sub-iterations."""
+ 
+ from typing import Any
+ 
+ import structlog
+ from pydantic_ai import Agent
+ 
+ from src.agent_factory.judges import get_model
+ from src.utils.models import JudgeAssessment
+ 
+ logger = structlog.get_logger()
+ 
+ 
+ class LLMSubIterationJudge:
+     """Judge that uses an LLM to assess sub-iteration results."""
+ 
+     def __init__(self) -> None:
+         self.model = get_model()
+         self.agent = Agent(
+             model=self.model,
+             output_type=JudgeAssessment,
+             system_prompt="""You are a strict judge evaluating a research task.
+ 
+ Evaluate if the result is sufficient to answer the task.
+ Provide scores and detailed reasoning.
+ If not sufficient, suggest next steps.""",
+             retries=3,
+         )
+ 
+     async def assess(self, task: str, result: Any, history: list[Any]) -> JudgeAssessment:
+         """Assess the result using LLM."""
+         logger.info("LLM judge assessing result", task=task[:100], history_len=len(history))
+ 
+         prompt = f"""Task: {task}
+ 
+ Current Result:
+ {str(result)[:4000]}
+ 
+ History of previous attempts: {len(history)}
+ 
+ Evaluate validity and sufficiency."""
+ 
+         run_result = await self.agent.run(prompt)
+         logger.info("LLM judge assessment complete", sufficient=run_result.output.sufficient)
+         return run_result.output
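A smoke-test sketch for the judge, assuming a configured LLM provider. The `sufficient` and `reasoning` fields are the ones the sub-iteration middleware below reads from `JudgeAssessment`.

```python
# Hypothetical one-shot assessment.
import asyncio

from src.agents.judge_agent_llm import LLMSubIterationJudge


async def main() -> None:
    judge = LLMSubIterationJudge()
    assessment = await judge.assess(
        task="Find evidence that metformin extends lifespan in mice.",
        result="Three PubMed abstracts report lifespan extension in C57BL/6 mice.",
        history=[],
    )
    print(assessment.sufficient, assessment.reasoning)


asyncio.run(main())
```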
src/agents/magentic_agents.py CHANGED
@@ -29,7 +29,7 @@ def create_search_agent(chat_client: OpenAIChatClient | None = None) -> ChatAgen
    return ChatAgent(
        name="SearchAgent",
        description=(
-             "Searches biomedical databases (PubMed, ClinicalTrials.gov, bioRxiv) "
+             "Searches biomedical databases (PubMed, ClinicalTrials.gov, Europe PMC) "
            "for drug repurposing evidence"
        ),
        instructions="""You are a biomedical search specialist. When asked to find evidence:
src/agents/retrieval_agent.py ADDED
@@ -0,0 +1,82 @@
+ """Retrieval agent for web search and context management."""
+ 
+ import structlog
+ from agent_framework import ChatAgent, ai_function
+ from agent_framework.openai import OpenAIChatClient
+ 
+ from src.state import get_magentic_state
+ from src.tools.web_search import WebSearchTool
+ from src.utils.config import settings
+ 
+ logger = structlog.get_logger()
+ 
+ _web_search = WebSearchTool()
+ 
+ 
+ @ai_function  # type: ignore[arg-type, misc]
+ async def search_web(query: str, max_results: int = 10) -> str:
+     """Search the web using DuckDuckGo.
+ 
+     Args:
+         query: Search keywords.
+         max_results: Maximum results to return (default 10).
+ 
+     Returns:
+         Formatted search results.
+     """
+     logger.info("Web search starting", query=query, max_results=max_results)
+     state = get_magentic_state()
+ 
+     results = await _web_search.search(query, max_results)
+     if not results.evidence:
+         logger.info("Web search returned no results", query=query)
+         return f"No web results found for: {query}"
+ 
+     # Update state
+     # We add *all* found results to state
+     new_count = state.add_evidence(results.evidence)
+     logger.info(
+         "Web search complete",
+         query=query,
+         results_found=len(results.evidence),
+         new_evidence=new_count,
+     )
+ 
+     # Use embedding service for deduplication/indexing if available
+     if state.embedding_service:
+         # This method also adds to vector DB as a side effect for unique items
+         await state.embedding_service.deduplicate(results.evidence)
+ 
+     output = [f"Found {len(results.evidence)} web results ({new_count} new stored):\n"]
+     for i, r in enumerate(results.evidence[:max_results], 1):
+         output.append(f"{i}. **{r.citation.title}**")
+         output.append(f"   Source: {r.citation.url}")
+         output.append(f"   {r.content[:300]}...\n")
+ 
+     return "\n".join(output)
+ 
+ 
+ def create_retrieval_agent(chat_client: OpenAIChatClient | None = None) -> ChatAgent:
+     """Create a retrieval agent.
+ 
+     Args:
+         chat_client: Optional custom chat client.
+ 
+     Returns:
+         ChatAgent configured for retrieval.
+     """
+     client = chat_client or OpenAIChatClient(
+         model_id=settings.openai_model,
+         api_key=settings.openai_api_key,
+     )
+ 
+     return ChatAgent(
+         name="RetrievalAgent",
+         description="Searches the web and manages context/evidence.",
+         instructions="""You are a retrieval specialist.
+ Use `search_web` to find information on the internet.
+ Your goal is to gather relevant evidence for the research task.
+ Always summarize what you found.""",
+         chat_client=client,
+         tools=[search_web],
+     )
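A usage sketch, assuming an OpenAI key is available. The state must be initialized first, since `search_web` stores evidence into the shared magentic state; the no-argument `init_magentic_state()` call mirrors the fallback used by the hierarchical orchestrator below.

```python
# Hypothetical driver: let the retrieval agent decide when to call `search_web`.
import asyncio

from src.agents.retrieval_agent import create_retrieval_agent
from src.state import init_magentic_state


async def main() -> None:
    init_magentic_state()  # state must exist before the tool stores evidence
    agent = create_retrieval_agent()
    response = await agent.run("Find recent reviews on metformin and aging.")
    print(response.text)  # assumed accessor, as in the code-executor example


asyncio.run(main())
```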
src/app.py CHANGED
@@ -31,7 +31,7 @@ def configure_orchestrator(

    Args:
        use_mock: If True, use MockJudgeHandler (no API key needed)
-         mode: Orchestrator mode ("simple" or "magentic")
+         mode: Orchestrator mode ("simple" or "advanced")
        user_api_key: Optional user-provided API key (BYOK)
        api_provider: API provider ("openai" or "anthropic")

@@ -115,7 +115,7 @@ async def research_agent(
    Args:
        message: User's research question
        history: Chat history (Gradio format)
-         mode: Orchestrator mode ("simple" or "magentic")
+         mode: Orchestrator mode ("simple" or "advanced")
        api_key: Optional user-provided API key (BYOK - Bring Your Own Key)
        api_provider: API provider ("openai" or "anthropic")

@@ -135,10 +135,11 @@ async def research_agent(
    has_user_key = bool(user_api_key)
    has_paid_key = has_openai or has_anthropic or has_user_key

-     # Magentic mode requires OpenAI specifically
-     if mode == "magentic" and not (has_openai or (has_user_key and api_provider == "openai")):
+     # Advanced mode requires OpenAI specifically (due to agent-framework binding)
+     if mode == "advanced" and not (has_openai or (has_user_key and api_provider == "openai")):
        yield (
-             "⚠️ **Warning**: Magentic mode requires OpenAI API key. Falling back to simple mode.\n\n"
+             "⚠️ **Warning**: Advanced mode currently requires OpenAI API key. "
+             "Falling back to simple mode.\n\n"
        )
        mode = "simple"

@@ -186,78 +187,68 @@ async def research_agent(
        yield f"❌ **Error**: {e!s}"


- def create_demo() -> Any:
+ def create_demo() -> gr.ChatInterface:
    """
    Create the Gradio demo interface with MCP support.

    Returns:
        Configured Gradio Blocks interface with MCP server enabled
    """
-     with gr.Blocks(
-         title="DeepCritical - Drug Repurposing Research Agent",
-     ) as demo:
-         # 1. Minimal Header (Option A: 2 lines max)
-         gr.Markdown(
-             "# 🧬 DeepCritical\n"
-             "*AI-Powered Drug Repurposing Agent — searches PubMed, ClinicalTrials.gov & bioRxiv*"
-         )
- 
-         # 2. Main Chat Interface
-         # Config inputs will be in a collapsed accordion below the chat input
-         gr.ChatInterface(
-             fn=research_agent,
-             examples=[
-                 [
-                     "What drugs could be repurposed for Alzheimer's disease?",
-                     "simple",
-                     "",
-                     "openai",
-                 ],
-                 [
-                     "Is metformin effective for treating cancer?",
-                     "simple",
-                     "",
-                     "openai",
-                 ],
-                 [
-                     "What medications show promise for Long COVID treatment?",
-                     "simple",
-                     "",
-                     "openai",
-                 ],
-             ],
-             additional_inputs_accordion=gr.Accordion(label="⚙️ Settings", open=False),
-             additional_inputs=[
-                 gr.Radio(
-                     choices=["simple", "magentic"],
-                     value="simple",
-                     label="Orchestrator Mode",
-                     info="Simple: Linear | Magentic: Multi-Agent (OpenAI)",
-                 ),
-                 gr.Textbox(
-                     label="🔑 API Key (Optional - BYOK)",
-                     placeholder="sk-... or sk-ant-...",
-                     type="password",
-                     info="Enter your own API key. Never stored.",
-                 ),
-                 gr.Radio(
-                     choices=["openai", "anthropic"],
-                     value="openai",
-                     label="API Provider",
-                     info="Select the provider for your API key",
-                 ),
-             ],
-         )
- 
-         # 3. Minimal Footer (Option C: Remove MCP Tabs, keep info)
-         gr.Markdown(
-             """
-             ---
-             *Research tool only — not for medical advice.*
-             **MCP Server Active**: Connect Claude Desktop to `/gradio_api/mcp/`
-             """,
-             elem_classes=["footer"],
-         )
+     # 1. Unwrapped ChatInterface (Fixes Accordion Bug)
+     demo = gr.ChatInterface(
+         fn=research_agent,
+         title="🧬 DeepCritical",
+         description=(
+             "*AI-Powered Drug Repurposing Agent — searches PubMed, "
+             "ClinicalTrials.gov & Europe PMC*\n\n"
+             "---\n"
+             "*Research tool only — not for medical advice.*  \n"
+             "**MCP Server Active**: Connect Claude Desktop to `/gradio_api/mcp/`"
+         ),
+         examples=[
+             [
+                 "What drugs could be repurposed for Alzheimer's disease?",
+                 "simple",
+                 "",
+                 "openai",
+             ],
+             [
+                 "Is metformin effective for treating cancer?",
+                 "simple",
+                 "",
+                 "openai",
+             ],
+             [
+                 "What medications show promise for Long COVID treatment?",
+                 "simple",
+                 "",
+                 "openai",
+             ],
+         ],
+         additional_inputs_accordion=gr.Accordion(label="⚙️ Settings", open=False),
+         additional_inputs=[
+             gr.Radio(
+                 choices=["simple", "advanced"],
+                 value="simple",
+                 label="Orchestrator Mode",
+                 info=(
+                     "Simple: Linear (Free Tier Friendly) | Advanced: Multi-Agent (Requires OpenAI)"
+                 ),
+             ),
+             gr.Textbox(
+                 label="🔑 API Key (Optional - BYOK)",
+                 placeholder="sk-... or sk-ant-...",
+                 type="password",
+                 info="Enter your own API key. Never stored.",
+             ),
+             gr.Radio(
+                 choices=["openai", "anthropic"],
+                 value="openai",
+                 label="API Provider",
+                 info="Select the provider for your API key",
+             ),
+         ],
+     )

    return demo

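For context, a sketch of how the interface would be served with the MCP endpoint the footer advertises. The `mcp_server=True` flag is Gradio's MCP launch option and is an assumption here; the actual launch call lives outside this hunk.

```python
# Hypothetical entry point for the app.
from src.app import create_demo

if __name__ == "__main__":
    demo = create_demo()
    demo.launch(mcp_server=True)  # assumed flag; serves /gradio_api/mcp/
```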
src/middleware/sub_iteration.py ADDED
@@ -0,0 +1,135 @@
+ """Middleware for orchestrating sub-iterations with research teams and judges."""
+ 
+ from typing import Any, Protocol
+ 
+ import structlog
+ 
+ from src.utils.models import AgentEvent, JudgeAssessment
+ 
+ logger = structlog.get_logger()
+ 
+ 
+ class SubIterationTeam(Protocol):
+     """Protocol for a research team that executes a sub-task."""
+ 
+     async def execute(self, task: str) -> Any:
+         """Execute the sub-task and return a result."""
+         ...
+ 
+ 
+ class SubIterationJudge(Protocol):
+     """Protocol for a judge that evaluates the sub-task result."""
+ 
+     async def assess(self, task: str, result: Any, history: list[Any]) -> JudgeAssessment:
+         """Assess the quality of the result."""
+         ...
+ 
+ 
+ class SubIterationMiddleware:
+     """
+     Middleware that manages a sub-iteration loop:
+     1. Orchestrator delegates to a Research Team.
+     2. Research Team produces a result.
+     3. Judge evaluates the result.
+     4. Loop continues until Judge approves or max iterations reached.
+     """
+ 
+     def __init__(
+         self,
+         team: SubIterationTeam,
+         judge: SubIterationJudge,
+         max_iterations: int = 3,
+     ):
+         self.team = team
+         self.judge = judge
+         self.max_iterations = max_iterations
+ 
+     async def run(
+         self,
+         task: str,
+         event_callback: Any = None,  # Optional callback for streaming events
+     ) -> tuple[Any, JudgeAssessment | None]:
+         """
+         Run the sub-iteration loop.
+ 
+         Args:
+             task: The research task or question.
+             event_callback: Async callable to report events (e.g. to UI).
+ 
+         Returns:
+             Tuple of (best_result, final_assessment).
+         """
+         history: list[Any] = []
+         best_result: Any = None
+         final_assessment: JudgeAssessment | None = None
+ 
+         for i in range(1, self.max_iterations + 1):
+             logger.info("Sub-iteration starting", iteration=i, task=task)
+ 
+             if event_callback:
+                 await event_callback(
+                     AgentEvent(
+                         type="looping",
+                         message=f"Sub-iteration {i}: Executing task...",
+                         iteration=i,
+                     )
+                 )
+ 
+             # 1. Team Execution
+             try:
+                 result = await self.team.execute(task)
+                 history.append(result)
+                 best_result = result  # Assume latest is best for now
+             except Exception as e:
+                 logger.error("Sub-iteration execution failed", error=str(e))
+                 if event_callback:
+                     await event_callback(
+                         AgentEvent(
+                             type="error",
+                             message=f"Sub-iteration execution failed: {e}",
+                             iteration=i,
+                         )
+                     )
+                 return best_result, final_assessment
+ 
+             # 2. Judge Assessment
+             try:
+                 assessment = await self.judge.assess(task, result, history)
+                 final_assessment = assessment
+             except Exception as e:
+                 logger.error("Sub-iteration judge failed", error=str(e))
+                 if event_callback:
+                     await event_callback(
+                         AgentEvent(
+                             type="error",
+                             message=f"Sub-iteration judge failed: {e}",
+                             iteration=i,
+                         )
+                     )
+                 return best_result, final_assessment
+ 
+             # 3. Decision
+             if assessment.sufficient:
+                 logger.info("Sub-iteration sufficient", iteration=i)
+                 return best_result, assessment
+ 
+             # If not sufficient, we might refine the task for the next iteration.
+             # For this implementation, we assume the team is smart enough or the task stays the same,
+             # but we could append feedback to the task.
+ 
+             feedback = assessment.reasoning
+             logger.info("Sub-iteration insufficient", feedback=feedback)
+ 
+             if event_callback:
+                 await event_callback(
+                     AgentEvent(
+                         type="looping",
+                         message=(
+                             f"Sub-iteration {i} result insufficient. Feedback: {feedback[:100]}..."
+                         ),
+                         iteration=i,
+                     )
+                 )
+ 
+         logger.warning("Sub-iteration max iterations reached", task=task)
+         return best_result, final_assessment
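Because `SubIterationTeam` is a `Protocol`, anything with a matching `execute` coroutine satisfies it structurally. A minimal sketch wiring the loop with a toy team (the judge requires a configured LLM provider):

```python
# Toy team + real judge driving the middleware.
import asyncio

from src.agents.judge_agent_llm import LLMSubIterationJudge
from src.middleware.sub_iteration import SubIterationMiddleware


class EchoTeam:
    """Stand-in team: satisfies SubIterationTeam without subclassing."""

    async def execute(self, task: str) -> str:
        return f"Draft answer for: {task}"


async def main() -> None:
    middleware = SubIterationMiddleware(EchoTeam(), LLMSubIterationJudge(), max_iterations=2)
    result, assessment = await middleware.run("Summarize metformin repurposing leads.")
    print(result)
    print(assessment.sufficient if assessment else None)


asyncio.run(main())
```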
src/orchestrator_factory.py CHANGED
@@ -9,12 +9,29 @@ from src.legacy_orchestrator import (
)
from src.utils.models import OrchestratorConfig

+ import structlog
+ 
+ logger = structlog.get_logger()
+ 
+ 
+ def _get_magentic_orchestrator_class() -> Any:
+     """Import MagenticOrchestrator lazily to avoid hard dependency."""
+     try:
+         from src.orchestrator_magentic import MagenticOrchestrator
+ 
+         return MagenticOrchestrator
+     except ImportError as e:
+         logger.error("Failed to import MagenticOrchestrator", error=str(e))
+         raise ValueError(
+             "Advanced mode requires agent-framework-core. Please install it or use mode='simple'."
+         ) from e
+ 

def create_orchestrator(
    search_handler: SearchHandlerProtocol | None = None,
    judge_handler: JudgeHandlerProtocol | None = None,
    config: OrchestratorConfig | None = None,
-     mode: Literal["simple", "magentic"] = "simple",
+     mode: Literal["simple", "magentic", "advanced"] | None = None,
) -> Any:
    """
    Create an orchestrator instance.
@@ -23,25 +40,19 @@ def create_orchestrator(
        search_handler: The search handler (required for simple mode)
        judge_handler: The judge handler (required for simple mode)
        config: Optional configuration
-         mode: "simple" for Phase 4 loop, "magentic" for ChatAgent-based multi-agent
+         mode: "simple", "magentic", "advanced" or None (auto-detect)

    Returns:
        Orchestrator instance
- 
-     Note:
-         Magentic mode does NOT use search_handler/judge_handler.
-         It creates ChatAgent instances with internal LLMs that call tools directly.
    """
-     if mode == "magentic":
-         try:
-             from src.orchestrator_magentic import MagenticOrchestrator
- 
-             return MagenticOrchestrator(
-                 max_rounds=config.max_iterations if config else 10,
-             )
-         except ImportError:
-             # Fallback to simple if agent-framework not installed
-             pass
+     effective_mode = _determine_mode(mode)
+     logger.info("Creating orchestrator", mode=effective_mode)
+ 
+     if effective_mode == "advanced":
+         orchestrator_cls = _get_magentic_orchestrator_class()
+         return orchestrator_cls(
+             max_rounds=config.max_iterations if config else 10,
+         )

    # Simple mode requires handlers
    if search_handler is None or judge_handler is None:
@@ -52,3 +63,17 @@ def create_orchestrator(
        judge_handler=judge_handler,
        config=config,
    )
+ 
+ 
+ def _determine_mode(explicit_mode: str | None) -> str:
+     """Determine which mode to use."""
+     if explicit_mode:
+         if explicit_mode in ("magentic", "advanced"):
+             return "advanced"
+         return "simple"
+ 
+     # Auto-detect: advanced if paid API key available
+     if settings.has_openai_key:
+         return "advanced"
+ 
+     return "simple"
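A sketch of the three call shapes the factory now accepts, assuming `OPENAI_API_KEY` is set so both the explicit and auto-detected paths resolve to "advanced":

```python
# Mode resolution after this change (all three return a MagenticOrchestrator
# when agent-framework-core is installed; ValueError otherwise).
from src.orchestrator_factory import create_orchestrator

advanced = create_orchestrator(mode="advanced")  # explicit advanced mode
legacy = create_orchestrator(mode="magentic")    # old spelling, normalized to "advanced"
auto = create_orchestrator()                     # mode=None -> checks settings.has_openai_key
```

Without an OpenAI key, the auto-detect path falls back to simple mode and then requires `search_handler` and `judge_handler` to be supplied.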
src/orchestrator_hierarchical.py ADDED
@@ -0,0 +1,95 @@
+ """Hierarchical orchestrator using middleware and sub-teams."""
+ 
+ import asyncio
+ from collections.abc import AsyncGenerator
+ 
+ import structlog
+ 
+ from src.agents.judge_agent_llm import LLMSubIterationJudge
+ from src.agents.magentic_agents import create_search_agent
+ from src.middleware.sub_iteration import SubIterationMiddleware, SubIterationTeam
+ from src.services.embeddings import get_embedding_service
+ from src.state import init_magentic_state
+ from src.utils.models import AgentEvent
+ 
+ logger = structlog.get_logger()
+ 
+ 
+ class ResearchTeam(SubIterationTeam):
+     """Adapts Magentic ChatAgent to SubIterationTeam protocol."""
+ 
+     def __init__(self) -> None:
+         self.agent = create_search_agent()
+ 
+     async def execute(self, task: str) -> str:
+         response = await self.agent.run(task)
+         if response.messages:
+             for msg in reversed(response.messages):
+                 if msg.role == "assistant" and msg.text:
+                     return str(msg.text)
+         return "No response from agent."
+ 
+ 
+ class HierarchicalOrchestrator:
+     """Orchestrator that uses hierarchical teams and sub-iterations."""
+ 
+     def __init__(self) -> None:
+         self.team = ResearchTeam()
+         self.judge = LLMSubIterationJudge()
+         self.middleware = SubIterationMiddleware(self.team, self.judge, max_iterations=5)
+ 
+     async def run(self, query: str) -> AsyncGenerator[AgentEvent, None]:
+         logger.info("Starting hierarchical orchestrator", query=query)
+ 
+         try:
+             service = get_embedding_service()
+             init_magentic_state(service)
+         except Exception as e:
+             logger.warning(
+                 "Embedding service initialization failed, using default state",
+                 error=str(e),
+             )
+             init_magentic_state()
+ 
+         yield AgentEvent(type="started", message=f"Starting research: {query}")
+ 
+         queue: asyncio.Queue[AgentEvent | None] = asyncio.Queue()
+ 
+         async def event_callback(event: AgentEvent) -> None:
+             await queue.put(event)
+ 
+         task_future = asyncio.create_task(self.middleware.run(query, event_callback))
+ 
+         while not task_future.done():
+             get_event = asyncio.create_task(queue.get())
+             done, _ = await asyncio.wait(
+                 {task_future, get_event}, return_when=asyncio.FIRST_COMPLETED
+             )
+ 
+             if get_event in done:
+                 event = get_event.result()
+                 if event:
+                     yield event
+             else:
+                 get_event.cancel()
+ 
+         # Process remaining events
+         while not queue.empty():
+             ev = queue.get_nowait()
+             if ev:
+                 yield ev
+ 
+         try:
+             result, assessment = await task_future
+ 
+             assessment_text = assessment.reasoning if assessment else "None"
+             yield AgentEvent(
+                 type="complete",
+                 message=(
+                     f"Research complete.\n\nResult:\n{result}\n\nAssessment:\n{assessment_text}"
+                 ),
+                 data={"assessment": assessment.model_dump() if assessment else None},
+             )
+         except Exception as e:
+             logger.error("Orchestrator failed", error=str(e))
+             yield AgentEvent(type="error", message=f"Orchestrator failed: {e}")
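A driver sketch for consuming the event stream, assuming OpenAI credentials and the optional embedding dependencies are available:

```python
# Hypothetical consumer of the hierarchical orchestrator's AgentEvent stream.
import asyncio

from src.orchestrator_hierarchical import HierarchicalOrchestrator


async def main() -> None:
    orch = HierarchicalOrchestrator()
    async for event in orch.run("What drugs could be repurposed for Long COVID?"):
        print(f"[{event.type}] {event.message[:120]}")


asyncio.run(main())
```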
src/orchestrator_magentic.py CHANGED
@@ -128,7 +128,7 @@ class MagenticOrchestrator:
        task = f"""Research drug repurposing opportunities for: {query}

Workflow:
- 1. SearchAgent: Find evidence from PubMed, ClinicalTrials.gov, and bioRxiv
+ 1. SearchAgent: Find evidence from PubMed, ClinicalTrials.gov, and Europe PMC
2. HypothesisAgent: Generate mechanistic hypotheses (Drug -> Target -> Pathway -> Effect)
3. JudgeAgent: Evaluate if evidence is sufficient
4. If insufficient -> SearchAgent refines search based on gaps
@@ -158,10 +158,41 @@ The final output should be a structured research report."""
                iteration=iteration,
            )

+     def _extract_text(self, message: Any) -> str:
+         """
+         Defensively extract text from a message object.
+ 
+         Fixes bug where message.text might return the object itself or its repr.
+         """
+         if not message:
+             return ""
+ 
+         # Priority 1: .content (often the raw string or list of content)
+         if hasattr(message, "content") and message.content:
+             content = message.content
+             # If it's a list (e.g., Multi-modal), join text parts
+             if isinstance(content, list):
+                 return " ".join([str(c.text) for c in content if hasattr(c, "text")])
+             return str(content)
+ 
+         # Priority 2: .text (standard, but sometimes buggy/missing)
+         if hasattr(message, "text") and message.text:
+             # Verify it's not the object itself or a repr string
+             text = str(message.text)
+             if text.startswith("<") and "object at" in text:
+                 # Likely a repr string, ignore if possible
+                 pass
+             else:
+                 return text
+ 
+         # Fallback: If we can't find clean text, return str(message),
+         # taking care to avoid infinite recursion if str() calls .text
+         return str(message)
+ 
    def _process_event(self, event: Any, iteration: int) -> AgentEvent | None:
        """Process workflow event into AgentEvent."""
        if isinstance(event, MagenticOrchestratorMessageEvent):
-             text = event.message.text if event.message else ""
+             text = self._extract_text(event.message)
            if text:
                return AgentEvent(
                    type="judging",
@@ -171,7 +202,7 @@ The final output should be a structured research report."""

        elif isinstance(event, MagenticAgentMessageEvent):
            agent_name = event.agent_id or "unknown"
-             text = event.message.text if event.message else ""
+             text = self._extract_text(event.message)

            event_type = "judging"
            if "search" in agent_name.lower():
@@ -190,7 +221,7 @@ The final output should be a structured research report."""
            )

        elif isinstance(event, MagenticFinalResultEvent):
-             text = event.message.text if event.message else "No result"
+             text = self._extract_text(event.message) if event.message else "No result"
            return AgentEvent(
                type="complete",
                message=text,
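A toy check of the extraction order: `.content` wins over a repr-looking `.text`. The `__new__` trick skips `__init__` (and thus any API setup), which works here only because `_extract_text` never touches `self`; importing the module still requires the magentic extra.

```python
# Duck-typed message: .content is clean, .text looks like an object repr.
from types import SimpleNamespace

from src.orchestrator_magentic import MagenticOrchestrator

orch = MagenticOrchestrator.__new__(MagenticOrchestrator)  # bypass __init__
msg = SimpleNamespace(content="hello", text="<Message object at 0x0>")
assert orch._extract_text(msg) == "hello"
```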
src/state/__init__.py ADDED
@@ -0,0 +1,9 @@
+ """State package - re-exports from agents.state for compatibility."""
+ 
+ from src.agents.state import (
+     MagenticState,
+     get_magentic_state,
+     init_magentic_state,
+ )
+ 
+ __all__ = ["MagenticState", "get_magentic_state", "init_magentic_state"]
src/tools/__init__.py CHANGED
@@ -1,6 +1,8 @@
"""Search tools package."""

from src.tools.base import SearchTool
+ from src.tools.clinicaltrials import ClinicalTrialsTool
+ from src.tools.europepmc import EuropePMCTool
from src.tools.pubmed import PubMedTool
from src.tools.rag_tool import RAGTool, create_rag_tool
from src.tools.search_handler import SearchHandler
src/tools/pubmed.py CHANGED
@@ -1,6 +1,5 @@
"""PubMed search tool using NCBI E-utilities."""

- import asyncio
from typing import Any

import httpx
@@ -8,6 +7,7 @@ import xmltodict
from tenacity import retry, stop_after_attempt, wait_exponential

from src.tools.query_utils import preprocess_query
+ from src.tools.rate_limiter import get_pubmed_limiter
from src.utils.config import settings
from src.utils.exceptions import RateLimitError, SearchError
from src.utils.models import Citation, Evidence
@@ -17,7 +17,6 @@ class PubMedTool:
    """Search tool for PubMed/NCBI."""

    BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
-     RATE_LIMIT_DELAY = 0.34  # ~3 requests/sec without API key
    HTTP_TOO_MANY_REQUESTS = 429

    def __init__(self, api_key: str | None = None) -> None:
@@ -25,7 +24,9 @@ class PubMedTool:
        # Ignore placeholder values from .env.example
        if self.api_key == "your-ncbi-key-here":
            self.api_key = None
-         self._last_request_time = 0.0
+ 
+         # Use shared rate limiter
+         self._limiter = get_pubmed_limiter(self.api_key)

    @property
    def name(self) -> str:
@@ -33,12 +34,7 @@ class PubMedTool:

    async def _rate_limit(self) -> None:
        """Enforce NCBI rate limiting."""
-         loop = asyncio.get_running_loop()
-         now = loop.time()
-         elapsed = now - self._last_request_time
-         if elapsed < self.RATE_LIMIT_DELAY:
-             await asyncio.sleep(self.RATE_LIMIT_DELAY - elapsed)
-         self._last_request_time = loop.time()
+         await self._limiter.acquire()

    def _build_params(self, **kwargs: Any) -> dict[str, Any]:
        """Build request params with optional API key."""
src/tools/rate_limiter.py ADDED
@@ -0,0 +1,121 @@
+ """Rate limiting utilities using the limits library."""
+ 
+ import asyncio
+ from typing import ClassVar
+ 
+ from limits import RateLimitItem, parse
+ from limits.storage import MemoryStorage
+ from limits.strategies import MovingWindowRateLimiter
+ 
+ 
+ class RateLimiter:
+     """
+     Async-compatible rate limiter using limits library.
+ 
+     Uses moving window algorithm for smooth rate limiting.
+     """
+ 
+     def __init__(self, rate: str) -> None:
+         """
+         Initialize rate limiter.
+ 
+         Args:
+             rate: Rate string like "3/second" or "10/second"
+         """
+         self.rate = rate
+         self._storage = MemoryStorage()
+         self._limiter = MovingWindowRateLimiter(self._storage)
+         self._rate_limit: RateLimitItem = parse(rate)
+         self._identity = "default"  # Single identity for shared limiting
+ 
+     async def acquire(self, wait: bool = True) -> bool:
+         """
+         Acquire permission to make a request.
+ 
+         ASYNC-SAFE: Uses asyncio.sleep(), never time.sleep().
+         The polling pattern allows other coroutines to run while waiting.
+ 
+         Args:
+             wait: If True, wait until allowed. If False, return immediately.
+ 
+         Returns:
+             True if allowed, False if not (only when wait=False)
+         """
+         while True:
+             # Check if we can proceed (synchronous, fast - ~microseconds)
+             if self._limiter.hit(self._rate_limit, self._identity):
+                 return True
+ 
+             if not wait:
+                 return False
+ 
+             # CRITICAL: Use asyncio.sleep(), NOT time.sleep()
+             # This yields control to the event loop, allowing other
+             # coroutines (UI, parallel searches) to run.
+             # Using 0.01s for fine-grained responsiveness.
+             await asyncio.sleep(0.01)
+ 
+     def reset(self) -> None:
+         """Reset the rate limiter (for testing)."""
+         self._storage.reset()
+ 
+ 
+ # Singleton limiter for PubMed/NCBI
+ _pubmed_limiter: RateLimiter | None = None
+ 
+ 
+ def get_pubmed_limiter(api_key: str | None = None) -> RateLimiter:
+     """
+     Get the shared PubMed rate limiter.
+ 
+     Rate depends on whether API key is provided:
+     - Without key: 3 requests/second
+     - With key: 10 requests/second
+ 
+     Args:
+         api_key: NCBI API key (optional)
+ 
+     Returns:
+         Shared RateLimiter instance
+     """
+     global _pubmed_limiter
+ 
+     if _pubmed_limiter is None:
+         rate = "10/second" if api_key else "3/second"
+         _pubmed_limiter = RateLimiter(rate)
+ 
+     return _pubmed_limiter
+ 
+ 
+ def reset_pubmed_limiter() -> None:
+     """Reset the PubMed limiter (for testing)."""
+     global _pubmed_limiter
+     _pubmed_limiter = None
+ 
+ 
+ # Factory for other APIs
+ class RateLimiterFactory:
+     """Factory for creating/getting rate limiters for different APIs."""
+ 
+     _limiters: ClassVar[dict[str, RateLimiter]] = {}
+ 
+     @classmethod
+     def get(cls, api_name: str, rate: str) -> RateLimiter:
+         """
+         Get or create a rate limiter for an API.
+ 
+         Args:
+             api_name: Unique identifier for the API
+             rate: Rate limit string (e.g., "10/second")
+ 
+         Returns:
+             RateLimiter instance (shared for same api_name)
+         """
+         if api_name not in cls._limiters:
+             cls._limiters[api_name] = RateLimiter(rate)
+         return cls._limiters[api_name]
+ 
+     @classmethod
+     def reset_all(cls) -> None:
+         """Reset all limiters (for testing)."""
+         cls._limiters.clear()
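Usage of the factory, showing that limiters are shared per API name across tool instances (the `"europepmc"` name is illustrative):

```python
# Per-API limiter shared across callers.
import asyncio

from src.tools.rate_limiter import RateLimiterFactory


async def main() -> None:
    limiter = RateLimiterFactory.get("europepmc", "10/second")
    # Same name -> same instance, so all callers share one moving window.
    assert limiter is RateLimiterFactory.get("europepmc", "10/second")
    for _ in range(3):
        await limiter.acquire()  # waits via asyncio.sleep once the window is full


asyncio.run(main())
```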
src/tools/web_search.py ADDED
@@ -0,0 +1,53 @@
+ """Web search tool using DuckDuckGo."""
+ 
+ import asyncio
+ 
+ import structlog
+ from duckduckgo_search import DDGS
+ 
+ from src.utils.models import Citation, Evidence, SearchResult
+ 
+ logger = structlog.get_logger()
+ 
+ 
+ class WebSearchTool:
+     """Tool for searching the web using DuckDuckGo."""
+ 
+     def __init__(self) -> None:
+         self._ddgs = DDGS()
+ 
+     async def search(self, query: str, max_results: int = 10) -> SearchResult:
+         """Execute a web search."""
+         try:
+             loop = asyncio.get_running_loop()
+ 
+             def _do_search() -> list[dict[str, str]]:
+                 # text() returns an iterator, need to list() it or iterate
+                 return list(self._ddgs.text(query, max_results=max_results))
+ 
+             raw_results = await loop.run_in_executor(None, _do_search)
+ 
+             evidence = []
+             for r in raw_results:
+                 ev = Evidence(
+                     content=r.get("body", ""),
+                     citation=Citation(
+                         title=r.get("title", "No Title"),
+                         url=r.get("href", ""),
+                         source="web",
+                         date="Unknown",
+                         authors=[],
+                     ),
+                     relevance=0.0,
+                 )
+                 evidence.append(ev)
+ 
+             return SearchResult(
+                 query=query, evidence=evidence, sources_searched=["web"], total_found=len(evidence)
+             )
+ 
+         except Exception as e:
+             logger.error("Web search failed", error=str(e))
+             return SearchResult(
+                 query=query, evidence=[], sources_searched=["web"], total_found=0, errors=[str(e)]
+             )
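The tool can also be exercised directly, without the retrieval agent (network access required):

```python
# Direct WebSearchTool usage.
import asyncio

from src.tools.web_search import WebSearchTool


async def main() -> None:
    result = await WebSearchTool().search("metformin drug repurposing", max_results=3)
    for ev in result.evidence:
        print(ev.citation.title, ev.citation.url)


asyncio.run(main())
```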
src/utils/config.py CHANGED
@@ -23,13 +23,20 @@ class Settings(BaseSettings):
    # LLM Configuration
    openai_api_key: str | None = Field(default=None, description="OpenAI API key")
    anthropic_api_key: str | None = Field(default=None, description="Anthropic API key")
-     llm_provider: Literal["openai", "anthropic"] = Field(
+     llm_provider: Literal["openai", "anthropic", "huggingface"] = Field(
        default="openai", description="Which LLM provider to use"
    )
    openai_model: str = Field(default="gpt-5.1", description="OpenAI model name")
    anthropic_model: str = Field(
        default="claude-sonnet-4-5-20250929", description="Anthropic model"
    )
+     # HuggingFace (free tier)
+     huggingface_model: str | None = Field(
+         default="meta-llama/Llama-3.1-70B-Instruct", description="HuggingFace model name"
+     )
+     hf_token: str | None = Field(
+         default=None, alias="HF_TOKEN", description="HuggingFace API token"
+     )

    # Embedding Configuration
    # Note: OpenAI embeddings require OPENAI_API_KEY (Anthropic has no embeddings API)
@@ -175,10 +182,15 @@ class Settings(BaseSettings):
        """Check if Anthropic API key is available."""
        return bool(self.anthropic_api_key)

+     @property
+     def has_huggingface_key(self) -> bool:
+         """Check if HuggingFace token is available."""
+         return bool(self.hf_token)
+ 
    @property
    def has_any_llm_key(self) -> bool:
        """Check if any LLM API key is available."""
-         return self.has_openai_key or self.has_anthropic_key
+         return self.has_openai_key or self.has_anthropic_key or self.has_huggingface_key

    @property
    def has_huggingface_key(self) -> bool:
src/utils/models.py CHANGED
@@ -36,6 +36,10 @@ class Evidence(BaseModel):
    content: str = Field(min_length=1, description="The actual text content")
    citation: Citation
    relevance: float = Field(default=0.0, ge=0.0, le=1.0, description="Relevance score 0-1")
+     metadata: dict[str, Any] = Field(
+         default_factory=dict,
+         description="Additional metadata (e.g., cited_by_count, concepts, is_open_access)",
+     )

    model_config = {"frozen": True}

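A sketch of attaching the new field; the keys are illustrative, mirroring the field description, and the model stays frozen as before:

```python
# Evidence with optional per-source metadata.
from src.utils.models import Citation, Evidence

ev = Evidence(
    content="Open-access review of metformin repurposing.",
    citation=Citation(
        source="openalex",  # hypothetical source tag
        title="Example Review",
        url="https://example.org",
        date="2024",
        authors=[],
    ),
    metadata={"cited_by_count": 42, "is_open_access": True},
)
print(ev.metadata["cited_by_count"])
```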
tests/integration/test_dual_mode_e2e.py ADDED
@@ -0,0 +1,82 @@
+ """End-to-End Integration Tests for Dual-Mode Architecture."""
+ 
+ from unittest.mock import AsyncMock, MagicMock, patch
+ 
+ import pytest
+ 
+ pytestmark = [pytest.mark.integration, pytest.mark.slow]
+ 
+ from src.orchestrator_factory import create_orchestrator
+ from src.utils.models import Citation, Evidence, OrchestratorConfig
+ 
+ 
+ @pytest.fixture
+ def mock_search_handler():
+     handler = MagicMock()
+     handler.execute = AsyncMock(
+         return_value=[
+             Evidence(
+                 citation=Citation(
+                     title="Test Paper", url="http://test", date="2024", source="pubmed"
+                 ),
+                 content="Metformin increases lifespan in mice.",
+             )
+         ]
+     )
+     return handler
+ 
+ 
+ @pytest.fixture
+ def mock_judge_handler():
+     handler = MagicMock()
+     # Mock return value of assess
+     assessment = MagicMock()
+     assessment.sufficient = True
+     assessment.recommendation = "synthesize"
+     handler.assess = AsyncMock(return_value=assessment)
+     return handler
+ 
+ 
+ @pytest.mark.asyncio
+ async def test_simple_mode_e2e(mock_search_handler, mock_judge_handler):
+     """Test Simple Mode Orchestration flow."""
+     orch = create_orchestrator(
+         search_handler=mock_search_handler,
+         judge_handler=mock_judge_handler,
+         mode="simple",
+         config=OrchestratorConfig(max_iterations=1),
+     )
+ 
+     # Run
+     results = []
+     async for event in orch.run("Test query"):
+         results.append(event)
+ 
+     assert len(results) > 0
+     assert mock_search_handler.execute.called
+     assert mock_judge_handler.assess.called
+ 
+ 
+ @pytest.mark.asyncio
+ async def test_advanced_mode_explicit_instantiation():
+     """Test explicit Advanced Mode instantiation (not auto-detect).
+ 
+     This tests the explicit mode="advanced" path, verifying that
+     MagenticOrchestrator can be instantiated when explicitly requested.
+     The settings patch ensures any internal checks pass.
+     """
+     with patch("src.orchestrator_factory.settings") as mock_settings:
+         # Settings patch ensures factory checks pass (even though mode is explicit)
+         mock_settings.has_openai_key = True
+ 
+         with patch("src.agents.magentic_agents.OpenAIChatClient"):
+             # Mock agent creation to avoid real API calls during init
+             with (
+                 patch("src.orchestrator_magentic.create_search_agent"),
+                 patch("src.orchestrator_magentic.create_judge_agent"),
+                 patch("src.orchestrator_magentic.create_hypothesis_agent"),
+                 patch("src.orchestrator_magentic.create_report_agent"),
+             ):
+                 # Explicit mode="advanced" - tests the explicit path, not auto-detect
+                 orch = create_orchestrator(mode="advanced")
+                 assert orch is not None
tests/integration/test_modal.py CHANGED
@@ -1,4 +1,4 @@
- """Integration tests for Modal (requires credentials)."""
+ """Integration tests for Modal (requires credentials and modal package)."""

import pytest

@@ -7,9 +7,18 @@ from src.utils.config import settings
# Check if any LLM API key is available
_llm_available = bool(settings.openai_api_key or settings.anthropic_api_key)

+ # Check if modal package is installed
+ try:
+     import modal  # noqa: F401
+ 
+     _modal_installed = True
+ except ImportError:
+     _modal_installed = False
+ 

@pytest.mark.integration
- @pytest.mark.skipif(not settings.modal_available, reason="Modal not configured")
+ @pytest.mark.skipif(not _modal_installed, reason="Modal package not installed")
+ @pytest.mark.skipif(not settings.modal_available, reason="Modal credentials not configured")
class TestModalIntegration:
    """Integration tests requiring Modal credentials."""

tests/unit/agent_factory/test_judges_factory.py ADDED
@@ -0,0 +1,64 @@
+ """Unit tests for Judge Factory and Model Selection."""
+ 
+ from unittest.mock import patch
+ 
+ import pytest
+ 
+ pytestmark = pytest.mark.unit
+ from pydantic_ai.models.anthropic import AnthropicModel
+ 
+ # We expect this import to exist after we implement it, or we mock it if it's not there yet
+ # For TDD, we assume we will use the library class
+ from pydantic_ai.models.huggingface import HuggingFaceModel
+ from pydantic_ai.models.openai import OpenAIModel
+ 
+ from src.agent_factory.judges import get_model
+ 
+ 
+ @pytest.fixture
+ def mock_settings():
+     with patch("src.agent_factory.judges.settings", autospec=True) as mock_settings:
+         yield mock_settings
+ 
+ 
+ def test_get_model_openai(mock_settings):
+     """Test that OpenAI model is returned when provider is openai."""
+     mock_settings.llm_provider = "openai"
+     mock_settings.openai_api_key = "sk-test"
+     mock_settings.openai_model = "gpt-5.1"
+ 
+     model = get_model()
+     assert isinstance(model, OpenAIModel)
+     assert model.model_name == "gpt-5.1"
+ 
+ 
+ def test_get_model_anthropic(mock_settings):
+     """Test that Anthropic model is returned when provider is anthropic."""
+     mock_settings.llm_provider = "anthropic"
+     mock_settings.anthropic_api_key = "sk-ant-test"
+     mock_settings.anthropic_model = "claude-sonnet-4-5-20250929"
+ 
+     model = get_model()
+     assert isinstance(model, AnthropicModel)
+     assert model.model_name == "claude-sonnet-4-5-20250929"
+ 
+ 
+ def test_get_model_huggingface(mock_settings):
+     """Test that HuggingFace model is returned when provider is huggingface."""
+     mock_settings.llm_provider = "huggingface"
+     mock_settings.hf_token = "hf_test_token"
+     mock_settings.huggingface_model = "meta-llama/Llama-3.1-70B-Instruct"
+ 
+     model = get_model()
+     assert isinstance(model, HuggingFaceModel)
+     assert model.model_name == "meta-llama/Llama-3.1-70B-Instruct"
+ 
+ 
+ def test_get_model_default_fallback(mock_settings):
+     """Test fallback to OpenAI if provider is unknown."""
+     mock_settings.llm_provider = "unknown_provider"
+     mock_settings.openai_api_key = "sk-test"
+     mock_settings.openai_model = "gpt-5.1"
+ 
+     model = get_model()
+     assert isinstance(model, OpenAIModel)