diff --git a/.env copy.example b/.env copy.example
deleted file mode 100644
index b8061357538326dd7fad717c627cdcfa5c0b3eb9..0000000000000000000000000000000000000000
--- a/.env copy.example
+++ /dev/null
@@ -1,124 +0,0 @@
-# ============== LLM CONFIGURATION ==============
-
-# Provider: "openai", "anthropic", or "huggingface"
-LLM_PROVIDER=openai
-
-# API Keys (at least one required for full LLM analysis)
-OPENAI_API_KEY=sk-your-key-here
-ANTHROPIC_API_KEY=sk-ant-your-key-here
-
-# Model names (optional - sensible defaults set in config.py)
-# OPENAI_MODEL=gpt-5.1
-# ANTHROPIC_MODEL=claude-sonnet-4-5-20250929
-
-# ============== HUGGINGFACE CONFIGURATION ==============
-
-# HuggingFace Token - enables gated models and higher rate limits
-# Get yours at: https://huggingface.co/settings/tokens
-#
-# WITHOUT HF_TOKEN: Falls back to ungated models (zephyr-7b-beta, Qwen2-7B)
-# WITH HF_TOKEN: Uses gated models (Llama 3.1, Gemma-2) via inference providers
-#
-# For HuggingFace Spaces deployment:
-# Set this as a "Secret" in Space Settings -> Variables and secrets
-# Users/judges don't need their own token - the Space secret is used
-#
-HF_TOKEN=hf_your-token-here
-# Alternative: HUGGINGFACE_API_KEY (same as HF_TOKEN)
-
-# Default HuggingFace model for inference (gated, requires auth)
-# Can be overridden in UI dropdown
-# Latest reasoning models: Qwen3-Next-80B-A3B-Thinking, Qwen3-Next-80B-A3B-Instruct, Llama-3.3-70B-Instruct
-HUGGINGFACE_MODEL=Qwen/Qwen3-Next-80B-A3B-Thinking
-
-# Fallback models for HuggingFace Inference API (comma-separated)
-# Models are tried in order until one succeeds
-# Format: model1,model2,model3
-# Latest reasoning models first, then reliable fallbacks
-# Reasoning models: Qwen3-Next (thinking/instruct), Llama-3.3-70B, Qwen3-235B
-# Fallbacks: Llama-3.1-8B, Zephyr-7B (ungated), Qwen2-7B (ungated)
-HF_FALLBACK_MODELS=Qwen/Qwen3-Next-80B-A3B-Thinking,Qwen/Qwen3-Next-80B-A3B-Instruct,meta-llama/Llama-3.3-70B-Instruct,meta-llama/Llama-3.1-8B-Instruct,HuggingFaceH4/zephyr-7b-beta,Qwen/Qwen2-7B-Instruct
-
-# Override model/provider selection (optional, usually set via UI)
-# HF_MODEL=Qwen/Qwen3-Next-80B-A3B-Thinking
-# HF_PROVIDER=hyperbolic
-
-# ============== EMBEDDING CONFIGURATION ==============
-
-# Embedding Provider: "openai", "local", or "huggingface"
-# Default: "local" (no API key required)
-EMBEDDING_PROVIDER=local
-
-# OpenAI Embedding Model (used if EMBEDDING_PROVIDER=openai)
-OPENAI_EMBEDDING_MODEL=text-embedding-3-small
-
-# Local Embedding Model (sentence-transformers, used if EMBEDDING_PROVIDER=local)
-# BAAI/bge-small-en-v1.5 is newer, faster, and better than all-MiniLM-L6-v2
-LOCAL_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5
-
-# HuggingFace Embedding Model (used if EMBEDDING_PROVIDER=huggingface)
-HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
-
-# ============== AGENT CONFIGURATION ==============
-
-MAX_ITERATIONS=10
-SEARCH_TIMEOUT=30
-LOG_LEVEL=INFO
-
-# Graph-based execution (experimental)
-# USE_GRAPH_EXECUTION=false
-
-# Budget & Rate Limiting
-# DEFAULT_TOKEN_LIMIT=100000
-# DEFAULT_TIME_LIMIT_MINUTES=10
-# DEFAULT_ITERATIONS_LIMIT=10
-
-# ============== WEB SEARCH CONFIGURATION ==============
-
-# Web Search Provider: "serper", "searchxng", "brave", "tavily", or "duckduckgo"
-# Default: "duckduckgo" (no API key required)
-WEB_SEARCH_PROVIDER=duckduckgo
-
-# Serper API Key (for Google search via Serper)
-# SERPER_API_KEY=your-serper-key-here
-
-# SearchXNG Host URL (for self-hosted search)
-# SEARCHXNG_HOST=http://localhost:8080
-
-# Brave Search API Key
-# BRAVE_API_KEY=your-brave-key-here
-
-# Tavily API Key
-# TAVILY_API_KEY=your-tavily-key-here
-
-# ============== EXTERNAL SERVICES ==============
-
-# PubMed (optional - higher rate limits: 10 req/sec vs 3 req/sec)
-NCBI_API_KEY=your-ncbi-key-here
-
-# Modal (optional - for secure code execution sandbox)
-# MODAL_TOKEN_ID=your-modal-token-id
-# MODAL_TOKEN_SECRET=your-modal-token-secret
-
-# ============== VECTOR DATABASE (ChromaDB) ==============
-
-# ChromaDB storage path
-CHROMA_DB_PATH=./chroma_db
-
-# Persist ChromaDB to disk (default: true)
-# CHROMA_DB_PERSIST=true
-
-# Remote ChromaDB server (optional)
-# CHROMA_DB_HOST=localhost
-# CHROMA_DB_PORT=8000
-
-# ============== RAG SERVICE CONFIGURATION ==============
-
-# ChromaDB collection name for RAG
-# RAG_COLLECTION_NAME=deepcritical_evidence
-
-# Number of top results to retrieve from RAG
-# RAG_SIMILARITY_TOP_K=5
-
-# Automatically ingest evidence into RAG
-# RAG_AUTO_INGEST=true
diff --git a/.env.example b/.env.example
index cfea522c8e49c8e8de6145965e6269cbd616b788..b8061357538326dd7fad717c627cdcfa5c0b3eb9 100644
--- a/.env.example
+++ b/.env.example
@@ -1,6 +1,6 @@
# ============== LLM CONFIGURATION ==============
-# Provider: "openai" or "anthropic"
+# Provider: "openai", "anthropic", or "huggingface"
LLM_PROVIDER=openai
# API Keys (at least one required for full LLM analysis)
@@ -8,30 +8,56 @@ OPENAI_API_KEY=sk-your-key-here
ANTHROPIC_API_KEY=sk-ant-your-key-here
# Model names (optional - sensible defaults set in config.py)
-# ANTHROPIC_MODEL=claude-sonnet-4-5-20250929
# OPENAI_MODEL=gpt-5.1
+# ANTHROPIC_MODEL=claude-sonnet-4-5-20250929
-# ============== EMBEDDINGS ==============
-
-# OpenAI Embedding Model (used if LLM_PROVIDER is openai and performing RAG/Embeddings)
-OPENAI_EMBEDDING_MODEL=text-embedding-3-small
-
-# Local Embedding Model (used for local/offline embeddings)
-LOCAL_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
-
-# ============== HUGGINGFACE (FREE TIER) ==============
+# ============== HUGGINGFACE CONFIGURATION ==============
-# HuggingFace Token - enables Llama 3.1 (best quality free model)
+# HuggingFace Token - enables gated models and higher rate limits
# Get yours at: https://huggingface.co/settings/tokens
-#
-# WITHOUT HF_TOKEN: Falls back to ungated models (zephyr-7b-beta)
-# WITH HF_TOKEN: Uses Llama 3.1 8B Instruct (requires accepting license)
+#
+# WITHOUT HF_TOKEN: Falls back to ungated models (zephyr-7b-beta, Qwen2-7B)
+# WITH HF_TOKEN: Uses gated models (Llama 3.1, Gemma-2) via inference providers
#
# For HuggingFace Spaces deployment:
# Set this as a "Secret" in Space Settings -> Variables and secrets
# Users/judges don't need their own token - the Space secret is used
#
HF_TOKEN=hf_your-token-here
+# Alternative: HUGGINGFACE_API_KEY (same as HF_TOKEN)
+
+# Default HuggingFace model for inference (gated, requires auth)
+# Can be overridden in UI dropdown
+# Latest reasoning models: Qwen3-Next-80B-A3B-Thinking, Qwen3-Next-80B-A3B-Instruct, Llama-3.3-70B-Instruct
+HUGGINGFACE_MODEL=Qwen/Qwen3-Next-80B-A3B-Thinking
+
+# Fallback models for HuggingFace Inference API (comma-separated)
+# Models are tried in order until one succeeds
+# Format: model1,model2,model3
+# Latest reasoning models first, then reliable fallbacks
+# Reasoning models: Qwen3-Next (thinking/instruct), Llama-3.3-70B
+# Fallbacks: Llama-3.1-8B, Zephyr-7B (ungated), Qwen2-7B (ungated)
+HF_FALLBACK_MODELS=Qwen/Qwen3-Next-80B-A3B-Thinking,Qwen/Qwen3-Next-80B-A3B-Instruct,meta-llama/Llama-3.3-70B-Instruct,meta-llama/Llama-3.1-8B-Instruct,HuggingFaceH4/zephyr-7b-beta,Qwen/Qwen2-7B-Instruct
+
+# Override model/provider selection (optional, usually set via UI)
+# HF_MODEL=Qwen/Qwen3-Next-80B-A3B-Thinking
+# HF_PROVIDER=hyperbolic
+
+# ============== EMBEDDING CONFIGURATION ==============
+
+# Embedding Provider: "openai", "local", or "huggingface"
+# Default: "local" (no API key required)
+EMBEDDING_PROVIDER=local
+
+# OpenAI Embedding Model (used if EMBEDDING_PROVIDER=openai)
+OPENAI_EMBEDDING_MODEL=text-embedding-3-small
+
+# Local Embedding Model (sentence-transformers, used if EMBEDDING_PROVIDER=local)
+# BAAI/bge-small-en-v1.5 is newer and scores higher on MTEB retrieval than all-MiniLM-L6-v2
+LOCAL_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5
+
+# HuggingFace Embedding Model (used if EMBEDDING_PROVIDER=huggingface)
+HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
# ============== AGENT CONFIGURATION ==============
@@ -39,10 +65,60 @@ MAX_ITERATIONS=10
SEARCH_TIMEOUT=30
LOG_LEVEL=INFO
+# Graph-based execution (experimental)
+# USE_GRAPH_EXECUTION=false
+
+# Budget & Rate Limiting
+# DEFAULT_TOKEN_LIMIT=100000
+# DEFAULT_TIME_LIMIT_MINUTES=10
+# DEFAULT_ITERATIONS_LIMIT=10
+
+# ============== WEB SEARCH CONFIGURATION ==============
+
+# Web Search Provider: "serper", "searchxng", "brave", "tavily", or "duckduckgo"
+# Default: "duckduckgo" (no API key required)
+WEB_SEARCH_PROVIDER=duckduckgo
+
+# Serper API Key (for Google search via Serper)
+# SERPER_API_KEY=your-serper-key-here
+
+# SearchXNG Host URL (for self-hosted search)
+# SEARCHXNG_HOST=http://localhost:8080
+
+# Brave Search API Key
+# BRAVE_API_KEY=your-brave-key-here
+
+# Tavily API Key
+# TAVILY_API_KEY=your-tavily-key-here
+
# ============== EXTERNAL SERVICES ==============
-# PubMed (optional - higher rate limits)
+# PubMed (optional - higher rate limits: 10 req/sec vs 3 req/sec)
NCBI_API_KEY=your-ncbi-key-here
-# Vector Database (optional - for LlamaIndex RAG)
+# Modal (optional - for secure code execution sandbox)
+# MODAL_TOKEN_ID=your-modal-token-id
+# MODAL_TOKEN_SECRET=your-modal-token-secret
+
+# ============== VECTOR DATABASE (ChromaDB) ==============
+
+# ChromaDB storage path
CHROMA_DB_PATH=./chroma_db
+
+# Persist ChromaDB to disk (default: true)
+# CHROMA_DB_PERSIST=true
+
+# Remote ChromaDB server (optional)
+# CHROMA_DB_HOST=localhost
+# CHROMA_DB_PORT=8000
+
+# ============== RAG SERVICE CONFIGURATION ==============
+
+# ChromaDB collection name for RAG
+# RAG_COLLECTION_NAME=deepcritical_evidence
+
+# Number of top results to retrieve from RAG
+# RAG_SIMILARITY_TOP_K=5
+
+# Automatically ingest evidence into RAG
+# RAG_AUTO_INGEST=true
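For reference, a minimal sketch (not the repo's actual implementation) of how a comma-separated `HF_FALLBACK_MODELS` value can be consumed, trying each model in order as the comments above describe:

```python
import os

from huggingface_hub import InferenceClient


def generate_with_fallback(prompt: str) -> str:
    """Try each HF_FALLBACK_MODELS entry in order until one succeeds (sketch)."""
    models = os.getenv("HF_FALLBACK_MODELS", "").split(",")
    last_error: Exception | None = None
    for model in (m.strip() for m in models if m.strip()):
        try:
            client = InferenceClient(model=model, token=os.getenv("HF_TOKEN"))
            return client.text_generation(prompt, max_new_tokens=256)
        except Exception as exc:  # gated model, 404, rate limit, ...
            last_error = exc
    raise RuntimeError("All fallback models failed") from last_error
```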
diff --git a/.github/README.md b/.github/README.md
index 7f9634bf5d792f81f450f88f0d607ab1fe3f2956..c573b60ebe7f6f8264b6e31d32793100968a1965 100644
--- a/.github/README.md
+++ b/.github/README.md
@@ -1,28 +1,3 @@
----
-title: DeepCritical
-emoji: 🧬
-colorFrom: blue
-colorTo: purple
-sdk: gradio
-sdk_version: "6.0.1"
-python_version: "3.11"
-app_file: src/app.py
-hf_oauth: true
-hf_oauth_expiration_minutes: 480
-hf_oauth_scopes:
- - inference-api
-pinned: true
-license: mit
-tags:
- - mcp-in-action-track-enterprise
- - mcp-hackathon
- - drug-repurposing
- - biomedical-ai
- - pydantic-ai
- - llamaindex
- - modal
----
-
[](https://github.com/DeepCritical/GradioDemo)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cfeb4ebd2ef145b6f7da1af5e790d430bae99eff..4481459350e12a71337b5c32f804f13a24b33c62 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -33,19 +33,19 @@ jobs:
- name: Lint with ruff
continue-on-error: true
run: |
- uv run ruff check . --exclude tests
- uv run ruff format --check . --exclude tests
+ uv run ruff check . --exclude tests --exclude reference_repos
+ uv run ruff format --check . --exclude tests --exclude reference_repos
- name: Type check with mypy
continue-on-error: true
run: |
- uv run mypy src
+ uv run mypy src --ignore-missing-imports
- - name: Run unit tests (No Black Box Apis)
+ - name: Run unit tests (No OpenAI/Anthropic, HuggingFace only)
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
- uv run pytest tests/unit/ -v -m "not openai and not embedding_provider" --tb=short -p no:logfire --cov --cov-branch --cov-report=xml
+ uv run pytest tests/unit/ -v -m "not openai and not anthropic and not embedding_provider" --tb=short -p no:logfire --cov --cov-branch --cov-report=xml
- name: Run local embeddings tests
env:
@@ -61,11 +61,11 @@ jobs:
uv run pytest tests/integration/ -v -m "huggingface and not embedding_provider" --tb=short -p no:logfire --cov --cov-branch --cov-report=xml --cov-append || true
continue-on-error: true # Allow failures if HF_TOKEN not set
- - name: Run non-OpenAI integration tests (excluding embedding providers)
+ - name: Run non-OpenAI/Anthropic integration tests (excluding embedding providers)
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
- uv run pytest tests/integration/ -v -m "integration and not openai and not embedding_provider" --tb=short -p no:logfire --cov --cov-branch --cov-report=xml --cov-append || true
+ uv run pytest tests/integration/ -v -m "integration and not openai and not anthropic and not embedding_provider" --tb=short -p no:logfire --cov --cov-branch --cov-report=xml --cov-append || true
continue-on-error: true # Allow failures if dependencies not available
- name: Upload coverage reports to Codecov
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0d08dd3bf813709c4c4df5a8fc5f6ebdb16c84f3..66993b5ec97b1bfa659fc9cdc9b3a323372d56ee 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,16 +1,16 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.4.4
+ rev: v0.14.7 # Compatible with ruff>=0.14.6 (matches CI)
hooks:
- id: ruff
- args: [--fix, --exclude, tests]
+ args: [--fix, --exclude, tests, --exclude, reference_repos]
exclude: ^reference_repos/
- id: ruff-format
- args: [--exclude, tests]
+ args: [--exclude, tests, --exclude, reference_repos]
exclude: ^reference_repos/
- repo: https://github.com/pre-commit/mirrors-mypy
- rev: v1.10.0
+ rev: v1.18.2 # Matches CI version mypy>=1.18.2
hooks:
- id: mypy
files: ^src/
diff --git a/docs/api/agents.md b/docs/api/agents.md
index 9670001b3025c26ba041371a2faded2153b01ea8..8f0fa38939da25884c2dfef878ca84f94c7762fb 100644
--- a/docs/api/agents.md
+++ b/docs/api/agents.md
@@ -262,3 +262,5 @@ def create_input_parser_agent(model: Any | None = None) -> InputParserAgent
+
+
diff --git a/docs/api/models.md b/docs/api/models.md
index 22c35704b4bd1c5b30aea3f60166d594838a7350..f226647a52dc2d324877ce12e9311feffb8df591 100644
--- a/docs/api/models.md
+++ b/docs/api/models.md
@@ -240,3 +240,5 @@ class BudgetStatus(BaseModel):
+
+
diff --git a/docs/api/orchestrators.md b/docs/api/orchestrators.md
index 9c241236c7473b0e48f8e899ecd809553f3f5a8d..27c52249fc18fbcdb893036cdfcb4472e5d2f99e 100644
--- a/docs/api/orchestrators.md
+++ b/docs/api/orchestrators.md
@@ -187,3 +187,5 @@ Runs Magentic orchestration.
+
+
diff --git a/docs/api/services.md b/docs/api/services.md
index f276a342b2f7b998ce5a3a8e0610cc44c315b3cc..30edfc557afb8872d4262c5cdb4ebb2e149f46af 100644
--- a/docs/api/services.md
+++ b/docs/api/services.md
@@ -193,3 +193,5 @@ Analyzes a hypothesis using statistical methods.
+
+
diff --git a/docs/api/tools.md b/docs/api/tools.md
index b86993babad67b25cb06712a3136a69232cd2bbf..b93cd31e37e7a31413fec0ec282424fe6ae0ca82 100644
--- a/docs/api/tools.md
+++ b/docs/api/tools.md
@@ -227,3 +227,5 @@ Searches multiple tools in parallel.
+
+
diff --git a/docs/architecture/agents.md b/docs/architecture/agents.md
index d6599f11288888234009e325f1d20e695d7367fa..b65da9e379c329fc478bf7c9fe3ff4ca4c40745a 100644
--- a/docs/architecture/agents.md
+++ b/docs/architecture/agents.md
@@ -184,3 +184,5 @@ Factory functions:
+
+
diff --git a/docs/architecture/middleware.md b/docs/architecture/middleware.md
index 9d2f570d342774807910f450bceb49f08d79391c..82058ccf979591845b8c5ab87e42913ce8a62458 100644
--- a/docs/architecture/middleware.md
+++ b/docs/architecture/middleware.md
@@ -134,3 +134,5 @@ All middleware components use `ContextVar` for thread-safe isolation:
+
+
diff --git a/docs/architecture/services.md b/docs/architecture/services.md
index 1c9ca8099840c455f8f9d9aeff22151d90f26167..fda7c8367aac5c7f2a907f2c45372a91d7a7fc64 100644
--- a/docs/architecture/services.md
+++ b/docs/architecture/services.md
@@ -134,3 +134,5 @@ if settings.has_openai_key:
+
+
diff --git a/docs/architecture/tools.md b/docs/architecture/tools.md
index e3ab4820b5ca3146939393ea86f0cd56c2fc7e2e..7ddbe7eaaf0a579ddba89c63506ba37560d33405 100644
--- a/docs/architecture/tools.md
+++ b/docs/architecture/tools.md
@@ -167,3 +167,5 @@ search_handler = SearchHandler(
+
+
diff --git a/docs/contributing/code-quality.md b/docs/contributing/code-quality.md
index 003b98aa4aa58b0e6479863860c18db19609546e..b15ec66c60f46d285179fd83f5abc14a695a2a20 100644
--- a/docs/contributing/code-quality.md
+++ b/docs/contributing/code-quality.md
@@ -73,3 +73,5 @@ async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
+
+
diff --git a/docs/contributing/code-style.md b/docs/contributing/code-style.md
index 6de664edcf801cad33e4a034a3af85a28b09f9ca..6a0ca8c0d62f7cff541a2abef854ffe49fa89ef8 100644
--- a/docs/contributing/code-style.md
+++ b/docs/contributing/code-style.md
@@ -53,3 +53,5 @@ result = await loop.run_in_executor(None, cpu_bound_function, args)
+
+
diff --git a/docs/contributing/error-handling.md b/docs/contributing/error-handling.md
index b1b55441cde24c94f54f3576d645e6b0731c7348..5d3ead5b23c77d8970f236b460b5668a40a1d566 100644
--- a/docs/contributing/error-handling.md
+++ b/docs/contributing/error-handling.md
@@ -61,3 +61,5 @@ except httpx.HTTPError as e:
+
+
diff --git a/docs/contributing/implementation-patterns.md b/docs/contributing/implementation-patterns.md
index 4f4075561edd03263e723e84cee784927ebc6cb6..d2cf076c39f24f6f42611c9bbd0bcff4ff05ee8a 100644
--- a/docs/contributing/implementation-patterns.md
+++ b/docs/contributing/implementation-patterns.md
@@ -76,3 +76,5 @@ def get_embedding_service() -> EmbeddingService:
+
+
diff --git a/docs/contributing/index.md b/docs/contributing/index.md
index 5c13e76d0b3310847b800160c64e21c232a8bb98..6fab401289f8a568b36096eb201bfe0453b3a6d3 100644
--- a/docs/contributing/index.md
+++ b/docs/contributing/index.md
@@ -155,3 +155,5 @@ Thank you for contributing to DeepCritical!
+
+
diff --git a/docs/contributing/prompt-engineering.md b/docs/contributing/prompt-engineering.md
index d02e67c11b449b0d4c24c54eb796155550f186d8..a1bae2444bb669cddb7d1e3c81081422420ee820 100644
--- a/docs/contributing/prompt-engineering.md
+++ b/docs/contributing/prompt-engineering.md
@@ -61,3 +61,5 @@ This document outlines prompt engineering guidelines and citation validation rul
+
+
diff --git a/docs/contributing/testing.md b/docs/contributing/testing.md
index 393a7f7efc638574a35812ba82f0176f00f89ab1..ebb1b21477c34a34c39cd8d49e1d898b684527ab 100644
--- a/docs/contributing/testing.md
+++ b/docs/contributing/testing.md
@@ -57,3 +57,5 @@ async def test_real_pubmed_search():
+
+
diff --git a/docs/getting-started/examples.md b/docs/getting-started/examples.md
index 214f12f4f5d7b7d4ae8c09ba14af8a43f45ec448..e71e7b8360070341f38f526d1e2df344980e246a 100644
--- a/docs/getting-started/examples.md
+++ b/docs/getting-started/examples.md
@@ -201,3 +201,5 @@ USE_GRAPH_EXECUTION=true
+
+
diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md
index b29e03881c75941b1d034081e434da9fddb544ff..861e1ef751221b4844daad8221430067a71699e1 100644
--- a/docs/getting-started/installation.md
+++ b/docs/getting-started/installation.md
@@ -140,3 +140,5 @@ uv run pre-commit install
+
+
diff --git a/docs/getting-started/mcp-integration.md b/docs/getting-started/mcp-integration.md
index 87b2294fca6d956a37b9b47ecf6bceae2d476f94..28cb0806a9b669212221c13367a0326b7de0d14b 100644
--- a/docs/getting-started/mcp-integration.md
+++ b/docs/getting-started/mcp-integration.md
@@ -207,3 +207,5 @@ You can configure multiple DeepCritical instances:
+
+
diff --git a/docs/getting-started/quick-start.md b/docs/getting-started/quick-start.md
index ce36c4b6cc2c5492e12064747b0939895be67107..9c927dbe5cb373d4c4a289ca626d25c72d39610e 100644
--- a/docs/getting-started/quick-start.md
+++ b/docs/getting-started/quick-start.md
@@ -111,3 +111,5 @@ What are the active clinical trials investigating Alzheimer's disease treatments
+
+
diff --git a/docs/license.md b/docs/license.md
index 96da2dd2b44cb7d16e348309109d864255f6c9d4..18466be89051cf1fbcf15385a2eddb2875276a13 100644
--- a/docs/license.md
+++ b/docs/license.md
@@ -31,3 +31,5 @@ SOFTWARE.
+
+
diff --git a/docs/overview/architecture.md b/docs/overview/architecture.md
index 7d66e309012d9a27211f930b07884878ef01c070..e3c55c3d7eda510f0aca206f9113a4fef2055c71 100644
--- a/docs/overview/architecture.md
+++ b/docs/overview/architecture.md
@@ -188,3 +188,5 @@ The system supports complex research workflows through:
+
+
diff --git a/docs/overview/features.md b/docs/overview/features.md
index 9516164162c92122352771ea063e99f4dab70c0e..c5bbe713deee9b4c5e98aed945bd84cfe55da8e5 100644
--- a/docs/overview/features.md
+++ b/docs/overview/features.md
@@ -140,3 +140,5 @@ DeepCritical provides a comprehensive set of features for AI-assisted research:
+
+
diff --git a/docs/team.md b/docs/team.md
index e1a8bf6bfee5b0df95800884d68fd5e0205be006..e6901a846f7dafd627375238c5d4284ad05fe4c5 100644
--- a/docs/team.md
+++ b/docs/team.md
@@ -36,3 +36,5 @@ We welcome contributions! See the [Contributing Guide](contributing/index.md) fo
+
+
diff --git a/pyproject.toml b/pyproject.toml
index 2c0458ecbaeb6b07c27ba1fc61cb498811bcbf97..d262e758d8c6d5581b3ef6aae0123c13b59105bb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,6 +29,7 @@ dependencies = [
"tokenizers>=0.22.0,<=0.23.0",
"transformers>=4.57.2",
"chromadb>=0.4.0",
+ "rpds-py>=0.29.0", # Python implementation of rpds (required by chromadb on Windows)
"sentence-transformers>=2.2.0",
"numpy<2.0",
"agent-framework-core>=1.0.0b251120,<2.0.0",
diff --git a/requirements.txt b/requirements.txt
index 21bdb3ca584609dd2ab695444e7eae639ca34b79..a50255a27c2a7e2568e6328e9f632f125eb609a8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,40 +9,53 @@ pydantic>=2.7
pydantic-settings>=2.2
pydantic-ai>=0.0.16
-
# OPTIONAL AI Providers
openai>=1.0.0
-# anthropic>=0.18.0
-
-# Multi-agent orchestration (Advanced mode)
-agent-framework-core>=1.0.0b251120
-
-# Web search
-duckduckgo-search>=5.0
+anthropic>=0.18.0
# HTTP & Parsing
httpx>=0.27
beautifulsoup4>=4.12
xmltodict>=0.13
+# HuggingFace Hub
+huggingface-hub>=0.20.0
+
# UI (Gradio with MCP server support)
-gradio[mcp]>=6.0.0
+gradio[mcp,oauth]>=6.0.0
# Utils
python-dotenv>=1.0
tenacity>=8.2
structlog>=24.1
requests>=2.32.5
-limits>=3.0 # Rate limiting
+limits>=3.0 # Rate limiting
+pydantic-graph>=1.22.0
-# Optional: Modal for code execution
-modal>=0.63.0
+# Web search
+duckduckgo-search>=5.0
-# Optional: LlamaIndex RAG
-llama-index>=0.11.0
-llama-index-llms-openai
-llama-index-llms-huggingface
-llama-index-embeddings-openai
-llama-index-vector-stores-chroma
+# Multi-agent orchestration (Advanced mode)
+agent-framework-core>=1.0.0b251120,<2.0.0
+
+# LlamaIndex RAG
+llama-index-llms-huggingface>=0.6.1
+llama-index-llms-huggingface-api>=0.6.1
+llama-index-vector-stores-chroma>=0.5.3
+llama-index>=0.14.8
+llama-index-llms-openai>=0.6.9
+llama-index-embeddings-openai>=0.5.1
+
+# Embeddings & Vector Store
+tokenizers>=0.22.0,<=0.23.0
+transformers>=4.57.2
chromadb>=0.4.0
+rpds-py>=0.29.0  # Python bindings to the Rust rpds crate (required by chromadb on Windows)
sentence-transformers>=2.2.0
+numpy<2.0
+
+# Optional: Modal for code execution
+modal>=0.63.0
+
+# Pydantic AI with HuggingFace support
+pydantic-ai-slim[huggingface]>=0.0.18
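The `rpds-py` pin above addresses a chromadb dependency issue on Windows, per the comment; a quick smoke test (illustrative, not part of the PR) that the compiled bindings resolved:

```python
# Illustrative check that the pinned rpds-py wheel imports and works.
from rpds import HashTrieMap

m = HashTrieMap().insert("key", 1)
assert m["key"] == 1
```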
diff --git a/src/agent_factory/judges.py b/src/agent_factory/judges.py
index 8413d678d1994b89e3f16e6a81bc3de4c8981934..9cd0e14eff838d5ca65bc71c348f2be3fc1c5973 100644
--- a/src/agent_factory/judges.py
+++ b/src/agent_factory/judges.py
@@ -8,10 +8,18 @@ from typing import Any
import structlog
from huggingface_hub import InferenceClient
from pydantic_ai import Agent
-from pydantic_ai.models.anthropic import AnthropicModel
from pydantic_ai.models.openai import OpenAIModel # type: ignore[attr-defined]
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
+# Try to import AnthropicModel (may not be available if anthropic package is missing)
+try:
+ from pydantic_ai.models.anthropic import AnthropicModel
+
+ _ANTHROPIC_AVAILABLE = True
+except ImportError:
+ AnthropicModel = None # type: ignore[assignment, misc]
+ _ANTHROPIC_AVAILABLE = False
+
# Try to import HuggingFace support (may not be available in all pydantic-ai versions)
# According to https://ai.pydantic.dev/models/huggingface/, HuggingFace support requires
# pydantic-ai with huggingface extra or pydantic-ai-slim[huggingface]
@@ -50,6 +58,11 @@ def get_model() -> Any:
llm_provider = settings.llm_provider
if llm_provider == "anthropic":
+ if not _ANTHROPIC_AVAILABLE:
+ raise ImportError(
+ "Anthropic models are not available. "
+ "Please install with: uv add 'pydantic-ai[anthropic]' or use 'openai'/'huggingface' as the LLM provider."
+ )
return AnthropicModel(settings.anthropic_model, api_key=settings.anthropic_api_key) # type: ignore[call-arg]
if llm_provider == "huggingface":
@@ -144,7 +157,7 @@ class JudgeHandler:
try:
# Run the agent with structured output
result = await self.agent.run(user_prompt)
- assessment = result.output # type: ignore[attr-defined]
+ assessment = result.data
logger.info(
"Assessment complete",
diff --git a/src/agents/hypothesis_agent.py b/src/agents/hypothesis_agent.py
index b806396f36243cf81c6020f3b361a6724e75ea02..d946e7e2c72db190bdbaf6393ddf80bf7004676f 100644
--- a/src/agents/hypothesis_agent.py
+++ b/src/agents/hypothesis_agent.py
@@ -75,7 +75,7 @@ class HypothesisAgent(BaseAgent): # type: ignore[misc]
# Generate hypotheses with diverse evidence selection
prompt = await format_hypothesis_prompt(query, evidence, embeddings=self._embeddings)
result = await self._get_agent().run(prompt)
- assessment = result.output # pydantic-ai returns .output for structured output
+ assessment = result.data # type: ignore[attr-defined]
# Store hypotheses in shared context
existing = self._evidence_store.get("hypotheses", [])
diff --git a/src/agents/input_parser.py b/src/agents/input_parser.py
index 897dd4c31fb4079dadd6d362a69def3813f36318..0f23f7092c8744b5a3429452e8b93919fd6abf88 100644
--- a/src/agents/input_parser.py
+++ b/src/agents/input_parser.py
@@ -92,7 +92,7 @@ class InputParserAgent:
try:
# Run the agent
result = await self.agent.run(user_message)
- parsed_query = result.output
+ parsed_query = result.data
# Validate parsed query
if not parsed_query.original_query:
diff --git a/src/agents/judge_agent_llm.py b/src/agents/judge_agent_llm.py
index 52ab9e5519703b18579de22a770e28a97bad27bd..78447df1f0489ece4002fa01287c3bde6353317f 100644
--- a/src/agents/judge_agent_llm.py
+++ b/src/agents/judge_agent_llm.py
@@ -41,5 +41,5 @@ History of previous attempts: {len(history)}
Evaluate validity and sufficiency."""
run_result = await self.agent.run(prompt)
- logger.info("LLM judge assessment complete", sufficient=run_result.output.sufficient)
- return run_result.output # type: ignore[no-any-return]
+ logger.info("LLM judge assessment complete", sufficient=run_result.data.sufficient) # type: ignore[attr-defined]
+ return run_result.data # type: ignore[no-any-return,attr-defined]
diff --git a/src/agents/knowledge_gap.py b/src/agents/knowledge_gap.py
index 2b4b118b885a1d41238a7a63a54997bcc875a0dc..ad3769d1a403998e908b770525230632fe4aebef 100644
--- a/src/agents/knowledge_gap.py
+++ b/src/agents/knowledge_gap.py
@@ -113,7 +113,7 @@ HISTORY OF ACTIONS, FINDINGS AND THOUGHTS:
try:
# Run the agent
result = await self.agent.run(user_message)
- evaluation = result.output
+ evaluation = result.data
self.logger.info(
"Knowledge gap evaluation complete",
diff --git a/src/agents/long_writer.py b/src/agents/long_writer.py
index 8b03a5263e4dea685bc8e07023444a5525ca6223..9014d2b0903798be07499db42fa5689cb5a889a5 100644
--- a/src/agents/long_writer.py
+++ b/src/agents/long_writer.py
@@ -176,7 +176,7 @@ class LongWriterAgent:
try:
# Run the agent
result = await self.agent.run(user_message)
- output = result.output
+ output = result.data
# Validate output
if not output or not isinstance(output, LongWriterOutput):
diff --git a/src/agents/proofreader.py b/src/agents/proofreader.py
index 72aeaf77881a9e8498ada3cb288b8edf135ddf4b..3d85ce1405376ff2a54178ddf52e7eaabacfab9b 100644
--- a/src/agents/proofreader.py
+++ b/src/agents/proofreader.py
@@ -133,7 +133,7 @@ REPORT DRAFT:
try:
# Run the agent
result = await self.agent.run(user_message)
- final_report = result.output
+ final_report = result.data # type: ignore[attr-defined]
# Validate output
if not final_report or not final_report.strip():
@@ -142,7 +142,7 @@ REPORT DRAFT:
self.logger.info("Report proofread", length=len(final_report), attempt=attempt + 1)
- return final_report
+ return final_report # type: ignore[no-any-return]
except (TimeoutError, ConnectionError) as e:
# Transient errors - retry
diff --git a/src/agents/report_agent.py b/src/agents/report_agent.py
index 2d86de86ed646f0f5f7ba870e280b993c072dc40..fbff4d948b94a313ff63cf0169b7fefbe3aad110 100644
--- a/src/agents/report_agent.py
+++ b/src/agents/report_agent.py
@@ -91,7 +91,7 @@ class ReportAgent(BaseAgent): # type: ignore[misc]
)
result = await self._get_agent().run(prompt)
- report = result.output
+ report = result.data # type: ignore[attr-defined]
# ═══════════════════════════════════════════════════════════════════
# 🚨 CRITICAL: Validate citations to prevent hallucination
diff --git a/src/agents/thinking.py b/src/agents/thinking.py
index 230c5801fb6bc6822fa155a2b953046a3b5d0729..bf0c84952b14d7e572134758d28ec2dd711a8c2f 100644
--- a/src/agents/thinking.py
+++ b/src/agents/thinking.py
@@ -112,11 +112,11 @@ HISTORY OF ACTIONS, FINDINGS AND THOUGHTS:
try:
# Run the agent
result = await self.agent.run(user_message)
- observations = result.output
+ observations = result.data # type: ignore[attr-defined]
self.logger.info("Observations generated", length=len(observations))
- return observations
+ return observations # type: ignore[no-any-return]
except Exception as e:
self.logger.error("Observation generation failed", error=str(e))
diff --git a/src/agents/tool_selector.py b/src/agents/tool_selector.py
index 7137906f762786d228bef1a5691712627ee0ff38..dd3aac43006bbd614115b7a687ee2f84e25b5d79 100644
--- a/src/agents/tool_selector.py
+++ b/src/agents/tool_selector.py
@@ -117,7 +117,7 @@ HISTORY OF ACTIONS, FINDINGS AND THOUGHTS:
try:
# Run the agent
result = await self.agent.run(user_message)
- selection_plan = result.output
+ selection_plan = result.data
self.logger.info(
"Tool selection complete",
diff --git a/src/agents/writer.py b/src/agents/writer.py
index 73690f15f415bea42b8dfeb3d681b786b50b59a5..418a5105527b7ae5a7d91f53e2aed1ac7a0b83df 100644
--- a/src/agents/writer.py
+++ b/src/agents/writer.py
@@ -136,7 +136,7 @@ FINDINGS:
try:
# Run the agent
result = await self.agent.run(user_message)
- report = result.output
+ report = result.data # type: ignore[attr-defined]
# Validate output
if not report or not report.strip():
@@ -145,7 +145,7 @@ FINDINGS:
self.logger.info("Report written", length=len(report), attempt=attempt + 1)
- return report
+ return report # type: ignore[no-any-return]
except (TimeoutError, ConnectionError) as e:
# Transient errors - retry
diff --git a/src/app.py b/src/app.py
index 7275673a06bb7f938288ff5a2b95e9d50927229e..d88d931d1dc17d0a85fc47160d58752f00d5824d 100644
--- a/src/app.py
+++ b/src/app.py
@@ -172,20 +172,29 @@ def event_to_chat_message(event: AgentEvent) -> dict[str, Any]:
"content": event.message,
}
- # Build metadata for accordion
+ # Build metadata for accordion according to Gradio ChatMessage spec
+ # Metadata keys: title (str), status ("pending"|"done"), log (str), duration (float)
+ # See: https://www.gradio.app/guides/agents-and-tool-usage
metadata: dict[str, Any] = {}
+
+ # Title is required for accordion display - must be string
if config["title"]:
- metadata["title"] = config["title"]
+ metadata["title"] = str(config["title"])
# Set status (pending shows spinner, done is collapsed)
+ # Must be exactly "pending" or "done" per Gradio spec
if config["status"] == "pending":
metadata["status"] = "pending"
+ elif config["status"] == "done":
+ metadata["status"] = "done"
- # Add duration if available in data
+ # Add duration if available in data (must be float)
if event.data and isinstance(event.data, dict) and "duration" in event.data:
- metadata["duration"] = event.data["duration"]
+ duration = event.data["duration"]
+ if isinstance(duration, int | float):
+ metadata["duration"] = float(duration)
- # Add log info (iteration number, etc.)
+ # Add log info (iteration number, etc.) - must be string
log_parts: list[str] = []
if event.iteration > 0:
log_parts.append(f"Iteration {event.iteration}")
@@ -198,12 +207,22 @@ def event_to_chat_message(event: AgentEvent) -> dict[str, Any]:
metadata["log"] = " | ".join(log_parts)
# Return as dict format for Gradio Chatbot compatibility
- # Gradio Chatbot expects dict format, not gr.ChatMessage objects
+ # According to Gradio docs: https://www.gradio.app/guides/agents-and-tool-usage
+ # ChatMessage format: {"role": "assistant", "content": "...", "metadata": {...}}
+ # Metadata must have "title" key for accordion display
+ # Valid metadata keys: title (str), status ("pending"|"done"), log (str), duration (float)
result: dict[str, Any] = {
"role": "assistant",
"content": event.message,
}
- if metadata:
+ # Only add metadata if it has a title (required for accordion display)
+ # Ensure metadata values match Gradio's expected types
+ if metadata and metadata.get("title"):
+ # Ensure status is valid if present
+ if "status" in metadata:
+ status = metadata["status"]
+ if status not in ("pending", "done"):
+ metadata["status"] = "done" # Default to "done" if invalid
result["metadata"] = metadata
return result
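Concretely, a message produced by `event_to_chat_message()` follows the shape below; the values are invented for illustration, and per the Gradio guide cited in the comments, `title` is what makes the accordion render:

```python
# Example of the message shape event_to_chat_message() returns
# (values are illustrative).
message = {
    "role": "assistant",
    "content": "Searching PubMed for metformin trials...",
    "metadata": {
        "title": "🔍 Search",      # str, required for the accordion
        "status": "pending",       # "pending" shows a spinner, "done" collapses
        "log": "Iteration 2",      # str
        "duration": 1.3,           # float, seconds
    },
}
```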
@@ -455,10 +474,11 @@ async def research_agent(
yield msg
except Exception as e:
+ # Return error message without metadata to avoid issues during example caching
+ # Metadata can cause validation errors when Gradio caches examples
yield {
"role": "assistant",
- "content": f"❌ **Error**: {e!s}",
- "metadata": {"title": "❌ Error", "status": "done"},
+ "content": f"❌ **Error**: {e!s}\n\n*Please check your configuration and try again.*",
}
@@ -681,9 +701,21 @@ def create_demo() -> gr.Blocks:
"**Sign in with HuggingFace** above to access premium models and providers."
),
examples=[
- ["What drugs could be repurposed for Alzheimer's disease?", "simple"],
- ["Is metformin effective for treating cancer?", "simple"],
- ["What medications show promise for Long COVID treatment?", "simple"],
+ # When additional_inputs are provided, examples must be lists of lists
+ # Each inner list: [message, mode, hf_model, hf_provider]
+ [
+ "What drugs could be repurposed for Alzheimer's disease?",
+ "iterative",
+ None,
+ None,
+ ],
+ ["Is metformin effective for treating cancer?", "iterative", None, None],
+ [
+ "What medications show promise for Long COVID treatment?",
+ "iterative",
+ None,
+ None,
+ ],
],
additional_inputs_accordion=gr.Accordion(label="⚙️ Settings", open=False),
additional_inputs=[
diff --git a/src/orchestrator/planner_agent.py b/src/orchestrator/planner_agent.py
index ea560afab9eab23c8fbe22bcd551b63a9b0f4398..110c1bade21c99a72f1c2b3b6d9cb96c2a2b89a5 100644
--- a/src/orchestrator/planner_agent.py
+++ b/src/orchestrator/planner_agent.py
@@ -109,7 +109,7 @@ class PlannerAgent:
try:
# Run the agent
result = await self.agent.run(user_message)
- report_plan = result.output
+ report_plan = result.data
# Validate report plan
if not report_plan.report_outline:
diff --git a/src/services/llamaindex_rag.py b/src/services/llamaindex_rag.py
index 322a6bc4afaa6534431daa96814ef48ee4731f31..00a6da967e191f1f4b2d0dfc4a29fb82f71feed4 100644
--- a/src/services/llamaindex_rag.py
+++ b/src/services/llamaindex_rag.py
@@ -136,7 +136,8 @@ class LlamaIndexRAGService:
}
except ImportError as e:
raise ImportError(
- "LlamaIndex dependencies not installed. Run: uv sync --extra modal"
+ "LlamaIndex dependencies not installed. Required packages: chromadb, llama-index, "
+ "and their dependencies. If rpds is missing, try: uv pip install rpds-py"
) from e
def _configure_embeddings(
diff --git a/src/services/statistical_analyzer.py b/src/services/statistical_analyzer.py
index d43cfa1056ca2248807d54ae50dc7e4dcb025118..09d446c3ba7ebac75c97b6ac8cfa5f606998ea51 100644
--- a/src/services/statistical_analyzer.py
+++ b/src/services/statistical_analyzer.py
@@ -135,7 +135,7 @@ Generate executable Python code to analyze this evidence."""
# Generate code
agent = self._get_agent()
code_result = await agent.run(prompt)
- generated_code = code_result.output
+ generated_code = code_result.data # type: ignore[attr-defined]
# Execute in Modal sandbox
loop = asyncio.get_running_loop()
diff --git a/tests/integration/test_rag_integration.py b/tests/integration/test_rag_integration.py
index 38d3f6ec09900ec3645af00bb181054577c6ce51..6a525bb1ab565c096a04d63316ae393c8ea19421 100644
--- a/tests/integration/test_rag_integration.py
+++ b/tests/integration/test_rag_integration.py
@@ -121,9 +121,12 @@ class TestRAGServiceIntegration:
assert len(response) > 0
assert "python" in response.lower()
except Exception as e:
- # If model is not available (404), skip the test
- if "404" in str(e) or "Not Found" in str(e):
+ # If model is not available (404) or authentication required (401), skip the test
+ error_str = str(e)
+ if "404" in error_str or "Not Found" in error_str:
pytest.skip(f"HuggingFace model not available via inference API: {e}")
+ if "401" in error_str or "Unauthorized" in error_str or "Invalid username or password" in error_str:
+ pytest.skip(f"HuggingFace authentication required but not available: {e}")
raise
# Cleanup
diff --git a/tests/unit/agent_factory/test_judges.py b/tests/unit/agent_factory/test_judges.py
index c2075cdaa3b0d103d5a6b5f5fedb4c0c876356ce..342aa68997467d1fe88d1d31e579ff848d2d3528 100644
--- a/tests/unit/agent_factory/test_judges.py
+++ b/tests/unit/agent_factory/test_judges.py
@@ -34,6 +34,7 @@ class TestJudgeHandler:
# Mock the PydanticAI agent
mock_result = MagicMock()
+ type(mock_result).data = mock_assessment # pydantic-ai uses .data for structured output
mock_result.output = mock_assessment
with (
@@ -88,7 +89,8 @@ class TestJudgeHandler:
)
mock_result = MagicMock()
- mock_result.output = mock_assessment
+ mock_result.data = mock_assessment
+ mock_result.output = mock_assessment # Some code may use .output
with (
patch("src.agent_factory.judges.get_model") as mock_get_model,
diff --git a/tests/unit/agents/test_hypothesis_agent.py b/tests/unit/agents/test_hypothesis_agent.py
index be9b8768b5fed19198ee3d721fff50dd9de8c44c..69772bb1397600b170891a9629499f3aba41bd89 100644
--- a/tests/unit/agents/test_hypothesis_agent.py
+++ b/tests/unit/agents/test_hypothesis_agent.py
@@ -28,18 +28,17 @@ def sample_evidence():
@pytest.fixture
def mock_assessment():
+ primary_hyp = MechanismHypothesis(
+ drug="Metformin",
+ target="AMPK",
+ pathway="mTOR inhibition",
+ effect="Reduced cancer cell proliferation",
+ confidence=0.75,
+ search_suggestions=["metformin AMPK cancer", "mTOR cancer therapy"],
+ )
return HypothesisAssessment(
- hypotheses=[
- MechanismHypothesis(
- drug="Metformin",
- target="AMPK",
- pathway="mTOR inhibition",
- effect="Reduced cancer cell proliferation",
- confidence=0.75,
- search_suggestions=["metformin AMPK cancer", "mTOR cancer therapy"],
- )
- ],
- primary_hypothesis=None,
+ hypotheses=[primary_hyp],
+ primary_hypothesis=primary_hyp, # Set primary hypothesis
knowledge_gaps=["Clinical trial data needed"],
recommended_searches=["metformin clinical trial cancer"],
)
@@ -54,8 +53,9 @@ async def test_hypothesis_agent_generates_hypotheses(sample_evidence, mock_asses
with patch("src.agents.hypothesis_agent.Agent") as mock_agent_class:
mock_get_model.return_value = MagicMock() # Mock model
mock_result = MagicMock()
+ type(mock_result).data = mock_assessment # pydantic-ai uses .data for structured output
mock_result.output = mock_assessment
- # pydantic-ai Agent returns an object with .output for structured output
+ # pydantic-ai Agent returns an object with .data for structured output
mock_agent_class.return_value.run = AsyncMock(return_value=mock_result)
agent = HypothesisAgent(store)
@@ -94,6 +94,7 @@ async def test_hypothesis_agent_uses_embeddings(sample_evidence, mock_assessment
mock_format.return_value = "Prompt"
mock_result = MagicMock()
+ type(mock_result).data = mock_assessment # pydantic-ai uses .data for structured output
mock_result.output = mock_assessment
mock_agent_class.return_value.run = AsyncMock(return_value=mock_result)
diff --git a/tests/unit/agents/test_input_parser.py b/tests/unit/agents/test_input_parser.py
index fd4f4a240c4c2387bad38b079952f2c5af04a35c..ea2736a9519875e4905399bbbb33659aafe3674f 100644
--- a/tests/unit/agents/test_input_parser.py
+++ b/tests/unit/agents/test_input_parser.py
@@ -18,6 +18,13 @@ def mock_model() -> MagicMock:
return model
+@pytest.fixture(autouse=True)
+def patch_infer_model(mock_model: MagicMock):
+ """Auto-patch infer_model for all tests to avoid OpenAI API key requirements."""
+ with patch("pydantic_ai.models.infer_model", return_value=mock_model):
+ yield
+
+
@pytest.fixture
def mock_parsed_query_iterative() -> ParsedQuery:
"""Create a mock ParsedQuery for iterative mode."""
@@ -51,7 +58,9 @@ def mock_agent_result_iterative(
mock_parsed_query_iterative: ParsedQuery,
) -> RunResult[ParsedQuery]:
"""Create a mock agent result for iterative mode."""
- result = MagicMock(spec=RunResult)
+ result = MagicMock()
+ # Configure the mock to return the actual output when .data is accessed
+ type(result).data = mock_parsed_query_iterative
result.output = mock_parsed_query_iterative
return result
@@ -61,7 +70,9 @@ def mock_agent_result_deep(
mock_parsed_query_deep: ParsedQuery,
) -> RunResult[ParsedQuery]:
"""Create a mock agent result for deep mode."""
- result = MagicMock(spec=RunResult)
+ result = MagicMock()
+ # Configure the mock to return the actual output when .data is accessed
+ type(result).data = mock_parsed_query_deep
result.output = mock_parsed_query_deep
return result
@@ -72,33 +83,52 @@ def input_parser_agent(mock_model: MagicMock) -> InputParserAgent:
return InputParserAgent(model=mock_model)
class TestInputParserAgentInit:
"""Test InputParserAgent initialization."""
- def test_input_parser_agent_init_with_model(self, mock_model: MagicMock) -> None:
+ @patch("pydantic_ai.models.infer_model")
+ def test_input_parser_agent_init_with_model(
+ self, mock_infer_model: MagicMock, mock_model: MagicMock
+ ) -> None:
"""Test InputParserAgent initialization with provided model."""
+ mock_infer_model.return_value = mock_model
agent = InputParserAgent(model=mock_model)
assert agent.model == mock_model
assert agent.agent is not None
@patch("src.agents.input_parser.get_model")
+ @patch("pydantic_ai.models.infer_model")
def test_input_parser_agent_init_without_model(
- self, mock_get_model: MagicMock, mock_model: MagicMock
+ self,
+ mock_infer_model: MagicMock,
+ mock_get_model: MagicMock,
+ mock_model: MagicMock,
) -> None:
"""Test InputParserAgent initialization without model (uses default)."""
mock_get_model.return_value = mock_model
+ mock_infer_model.return_value = mock_model
agent = InputParserAgent()
assert agent.model == mock_model
mock_get_model.assert_called_once()
+ @patch("pydantic_ai.models.infer_model")
def test_input_parser_agent_has_correct_system_prompt(
- self, input_parser_agent: InputParserAgent
+ self, mock_infer_model: MagicMock, mock_model: MagicMock
) -> None:
"""Test that InputParserAgent has correct system prompt."""
+ mock_infer_model.return_value = mock_model
+ agent = InputParserAgent(model=mock_model)
# System prompt should contain key instructions
# In pydantic_ai, system_prompt is a property that returns the prompt string
# For mocked agents, we check that the agent was created with a system prompt
- assert input_parser_agent.agent is not None
+ assert agent.agent is not None
# The actual system prompt is set during agent creation
# We verify the agent exists and was properly initialized
# Note: Direct access to system_prompt may not work with mocks
diff --git a/tests/unit/agents/test_long_writer.py b/tests/unit/agents/test_long_writer.py
index 771c27e66b4fa8618cd0a80cdb7d63fdf6d447c7..a2d4bee0c5b62a2c82ef6fd970f4b76b7b676938 100644
--- a/tests/unit/agents/test_long_writer.py
+++ b/tests/unit/agents/test_long_writer.py
@@ -17,6 +17,13 @@ def mock_model() -> MagicMock:
return model
+@pytest.fixture(autouse=True)
+def patch_infer_model(mock_model: MagicMock):
+ """Auto-patch infer_model for all tests to avoid OpenAI API key requirements."""
+ with patch("pydantic_ai.models.infer_model", return_value=mock_model):
+ yield
+
+
@pytest.fixture
def mock_long_writer_output() -> LongWriterOutput:
"""Create a mock LongWriterOutput."""
@@ -31,7 +38,9 @@ def mock_agent_result(
mock_long_writer_output: LongWriterOutput,
) -> RunResult[LongWriterOutput]:
"""Create a mock agent result."""
- result = MagicMock(spec=RunResult)
+ result = MagicMock()
+ # Configure the mock to return the actual output when .data is accessed
+ type(result).data = mock_long_writer_output
result.output = mock_long_writer_output
return result
@@ -340,9 +349,11 @@ class TestWriteReport:
references=["[1] https://example.com/2"],
)
- result1 = MagicMock(spec=RunResult)
+ result1 = MagicMock()
+ type(result1).data = output1 # pydantic-ai uses .data for structured output
result1.output = output1
- result2 = MagicMock(spec=RunResult)
+ result2 = MagicMock()
+ type(result2).data = output2 # pydantic-ai uses .data for structured output
result2.output = output2
results = [result1, result2]
long_writer_agent.agent.run = AsyncMock(side_effect=results)
diff --git a/tests/unit/agents/test_proofreader.py b/tests/unit/agents/test_proofreader.py
index bb21aa50c6c04981ad6ecd55462ba87a9a675cf3..18eab15d3137732f8a9409b681ceb917d5886917 100644
--- a/tests/unit/agents/test_proofreader.py
+++ b/tests/unit/agents/test_proofreader.py
@@ -18,6 +18,13 @@ def mock_model() -> MagicMock:
return model
+@pytest.fixture(autouse=True)
+def patch_infer_model(mock_model: MagicMock):
+ """Auto-patch infer_model for all tests to avoid OpenAI API key requirements."""
+ with patch("pydantic_ai.models.infer_model", return_value=mock_model):
+ yield
+
+
@pytest.fixture
def mock_agent_result() -> RunResult[Any]:
"""Create a mock agent result."""
diff --git a/tests/unit/agents/test_report_agent.py b/tests/unit/agents/test_report_agent.py
index 4121dd22c7f389e661dd8d4aa2f85eac5a8b33c5..426519371021df25461219ba8a99559805bb3627 100644
--- a/tests/unit/agents/test_report_agent.py
+++ b/tests/unit/agents/test_report_agent.py
@@ -102,6 +102,7 @@ async def test_report_agent_generates_report(
):
mock_get_model.return_value = MagicMock()
mock_result = MagicMock()
+ type(mock_result).data = mock_report # pydantic-ai uses .data for structured output
mock_result.output = mock_report
mock_agent_class.return_value.run = AsyncMock(return_value=mock_result)
diff --git a/tests/unit/agents/test_writer.py b/tests/unit/agents/test_writer.py
index 752ca923c294acc6649cd50a5e788ccc856f12c4..975f7a19bad715319ed1058746454d950f338aeb 100644
--- a/tests/unit/agents/test_writer.py
+++ b/tests/unit/agents/test_writer.py
@@ -18,6 +18,13 @@ def mock_model() -> MagicMock:
return model
+@pytest.fixture(autouse=True)
+def patch_infer_model(mock_model: MagicMock):
+ """Auto-patch infer_model for all tests to avoid OpenAI API key requirements."""
+ with patch("pydantic_ai.models.infer_model", return_value=mock_model):
+ yield
+
+
@pytest.fixture
def mock_agent_result() -> RunResult[Any]:
"""Create a mock agent result."""
diff --git a/tests/unit/middleware/__init__.py b/tests/unit/middleware/__init__.py
index 6471c91c5166c60c60a919b6cdf145781acfca7d..65aa34a2f83ea7450a83f3ed81fd3fa659075b85 100644
--- a/tests/unit/middleware/__init__.py
+++ b/tests/unit/middleware/__init__.py
@@ -7,3 +7,5 @@
+
+
diff --git a/tests/unit/middleware/test_budget_tracker_phase7.py b/tests/unit/middleware/test_budget_tracker_phase7.py
index 8d881e807fd1aec37400f2f6f0244d8fff7efab2..1821ace8788cf1c2e6ec3410d37a072823a676bb 100644
--- a/tests/unit/middleware/test_budget_tracker_phase7.py
+++ b/tests/unit/middleware/test_budget_tracker_phase7.py
@@ -165,3 +165,5 @@ class TestIterationTokenTracking:
+
+
diff --git a/tests/unit/middleware/test_state_machine.py b/tests/unit/middleware/test_state_machine.py
index 730efc03904c9fbaffe584c041abeafec0eda1e3..b014e65fc2134be57bdbb0dbbc50a7d3da392046 100644
--- a/tests/unit/middleware/test_state_machine.py
+++ b/tests/unit/middleware/test_state_machine.py
@@ -362,3 +362,5 @@ class TestContextVarIsolation:
+
+
diff --git a/tests/unit/middleware/test_workflow_manager.py b/tests/unit/middleware/test_workflow_manager.py
index ebfef154f343e86ce748c301d6a795c2d17e8bd8..8df1c7af664918acd8b9abb6630cf69cf4aac966 100644
--- a/tests/unit/middleware/test_workflow_manager.py
+++ b/tests/unit/middleware/test_workflow_manager.py
@@ -292,3 +292,5 @@ class TestWorkflowManager:
+
+
diff --git a/tests/unit/orchestrator/__init__.py b/tests/unit/orchestrator/__init__.py
index ea3c8051b5f54fa058738b93afd3c2268613d964..f4189d35b5a34adaedb86cd55366a755e2b421ff 100644
--- a/tests/unit/orchestrator/__init__.py
+++ b/tests/unit/orchestrator/__init__.py
@@ -7,3 +7,5 @@
+
+
diff --git a/tests/unit/orchestrator/test_graph_orchestrator.py b/tests/unit/orchestrator/test_graph_orchestrator.py
index 4136663f577f8a52cdbc22b888f8719322d8547e..3aa33202203294d53d986d6e9aa3b4d548bcee3c 100644
--- a/tests/unit/orchestrator/test_graph_orchestrator.py
+++ b/tests/unit/orchestrator/test_graph_orchestrator.py
@@ -209,10 +209,12 @@ class TestGraphOrchestrator:
from src.orchestrator.research_flow import IterativeResearchFlow
# Create flow and patch its run method to raise exception
- original_flow = IterativeResearchFlow(
- max_iterations=2,
- max_time_minutes=5,
- )
+ mock_judge = MagicMock()
+ with patch("src.orchestrator.research_flow.create_judge_handler", return_value=mock_judge):
+ original_flow = IterativeResearchFlow(
+ max_iterations=2,
+ max_time_minutes=5,
+ )
orchestrator._iterative_flow = original_flow
with patch.object(original_flow, "run", side_effect=Exception("Test error")):
diff --git a/tests/unit/orchestrator/test_planner_agent.py b/tests/unit/orchestrator/test_planner_agent.py
index 9d479e7a4af0bf3866b500d6468e07897d275fb0..00142f885da873abfa5ae1741347a683c2612d3b 100644
--- a/tests/unit/orchestrator/test_planner_agent.py
+++ b/tests/unit/orchestrator/test_planner_agent.py
@@ -20,7 +20,7 @@ class TestPlannerAgent:
def mock_agent_run_result(self):
"""Create a mock agent run result."""
mock_result = MagicMock()
- mock_result.output = ReportPlan(
+ report_plan = ReportPlan(
background_context="Python is a programming language.",
report_outline=[
ReportPlanSection(
@@ -34,6 +34,8 @@ class TestPlannerAgent:
],
report_title="Python Programming Language Overview",
)
+ type(mock_result).data = report_plan # pydantic-ai uses .data for structured output
+ mock_result.output = report_plan
return mock_result
@pytest.mark.asyncio
@@ -63,11 +65,13 @@ class TestPlannerAgent:
async def test_planner_agent_handles_empty_outline(self, mock_model):
"""PlannerAgent should return fallback plan when outline is empty."""
mock_result = MagicMock()
- mock_result.output = ReportPlan(
+ report_plan = ReportPlan(
background_context="Some context",
report_outline=[], # Empty outline
report_title="Test Report",
)
+ type(mock_result).data = report_plan # pydantic-ai uses .data for structured output
+ mock_result.output = report_plan
mock_agent = AsyncMock()
mock_agent.run = AsyncMock(return_value=mock_result)
diff --git a/tests/unit/orchestrator/test_research_flow.py b/tests/unit/orchestrator/test_research_flow.py
index c9a8f407ec2027feadddc0df68d615197001b3b4..1d5a11d6b7a06728b3dca1afe451ef4008f78196 100644
--- a/tests/unit/orchestrator/test_research_flow.py
+++ b/tests/unit/orchestrator/test_research_flow.py
@@ -1,6 +1,6 @@
"""Unit tests for ResearchFlow classes."""
-from unittest.mock import AsyncMock, patch
+from unittest.mock import AsyncMock, MagicMock, patch
import pytest
@@ -31,6 +31,27 @@ class TestIterativeResearchFlow:
@pytest.fixture
def flow(self, mock_agents):
"""Create an IterativeResearchFlow with mocked agents."""
+ from src.utils.models import JudgeAssessment, AssessmentDetails
+
+ mock_judge = MagicMock()
+ # Mock judge assessment - default to insufficient so loops continue
+ default_assessment = JudgeAssessment(
+ details=AssessmentDetails(
+ mechanism_score=5,
+ mechanism_reasoning="Test reasoning for mechanism assessment",
+ clinical_evidence_score=5,
+ clinical_reasoning="Test reasoning for clinical evidence assessment",
+ drug_candidates=[],
+ key_findings=[],
+ ),
+ sufficient=False,
+ confidence=0.5,
+ recommendation="continue",
+ next_search_queries=[],
+ reasoning="Test assessment for research flow testing purposes",
+ )
+ mock_judge.assess = AsyncMock(return_value=default_assessment)
+
with (
patch("src.orchestrator.research_flow.create_knowledge_gap_agent") as mock_kg,
patch("src.orchestrator.research_flow.create_tool_selector_agent") as mock_ts,
@@ -38,14 +59,18 @@ class TestIterativeResearchFlow:
patch("src.orchestrator.research_flow.create_writer_agent") as mock_writer,
patch("src.orchestrator.research_flow.execute_tool_tasks") as mock_execute,
patch("src.orchestrator.research_flow.get_rag_service") as mock_rag,
+ patch("src.orchestrator.research_flow.create_judge_handler", return_value=mock_judge),
):
mock_kg.return_value = mock_agents["knowledge_gap"]
mock_ts.return_value = mock_agents["tool_selector"]
mock_thinking.return_value = mock_agents["thinking"]
mock_writer.return_value = mock_agents["writer"]
- mock_execute.return_value = {
- "task_1": ToolAgentOutput(output="Finding 1", sources=["url1"]),
- }
+ # execute_tool_tasks is async, so make the mock async
+ async def mock_execute_async(*args, **kwargs):
+ return {
+ "task_1": ToolAgentOutput(output="Finding 1", sources=["url1"]),
+ }
+ mock_execute.side_effect = mock_execute_async
# Mock RAG service to return None to avoid ChromaDB initialization
mock_rag.return_value = None
@@ -54,6 +79,26 @@ class TestIterativeResearchFlow:
@pytest.mark.asyncio
async def test_iterative_flow_completes_when_research_complete(self, flow, mock_agents):
"""IterativeResearchFlow should complete when research is marked complete."""
+ from src.utils.models import JudgeAssessment, AssessmentDetails
+
+ # Mock judge to return sufficient=True so loop completes
+ sufficient_assessment = JudgeAssessment(
+ details=AssessmentDetails(
+ mechanism_score=8,
+ mechanism_reasoning="Strong evidence for mechanism of action",
+ clinical_evidence_score=7,
+ clinical_reasoning="Good support from clinical studies",
+ drug_candidates=["TestDrug"],
+ key_findings=["Finding 1"],
+ ),
+ sufficient=True,
+ confidence=0.9,
+ recommendation="synthesize",
+ next_search_queries=[],
+ reasoning="Evidence is sufficient",
+ )
+ flow.judge_handler.assess = AsyncMock(return_value=sufficient_assessment)
+
# Mock knowledge gap agent to return complete
mock_agents["knowledge_gap"].evaluate = AsyncMock(
return_value=KnowledgeGapOutput(
@@ -202,10 +247,32 @@ class TestDeepResearchFlow:
@pytest.fixture
def flow(self, mock_agents):
"""Create a DeepResearchFlow with mocked agents."""
+ from src.utils.models import JudgeAssessment, AssessmentDetails
+
+ mock_judge = MagicMock()
+ # Mock judge assessment - default to insufficient so loops continue
+ default_assessment = JudgeAssessment(
+ details=AssessmentDetails(
+ mechanism_score=5,
+ mechanism_reasoning="Test reasoning for mechanism assessment",
+ clinical_evidence_score=5,
+ clinical_reasoning="Test reasoning for clinical evidence assessment",
+ drug_candidates=[],
+ key_findings=[],
+ ),
+ sufficient=False,
+ confidence=0.5,
+ recommendation="continue",
+ next_search_queries=[],
+ reasoning="Test assessment for research flow testing purposes",
+ )
+ mock_judge.assess = AsyncMock(return_value=default_assessment)
+
with (
patch("src.orchestrator.research_flow.create_planner_agent") as mock_planner,
patch("src.orchestrator.research_flow.create_long_writer_agent") as mock_long_writer,
patch("src.orchestrator.research_flow.create_proofreader_agent") as mock_proofreader,
+ patch("src.orchestrator.research_flow.create_judge_handler", return_value=mock_judge),
):
mock_planner.return_value = mock_agents["planner"]
mock_long_writer.return_value = mock_agents["long_writer"]
diff --git a/tests/unit/services/test_statistical_analyzer.py b/tests/unit/services/test_statistical_analyzer.py
index d5b2e39aad7c8e29a3f72d9d8b90c53e7294b4cd..978397530a5948e0c1faeeb31069374424471506 100644
--- a/tests/unit/services/test_statistical_analyzer.py
+++ b/tests/unit/services/test_statistical_analyzer.py
@@ -54,9 +54,10 @@ class TestStatisticalAnalyzer:
patch.object(analyzer, "_get_code_executor") as mock_executor,
):
# Mock LLM
- mock_agent.return_value.run = AsyncMock(
- return_value=MagicMock(output="print('SUPPORTED')")
- )
+ mock_code_result = MagicMock()
+ type(mock_code_result).data = "print('SUPPORTED')" # pydantic-ai uses .data
+ mock_code_result.output = "print('SUPPORTED')"
+ mock_agent.return_value.run = AsyncMock(return_value=mock_code_result)
# Mock Modal
mock_executor.return_value.execute.return_value = {
diff --git a/tests/unit/test_app_smoke.py b/tests/unit/test_app_smoke.py
index 74e88245814f12c1d80af1975ddf25b5b0dd634f..22fbed5f9ecb80ca7d55ea75a5aad3d2c25a3ee9 100644
--- a/tests/unit/test_app_smoke.py
+++ b/tests/unit/test_app_smoke.py
@@ -28,6 +28,11 @@ class TestAppSmoke:
# OAuth dependencies may not be available in test environment
# This is acceptable - OAuth is optional functionality
+ # Also skip if HF_TOKEN is not set (required for Gradio OAuth mocking)
+ import os
+ if not os.getenv("HF_TOKEN"):
+ pytest.skip("HF_TOKEN not set - required for Gradio OAuth mocking in tests")
+
try:
demo = create_demo()
assert demo is not None
@@ -35,6 +40,10 @@ class TestAppSmoke:
if "oauth" in str(e).lower() or "itsdangerous" in str(e).lower():
pytest.skip(f"OAuth dependencies not available: {e}")
raise
+ except ValueError as e:
+ if "HF_TOKEN" in str(e) or "huggingface-cli login" in str(e):
+ pytest.skip(f"HF authentication not available: {e}")
+ raise
def test_mcp_tools_importable(self) -> None:
"""MCP tool functions should be importable.
diff --git a/uv.lock b/uv.lock
index 8d086d578d5116973daac51e7abfc7f81162796d..797f9971f239a55a0d02a3ccf246dbb9f226f2da 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1032,6 +1032,7 @@ dependencies = [
{ name = "pytest-cov" },
{ name = "python-dotenv" },
{ name = "requests" },
+ { name = "rpds-py" },
{ name = "sentence-transformers" },
{ name = "structlog" },
{ name = "tenacity" },
@@ -1105,6 +1106,7 @@ requires-dist = [
{ name = "python-dotenv", specifier = ">=1.0" },
{ name = "requests", specifier = ">=2.32.5" },
{ name = "respx", marker = "extra == 'dev'", specifier = ">=0.22.0" },
+ { name = "rpds-py", specifier = ">=0.29.0" },
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.14.6" },
{ name = "sentence-transformers", specifier = ">=2.2.0" },
{ name = "structlog", specifier = ">=24.1" },