diff --git a/.env copy.example b/.env copy.example
deleted file mode 100644
index b8061357538326dd7fad717c627cdcfa5c0b3eb9..0000000000000000000000000000000000000000
--- a/.env copy.example
+++ /dev/null
@@ -1,124 +0,0 @@
-# ============== LLM CONFIGURATION ==============
-
-# Provider: "openai", "anthropic", or "huggingface"
-LLM_PROVIDER=openai
-
-# API Keys (at least one required for full LLM analysis)
-OPENAI_API_KEY=sk-your-key-here
-ANTHROPIC_API_KEY=sk-ant-your-key-here
-
-# Model names (optional - sensible defaults set in config.py)
-# OPENAI_MODEL=gpt-5.1
-# ANTHROPIC_MODEL=claude-sonnet-4-5-20250929
-
-# ============== HUGGINGFACE CONFIGURATION ==============
-
-# HuggingFace Token - enables gated models and higher rate limits
-# Get yours at: https://huggingface.co/settings/tokens
-#
-# WITHOUT HF_TOKEN: Falls back to ungated models (zephyr-7b-beta, Qwen2-7B)
-# WITH HF_TOKEN: Uses gated models (Llama 3.1, Gemma-2) via inference providers
-#
-# For HuggingFace Spaces deployment:
-# Set this as a "Secret" in Space Settings -> Variables and secrets
-# Users/judges don't need their own token - the Space secret is used
-#
-HF_TOKEN=hf_your-token-here
-# Alternative: HUGGINGFACE_API_KEY (same as HF_TOKEN)
-
-# Default HuggingFace model for inference (gated, requires auth)
-# Can be overridden in UI dropdown
-# Latest reasoning models: Qwen3-Next-80B-A3B-Thinking, Qwen3-Next-80B-A3B-Instruct, Llama-3.3-70B-Instruct
-HUGGINGFACE_MODEL=Qwen/Qwen3-Next-80B-A3B-Thinking
-
-# Fallback models for HuggingFace Inference API (comma-separated)
-# Models are tried in order until one succeeds
-# Format: model1,model2,model3
-# Latest reasoning models first, then reliable fallbacks
-# Reasoning models: Qwen3-Next (thinking/instruct), Llama-3.3-70B, Qwen3-235B
-# Fallbacks: Llama-3.1-8B, Zephyr-7B (ungated), Qwen2-7B (ungated)
-HF_FALLBACK_MODELS=Qwen/Qwen3-Next-80B-A3B-Thinking,Qwen/Qwen3-Next-80B-A3B-Instruct,meta-llama/Llama-3.3-70B-Instruct,meta-llama/Llama-3.1-8B-Instruct,HuggingFaceH4/zephyr-7b-beta,Qwen/Qwen2-7B-Instruct
-
-# Override model/provider selection (optional, usually set via UI)
-# HF_MODEL=Qwen/Qwen3-Next-80B-A3B-Thinking
-# HF_PROVIDER=hyperbolic
-
-# ============== EMBEDDING CONFIGURATION ==============
-
-# Embedding Provider: "openai", "local", or "huggingface"
-# Default: "local" (no API key required)
-EMBEDDING_PROVIDER=local
-
-# OpenAI Embedding Model (used if EMBEDDING_PROVIDER=openai)
-OPENAI_EMBEDDING_MODEL=text-embedding-3-small
-
-# Local Embedding Model (sentence-transformers, used if EMBEDDING_PROVIDER=local)
-# BAAI/bge-small-en-v1.5 is newer, faster, and better than all-MiniLM-L6-v2
-LOCAL_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5
-
-# HuggingFace Embedding Model (used if EMBEDDING_PROVIDER=huggingface)
-HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
-
-# ============== AGENT CONFIGURATION ==============
-
-MAX_ITERATIONS=10
-SEARCH_TIMEOUT=30
-LOG_LEVEL=INFO
-
-# Graph-based execution (experimental)
-# USE_GRAPH_EXECUTION=false
-
-# Budget & Rate Limiting
-# DEFAULT_TOKEN_LIMIT=100000
-# DEFAULT_TIME_LIMIT_MINUTES=10
-# DEFAULT_ITERATIONS_LIMIT=10
-
-# ============== WEB SEARCH CONFIGURATION ==============
-
-# Web Search Provider: "serper", "searchxng", "brave", "tavily", or "duckduckgo"
-# Default: "duckduckgo" (no API key required)
-WEB_SEARCH_PROVIDER=duckduckgo
-
-# Serper API Key (for Google search via Serper)
-# SERPER_API_KEY=your-serper-key-here
-
-# SearchXNG Host URL (for self-hosted search)
-# SEARCHXNG_HOST=http://localhost:8080
-
-# Brave Search API Key
-# BRAVE_API_KEY=your-brave-key-here
-
-# Tavily API Key
-# TAVILY_API_KEY=your-tavily-key-here
-
-# ============== EXTERNAL SERVICES ==============
-
-# PubMed (optional - higher rate limits: 10 req/sec vs 3 req/sec)
-NCBI_API_KEY=your-ncbi-key-here
-
-# Modal (optional - for secure code execution sandbox)
-# MODAL_TOKEN_ID=your-modal-token-id
-# MODAL_TOKEN_SECRET=your-modal-token-secret
-
-# ============== VECTOR DATABASE (ChromaDB) ==============
-
-# ChromaDB storage path
-CHROMA_DB_PATH=./chroma_db
-
-# Persist ChromaDB to disk (default: true)
-# CHROMA_DB_PERSIST=true
-
-# Remote ChromaDB server (optional)
-# CHROMA_DB_HOST=localhost
-# CHROMA_DB_PORT=8000
-
-# ============== RAG SERVICE CONFIGURATION ==============
-
-# ChromaDB collection name for RAG
-# RAG_COLLECTION_NAME=deepcritical_evidence
-
-# Number of top results to retrieve from RAG
-# RAG_SIMILARITY_TOP_K=5
-
-# Automatically ingest evidence into RAG
-# RAG_AUTO_INGEST=true
diff --git a/.env.example b/.env.example
index cfea522c8e49c8e8de6145965e6269cbd616b788..b8061357538326dd7fad717c627cdcfa5c0b3eb9 100644
--- a/.env.example
+++ b/.env.example
@@ -1,6 +1,6 @@
# ============== LLM CONFIGURATION ==============
-# Provider: "openai" or "anthropic"
+# Provider: "openai", "anthropic", or "huggingface"
LLM_PROVIDER=openai
# API Keys (at least one required for full LLM analysis)
@@ -8,30 +8,56 @@ OPENAI_API_KEY=sk-your-key-here
ANTHROPIC_API_KEY=sk-ant-your-key-here
# Model names (optional - sensible defaults set in config.py)
-# ANTHROPIC_MODEL=claude-sonnet-4-5-20250929
# OPENAI_MODEL=gpt-5.1
+# ANTHROPIC_MODEL=claude-sonnet-4-5-20250929
-# ============== EMBEDDINGS ==============
-
-# OpenAI Embedding Model (used if LLM_PROVIDER is openai and performing RAG/Embeddings)
-OPENAI_EMBEDDING_MODEL=text-embedding-3-small
-
-# Local Embedding Model (used for local/offline embeddings)
-LOCAL_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
-
-# ============== HUGGINGFACE (FREE TIER) ==============
+# ============== HUGGINGFACE CONFIGURATION ==============
-# HuggingFace Token - enables Llama 3.1 (best quality free model)
+# HuggingFace Token - enables gated models and higher rate limits
# Get yours at: https://huggingface.co/settings/tokens
-#
-# WITHOUT HF_TOKEN: Falls back to ungated models (zephyr-7b-beta)
-# WITH HF_TOKEN: Uses Llama 3.1 8B Instruct (requires accepting license)
+#
+# WITHOUT HF_TOKEN: Falls back to ungated models (zephyr-7b-beta, Qwen2-7B)
+# WITH HF_TOKEN: Uses gated models (Llama 3.1, Gemma-2) via inference providers
#
# For HuggingFace Spaces deployment:
# Set this as a "Secret" in Space Settings -> Variables and secrets
# Users/judges don't need their own token - the Space secret is used
#
HF_TOKEN=hf_your-token-here
+# Alternative: HUGGINGFACE_API_KEY (same as HF_TOKEN)
+
+# Default HuggingFace model for inference (gated, requires auth)
+# Can be overridden in UI dropdown
+# Latest reasoning models: Qwen3-Next-80B-A3B-Thinking, Qwen3-Next-80B-A3B-Instruct, Llama-3.3-70B-Instruct
+HUGGINGFACE_MODEL=Qwen/Qwen3-Next-80B-A3B-Thinking
+
+# Fallback models for HuggingFace Inference API (comma-separated)
+# Models are tried in order until one succeeds
+# Format: model1,model2,model3
+# Latest reasoning models first, then reliable fallbacks
+# Reasoning models: Qwen3-Next (thinking/instruct), Llama-3.3-70B
+# Fallbacks: Llama-3.1-8B, Zephyr-7B (ungated), Qwen2-7B (ungated)
+HF_FALLBACK_MODELS=Qwen/Qwen3-Next-80B-A3B-Thinking,Qwen/Qwen3-Next-80B-A3B-Instruct,meta-llama/Llama-3.3-70B-Instruct,meta-llama/Llama-3.1-8B-Instruct,HuggingFaceH4/zephyr-7b-beta,Qwen/Qwen2-7B-Instruct
+
+# Override model/provider selection (optional, usually set via UI)
+# HF_MODEL=Qwen/Qwen3-Next-80B-A3B-Thinking
+# HF_PROVIDER=hyperbolic
+
+# ============== EMBEDDING CONFIGURATION ==============
+
+# Embedding Provider: "openai", "local", or "huggingface"
+# Default: "local" (no API key required)
+EMBEDDING_PROVIDER=local
+
+# OpenAI Embedding Model (used if EMBEDDING_PROVIDER=openai)
+OPENAI_EMBEDDING_MODEL=text-embedding-3-small
+
+# Local Embedding Model (sentence-transformers, used if EMBEDDING_PROVIDER=local)
+# BAAI/bge-small-en-v1.5 is newer and scores higher on MTEB retrieval than all-MiniLM-L6-v2
+LOCAL_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5
+
+# HuggingFace Embedding Model (used if EMBEDDING_PROVIDER=huggingface)
+HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
# ============== AGENT CONFIGURATION ==============
@@ -39,10 +65,60 @@ MAX_ITERATIONS=10
SEARCH_TIMEOUT=30
LOG_LEVEL=INFO
+# Graph-based execution (experimental)
+# USE_GRAPH_EXECUTION=false
+
+# Budget & Rate Limiting
+# DEFAULT_TOKEN_LIMIT=100000
+# DEFAULT_TIME_LIMIT_MINUTES=10
+# DEFAULT_ITERATIONS_LIMIT=10
+
+# ============== WEB SEARCH CONFIGURATION ==============
+
+# Web Search Provider: "serper", "searchxng", "brave", "tavily", or "duckduckgo"
+# Default: "duckduckgo" (no API key required)
+WEB_SEARCH_PROVIDER=duckduckgo
+
+# Serper API Key (for Google search via Serper)
+# SERPER_API_KEY=your-serper-key-here
+
+# SearchXNG Host URL (for self-hosted search)
+# SEARCHXNG_HOST=http://localhost:8080
+
+# Brave Search API Key
+# BRAVE_API_KEY=your-brave-key-here
+
+# Tavily API Key
+# TAVILY_API_KEY=your-tavily-key-here
+
# ============== EXTERNAL SERVICES ==============
-# PubMed (optional - higher rate limits)
+# PubMed (optional - higher rate limits: 10 req/sec vs 3 req/sec)
NCBI_API_KEY=your-ncbi-key-here
-# Vector Database (optional - for LlamaIndex RAG)
+# Modal (optional - for secure code execution sandbox)
+# MODAL_TOKEN_ID=your-modal-token-id
+# MODAL_TOKEN_SECRET=your-modal-token-secret
+
+# ============== VECTOR DATABASE (ChromaDB) ==============
+
+# ChromaDB storage path
CHROMA_DB_PATH=./chroma_db
+
+# Persist ChromaDB to disk (default: true)
+# CHROMA_DB_PERSIST=true
+
+# Remote ChromaDB server (optional)
+# CHROMA_DB_HOST=localhost
+# CHROMA_DB_PORT=8000
+
+# ============== RAG SERVICE CONFIGURATION ==============
+
+# ChromaDB collection name for RAG
+# RAG_COLLECTION_NAME=deepcritical_evidence
+
+# Number of top results to retrieve from RAG
+# RAG_SIMILARITY_TOP_K=5
+
+# Automatically ingest evidence into RAG
+# RAG_AUTO_INGEST=true
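For reference, a minimal sketch (not the repo's actual implementation) of how a comma-separated `HF_FALLBACK_MODELS` value can be consumed, trying each model in order as the comments above describe:

```python
import os

from huggingface_hub import InferenceClient


def generate_with_fallback(prompt: str) -> str:
    """Try each HF_FALLBACK_MODELS entry in order until one succeeds (sketch)."""
    models = os.getenv("HF_FALLBACK_MODELS", "").split(",")
    last_error: Exception | None = None
    for model in (m.strip() for m in models if m.strip()):
        try:
            client = InferenceClient(model=model, token=os.getenv("HF_TOKEN"))
            return client.text_generation(prompt, max_new_tokens=256)
        except Exception as exc:  # gated model, 404, rate limit, ...
            last_error = exc
    raise RuntimeError("All fallback models failed") from last_error
```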
diff --git a/.github/README.md b/.github/README.md
index 7f9634bf5d792f81f450f88f0d607ab1fe3f2956..c573b60ebe7f6f8264b6e31d32793100968a1965 100644
--- a/.github/README.md
+++ b/.github/README.md
@@ -1,28 +1,3 @@
----
-title: DeepCritical
-emoji: 🧬
-colorFrom: blue
-colorTo: purple
-sdk: gradio
-sdk_version: "6.0.1"
-python_version: "3.11"
-app_file: src/app.py
-hf_oauth: true
-hf_oauth_expiration_minutes: 480
-hf_oauth_scopes:
- - inference-api
-pinned: true
-license: mit
-tags:
- - mcp-in-action-track-enterprise
- - mcp-hackathon
- - drug-repurposing
- - biomedical-ai
- - pydantic-ai
- - llamaindex
- - modal
----
-
[](https://github.com/DeepCritical/GradioDemo)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cfeb4ebd2ef145b6f7da1af5e790d430bae99eff..4481459350e12a71337b5c32f804f13a24b33c62 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -33,19 +33,19 @@ jobs:
- name: Lint with ruff
continue-on-error: true
run: |
- uv run ruff check . --exclude tests
- uv run ruff format --check . --exclude tests
+ uv run ruff check . --exclude tests --exclude reference_repos
+ uv run ruff format --check . --exclude tests --exclude reference_repos
- name: Type check with mypy
continue-on-error: true
run: |
- uv run mypy src
+ uv run mypy src --ignore-missing-imports
- - name: Run unit tests (No Black Box Apis)
+ - name: Run unit tests (No OpenAI/Anthropic, HuggingFace only)
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
- uv run pytest tests/unit/ -v -m "not openai and not embedding_provider" --tb=short -p no:logfire --cov --cov-branch --cov-report=xml
+ uv run pytest tests/unit/ -v -m "not openai and not anthropic and not embedding_provider" --tb=short -p no:logfire --cov --cov-branch --cov-report=xml
- name: Run local embeddings tests
env:
@@ -61,11 +61,11 @@ jobs:
uv run pytest tests/integration/ -v -m "huggingface and not embedding_provider" --tb=short -p no:logfire --cov --cov-branch --cov-report=xml --cov-append || true
continue-on-error: true # Allow failures if HF_TOKEN not set
- - name: Run non-OpenAI integration tests (excluding embedding providers)
+ - name: Run non-OpenAI/Anthropic integration tests (excluding embedding providers)
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
- uv run pytest tests/integration/ -v -m "integration and not openai and not embedding_provider" --tb=short -p no:logfire --cov --cov-branch --cov-report=xml --cov-append || true
+ uv run pytest tests/integration/ -v -m "integration and not openai and not anthropic and not embedding_provider" --tb=short -p no:logfire --cov --cov-branch --cov-report=xml --cov-append || true
continue-on-error: true # Allow failures if dependencies not available
- name: Upload coverage reports to Codecov
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0d08dd3bf813709c4c4df5a8fc5f6ebdb16c84f3..66993b5ec97b1bfa659fc9cdc9b3a323372d56ee 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,16 +1,16 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.4.4
+ rev: v0.14.7 # Compatible with ruff>=0.14.6 (matches CI)
hooks:
- id: ruff
- args: [--fix, --exclude, tests]
+ args: [--fix, --exclude, tests, --exclude, reference_repos]
exclude: ^reference_repos/
- id: ruff-format
- args: [--exclude, tests]
+ args: [--exclude, tests, --exclude, reference_repos]
exclude: ^reference_repos/
- repo: https://github.com/pre-commit/mirrors-mypy
- rev: v1.10.0
+ rev: v1.18.2 # Matches CI version mypy>=1.18.2
hooks:
- id: mypy
files: ^src/
diff --git a/docs/api/agents.md b/docs/api/agents.md
index 9670001b3025c26ba041371a2faded2153b01ea8..8f0fa38939da25884c2dfef878ca84f94c7762fb 100644
--- a/docs/api/agents.md
+++ b/docs/api/agents.md
@@ -262,3 +262,5 @@ def create_input_parser_agent(model: Any | None = None) -> InputParserAgent
+
+
diff --git a/docs/api/models.md b/docs/api/models.md
index 22c35704b4bd1c5b30aea3f60166d594838a7350..f226647a52dc2d324877ce12e9311feffb8df591 100644
--- a/docs/api/models.md
+++ b/docs/api/models.md
@@ -240,3 +240,5 @@ class BudgetStatus(BaseModel):
+
+
diff --git a/docs/api/orchestrators.md b/docs/api/orchestrators.md
index 9c241236c7473b0e48f8e899ecd809553f3f5a8d..27c52249fc18fbcdb893036cdfcb4472e5d2f99e 100644
--- a/docs/api/orchestrators.md
+++ b/docs/api/orchestrators.md
@@ -187,3 +187,5 @@ Runs Magentic orchestration.
+
+
diff --git a/docs/api/services.md b/docs/api/services.md
index f276a342b2f7b998ce5a3a8e0610cc44c315b3cc..30edfc557afb8872d4262c5cdb4ebb2e149f46af 100644
--- a/docs/api/services.md
+++ b/docs/api/services.md
@@ -193,3 +193,5 @@ Analyzes a hypothesis using statistical methods.
+
+
diff --git a/docs/api/tools.md b/docs/api/tools.md
index b86993babad67b25cb06712a3136a69232cd2bbf..b93cd31e37e7a31413fec0ec282424fe6ae0ca82 100644
--- a/docs/api/tools.md
+++ b/docs/api/tools.md
@@ -227,3 +227,5 @@ Searches multiple tools in parallel.
+
+
diff --git a/docs/architecture/agents.md b/docs/architecture/agents.md
index d6599f11288888234009e325f1d20e695d7367fa..b65da9e379c329fc478bf7c9fe3ff4ca4c40745a 100644
--- a/docs/architecture/agents.md
+++ b/docs/architecture/agents.md
@@ -184,3 +184,5 @@ Factory functions:
+
+
diff --git a/docs/architecture/middleware.md b/docs/architecture/middleware.md
index 9d2f570d342774807910f450bceb49f08d79391c..82058ccf979591845b8c5ab87e42913ce8a62458 100644
--- a/docs/architecture/middleware.md
+++ b/docs/architecture/middleware.md
@@ -134,3 +134,5 @@ All middleware components use `ContextVar` for thread-safe isolation:
+
+
diff --git a/docs/architecture/services.md b/docs/architecture/services.md
index 1c9ca8099840c455f8f9d9aeff22151d90f26167..fda7c8367aac5c7f2a907f2c45372a91d7a7fc64 100644
--- a/docs/architecture/services.md
+++ b/docs/architecture/services.md
@@ -134,3 +134,5 @@ if settings.has_openai_key:
+
+
diff --git a/docs/architecture/tools.md b/docs/architecture/tools.md
index e3ab4820b5ca3146939393ea86f0cd56c2fc7e2e..7ddbe7eaaf0a579ddba89c63506ba37560d33405 100644
--- a/docs/architecture/tools.md
+++ b/docs/architecture/tools.md
@@ -167,3 +167,5 @@ search_handler = SearchHandler(
+
+
diff --git a/docs/contributing/code-quality.md b/docs/contributing/code-quality.md
index 003b98aa4aa58b0e6479863860c18db19609546e..b15ec66c60f46d285179fd83f5abc14a695a2a20 100644
--- a/docs/contributing/code-quality.md
+++ b/docs/contributing/code-quality.md
@@ -73,3 +73,5 @@ async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
+
+
diff --git a/docs/contributing/code-style.md b/docs/contributing/code-style.md
index 6de664edcf801cad33e4a034a3af85a28b09f9ca..6a0ca8c0d62f7cff541a2abef854ffe49fa89ef8 100644
--- a/docs/contributing/code-style.md
+++ b/docs/contributing/code-style.md
@@ -53,3 +53,5 @@ result = await loop.run_in_executor(None, cpu_bound_function, args)
+
+
diff --git a/docs/contributing/error-handling.md b/docs/contributing/error-handling.md
index b1b55441cde24c94f54f3576d645e6b0731c7348..5d3ead5b23c77d8970f236b460b5668a40a1d566 100644
--- a/docs/contributing/error-handling.md
+++ b/docs/contributing/error-handling.md
@@ -61,3 +61,5 @@ except httpx.HTTPError as e:
+
+
diff --git a/docs/contributing/implementation-patterns.md b/docs/contributing/implementation-patterns.md
index 4f4075561edd03263e723e84cee784927ebc6cb6..d2cf076c39f24f6f42611c9bbd0bcff4ff05ee8a 100644
--- a/docs/contributing/implementation-patterns.md
+++ b/docs/contributing/implementation-patterns.md
@@ -76,3 +76,5 @@ def get_embedding_service() -> EmbeddingService:
+
+
diff --git a/docs/contributing/index.md b/docs/contributing/index.md
index 5c13e76d0b3310847b800160c64e21c232a8bb98..6fab401289f8a568b36096eb201bfe0453b3a6d3 100644
--- a/docs/contributing/index.md
+++ b/docs/contributing/index.md
@@ -155,3 +155,5 @@ Thank you for contributing to DeepCritical!
+
+
diff --git a/docs/contributing/prompt-engineering.md b/docs/contributing/prompt-engineering.md
index d02e67c11b449b0d4c24c54eb796155550f186d8..a1bae2444bb669cddb7d1e3c81081422420ee820 100644
--- a/docs/contributing/prompt-engineering.md
+++ b/docs/contributing/prompt-engineering.md
@@ -61,3 +61,5 @@ This document outlines prompt engineering guidelines and citation validation rul
+
+
diff --git a/docs/contributing/testing.md b/docs/contributing/testing.md
index 393a7f7efc638574a35812ba82f0176f00f89ab1..ebb1b21477c34a34c39cd8d49e1d898b684527ab 100644
--- a/docs/contributing/testing.md
+++ b/docs/contributing/testing.md
@@ -57,3 +57,5 @@ async def test_real_pubmed_search():
+
+
diff --git a/docs/getting-started/examples.md b/docs/getting-started/examples.md
index 214f12f4f5d7b7d4ae8c09ba14af8a43f45ec448..e71e7b8360070341f38f526d1e2df344980e246a 100644
--- a/docs/getting-started/examples.md
+++ b/docs/getting-started/examples.md
@@ -201,3 +201,5 @@ USE_GRAPH_EXECUTION=true
+
+
diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md
index b29e03881c75941b1d034081e434da9fddb544ff..861e1ef751221b4844daad8221430067a71699e1 100644
--- a/docs/getting-started/installation.md
+++ b/docs/getting-started/installation.md
@@ -140,3 +140,5 @@ uv run pre-commit install
+
+
diff --git a/docs/getting-started/mcp-integration.md b/docs/getting-started/mcp-integration.md
index 87b2294fca6d956a37b9b47ecf6bceae2d476f94..28cb0806a9b669212221c13367a0326b7de0d14b 100644
--- a/docs/getting-started/mcp-integration.md
+++ b/docs/getting-started/mcp-integration.md
@@ -207,3 +207,5 @@ You can configure multiple DeepCritical instances:
+
+
diff --git a/docs/getting-started/quick-start.md b/docs/getting-started/quick-start.md
index ce36c4b6cc2c5492e12064747b0939895be67107..9c927dbe5cb373d4c4a289ca626d25c72d39610e 100644
--- a/docs/getting-started/quick-start.md
+++ b/docs/getting-started/quick-start.md
@@ -111,3 +111,5 @@ What are the active clinical trials investigating Alzheimer's disease treatments
+
+
diff --git a/docs/license.md b/docs/license.md
index 96da2dd2b44cb7d16e348309109d864255f6c9d4..18466be89051cf1fbcf15385a2eddb2875276a13 100644
--- a/docs/license.md
+++ b/docs/license.md
@@ -31,3 +31,5 @@ SOFTWARE.
+
+
diff --git a/docs/overview/architecture.md b/docs/overview/architecture.md
index 7d66e309012d9a27211f930b07884878ef01c070..e3c55c3d7eda510f0aca206f9113a4fef2055c71 100644
--- a/docs/overview/architecture.md
+++ b/docs/overview/architecture.md
@@ -188,3 +188,5 @@ The system supports complex research workflows through:
+
+
diff --git a/docs/overview/features.md b/docs/overview/features.md
index 9516164162c92122352771ea063e99f4dab70c0e..c5bbe713deee9b4c5e98aed945bd84cfe55da8e5 100644
--- a/docs/overview/features.md
+++ b/docs/overview/features.md
@@ -140,3 +140,5 @@ DeepCritical provides a comprehensive set of features for AI-assisted research:
+
+
diff --git a/docs/team.md b/docs/team.md
index e1a8bf6bfee5b0df95800884d68fd5e0205be006..e6901a846f7dafd627375238c5d4284ad05fe4c5 100644
--- a/docs/team.md
+++ b/docs/team.md
@@ -36,3 +36,5 @@ We welcome contributions! See the [Contributing Guide](contributing/index.md) fo
+
+
diff --git a/pyproject.toml b/pyproject.toml
index 2c0458ecbaeb6b07c27ba1fc61cb498811bcbf97..d262e758d8c6d5581b3ef6aae0123c13b59105bb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,6 +29,7 @@ dependencies = [
"tokenizers>=0.22.0,<=0.23.0",
"transformers>=4.57.2",
"chromadb>=0.4.0",
+ "rpds-py>=0.29.0", # Python implementation of rpds (required by chromadb on Windows)
"sentence-transformers>=2.2.0",
"numpy<2.0",
"agent-framework-core>=1.0.0b251120,<2.0.0",
diff --git a/requirements.txt b/requirements.txt
index 21bdb3ca584609dd2ab695444e7eae639ca34b79..a50255a27c2a7e2568e6328e9f632f125eb609a8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,40 +9,53 @@ pydantic>=2.7
pydantic-settings>=2.2
pydantic-ai>=0.0.16
-
# OPTIONAL AI Providers
openai>=1.0.0
-# anthropic>=0.18.0
-
-# Multi-agent orchestration (Advanced mode)
-agent-framework-core>=1.0.0b251120
-
-# Web search
-duckduckgo-search>=5.0
+anthropic>=0.18.0
# HTTP & Parsing
httpx>=0.27
beautifulsoup4>=4.12
xmltodict>=0.13
+# HuggingFace Hub
+huggingface-hub>=0.20.0
+
# UI (Gradio with MCP server support)
-gradio[mcp]>=6.0.0
+gradio[mcp,oauth]>=6.0.0
# Utils
python-dotenv>=1.0
tenacity>=8.2
structlog>=24.1
requests>=2.32.5
-limits>=3.0 # Rate limiting
+limits>=3.0 # Rate limiting
+pydantic-graph>=1.22.0
-# Optional: Modal for code execution
-modal>=0.63.0
+# Web search
+duckduckgo-search>=5.0
-# Optional: LlamaIndex RAG
-llama-index>=0.11.0
-llama-index-llms-openai
-llama-index-llms-huggingface
-llama-index-embeddings-openai
-llama-index-vector-stores-chroma
+# Multi-agent orchestration (Advanced mode)
+agent-framework-core>=1.0.0b251120,<2.0.0
+
+# LlamaIndex RAG
+llama-index-llms-huggingface>=0.6.1
+llama-index-llms-huggingface-api>=0.6.1
+llama-index-vector-stores-chroma>=0.5.3
+llama-index>=0.14.8
+llama-index-llms-openai>=0.6.9
+llama-index-embeddings-openai>=0.5.1
+
+# Embeddings & Vector Store
+tokenizers>=0.22.0,<=0.23.0
+transformers>=4.57.2
chromadb>=0.4.0
+rpds-py>=0.29.0  # Python bindings to the Rust rpds crate (required by chromadb on Windows)
sentence-transformers>=2.2.0
+numpy<2.0
+
+# Optional: Modal for code execution
+modal>=0.63.0
+
+# Pydantic AI with HuggingFace support
+pydantic-ai-slim[huggingface]>=0.0.18
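The `rpds-py` pin above addresses a chromadb dependency issue on Windows, per the comment; a quick smoke test (illustrative, not part of the PR) that the compiled bindings resolved:

```python
# Illustrative check that the pinned rpds-py wheel imports and works.
from rpds import HashTrieMap

m = HashTrieMap().insert("key", 1)
assert m["key"] == 1
```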
diff --git a/src/agent_factory/judges.py b/src/agent_factory/judges.py
index 8413d678d1994b89e3f16e6a81bc3de4c8981934..9cd0e14eff838d5ca65bc71c348f2be3fc1c5973 100644
--- a/src/agent_factory/judges.py
+++ b/src/agent_factory/judges.py
@@ -8,10 +8,18 @@ from typing import Any
import structlog
from huggingface_hub import InferenceClient
from pydantic_ai import Agent
-from pydantic_ai.models.anthropic import AnthropicModel
from pydantic_ai.models.openai import OpenAIModel # type: ignore[attr-defined]
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
+# Try to import AnthropicModel (may not be available if anthropic package is missing)
+try:
+ from pydantic_ai.models.anthropic import AnthropicModel
+
+ _ANTHROPIC_AVAILABLE = True
+except ImportError:
+ AnthropicModel = None # type: ignore[assignment, misc]
+ _ANTHROPIC_AVAILABLE = False
+
# Try to import HuggingFace support (may not be available in all pydantic-ai versions)
# According to https://ai.pydantic.dev/models/huggingface/, HuggingFace support requires
# pydantic-ai with huggingface extra or pydantic-ai-slim[huggingface]
@@ -50,6 +58,11 @@ def get_model() -> Any:
llm_provider = settings.llm_provider
if llm_provider == "anthropic":
+ if not _ANTHROPIC_AVAILABLE:
+ raise ImportError(
+ "Anthropic models are not available. "
+ "Please install with: uv add 'pydantic-ai[anthropic]' or use 'openai'/'huggingface' as the LLM provider."
+ )
return AnthropicModel(settings.anthropic_model, api_key=settings.anthropic_api_key) # type: ignore[call-arg]
if llm_provider == "huggingface":
@@ -144,7 +157,7 @@ class JudgeHandler:
try:
# Run the agent with structured output
result = await self.agent.run(user_prompt)
- assessment = result.output # type: ignore[attr-defined]
+ assessment = result.data
logger.info(
"Assessment complete",
diff --git a/src/agents/hypothesis_agent.py b/src/agents/hypothesis_agent.py
index b806396f36243cf81c6020f3b361a6724e75ea02..d946e7e2c72db190bdbaf6393ddf80bf7004676f 100644
--- a/src/agents/hypothesis_agent.py
+++ b/src/agents/hypothesis_agent.py
@@ -75,7 +75,7 @@ class HypothesisAgent(BaseAgent): # type: ignore[misc]
# Generate hypotheses with diverse evidence selection
prompt = await format_hypothesis_prompt(query, evidence, embeddings=self._embeddings)
result = await self._get_agent().run(prompt)
- assessment = result.output # pydantic-ai returns .output for structured output
+ assessment = result.data # type: ignore[attr-defined]
# Store hypotheses in shared context
existing = self._evidence_store.get("hypotheses", [])
diff --git a/src/agents/input_parser.py b/src/agents/input_parser.py
index 897dd4c31fb4079dadd6d362a69def3813f36318..0f23f7092c8744b5a3429452e8b93919fd6abf88 100644
--- a/src/agents/input_parser.py
+++ b/src/agents/input_parser.py
@@ -92,7 +92,7 @@ class InputParserAgent:
try:
# Run the agent
result = await self.agent.run(user_message)
- parsed_query = result.output
+ parsed_query = result.data
# Validate parsed query
if not parsed_query.original_query:
diff --git a/src/agents/judge_agent_llm.py b/src/agents/judge_agent_llm.py
index 52ab9e5519703b18579de22a770e28a97bad27bd..78447df1f0489ece4002fa01287c3bde6353317f 100644
--- a/src/agents/judge_agent_llm.py
+++ b/src/agents/judge_agent_llm.py
@@ -41,5 +41,5 @@ History of previous attempts: {len(history)}
Evaluate validity and sufficiency."""
run_result = await self.agent.run(prompt)
- logger.info("LLM judge assessment complete", sufficient=run_result.output.sufficient)
- return run_result.output # type: ignore[no-any-return]
+ logger.info("LLM judge assessment complete", sufficient=run_result.data.sufficient) # type: ignore[attr-defined]
+ return run_result.data # type: ignore[no-any-return,attr-defined]
diff --git a/src/agents/knowledge_gap.py b/src/agents/knowledge_gap.py
index 2b4b118b885a1d41238a7a63a54997bcc875a0dc..ad3769d1a403998e908b770525230632fe4aebef 100644
--- a/src/agents/knowledge_gap.py
+++ b/src/agents/knowledge_gap.py
@@ -113,7 +113,7 @@ HISTORY OF ACTIONS, FINDINGS AND THOUGHTS:
try:
# Run the agent
result = await self.agent.run(user_message)
- evaluation = result.output
+ evaluation = result.data
self.logger.info(
"Knowledge gap evaluation complete",
diff --git a/src/agents/long_writer.py b/src/agents/long_writer.py
index 8b03a5263e4dea685bc8e07023444a5525ca6223..9014d2b0903798be07499db42fa5689cb5a889a5 100644
--- a/src/agents/long_writer.py
+++ b/src/agents/long_writer.py
@@ -176,7 +176,7 @@ class LongWriterAgent:
try:
# Run the agent
result = await self.agent.run(user_message)
- output = result.output
+ output = result.data
# Validate output
if not output or not isinstance(output, LongWriterOutput):
diff --git a/src/agents/proofreader.py b/src/agents/proofreader.py
index 72aeaf77881a9e8498ada3cb288b8edf135ddf4b..3d85ce1405376ff2a54178ddf52e7eaabacfab9b 100644
--- a/src/agents/proofreader.py
+++ b/src/agents/proofreader.py
@@ -133,7 +133,7 @@ REPORT DRAFT:
try:
# Run the agent
result = await self.agent.run(user_message)
- final_report = result.output
+ final_report = result.data # type: ignore[attr-defined]
# Validate output
if not final_report or not final_report.strip():
@@ -142,7 +142,7 @@ REPORT DRAFT:
self.logger.info("Report proofread", length=len(final_report), attempt=attempt + 1)
- return final_report
+ return final_report # type: ignore[no-any-return]
except (TimeoutError, ConnectionError) as e:
# Transient errors - retry
diff --git a/src/agents/report_agent.py b/src/agents/report_agent.py
index 2d86de86ed646f0f5f7ba870e280b993c072dc40..fbff4d948b94a313ff63cf0169b7fefbe3aad110 100644
--- a/src/agents/report_agent.py
+++ b/src/agents/report_agent.py
@@ -91,7 +91,7 @@ class ReportAgent(BaseAgent): # type: ignore[misc]
)
result = await self._get_agent().run(prompt)
- report = result.output
+ report = result.data # type: ignore[attr-defined]
# ═══════════════════════════════════════════════════════════════════
# 🚨 CRITICAL: Validate citations to prevent hallucination
diff --git a/src/agents/thinking.py b/src/agents/thinking.py
index 230c5801fb6bc6822fa155a2b953046a3b5d0729..bf0c84952b14d7e572134758d28ec2dd711a8c2f 100644
--- a/src/agents/thinking.py
+++ b/src/agents/thinking.py
@@ -112,11 +112,11 @@ HISTORY OF ACTIONS, FINDINGS AND THOUGHTS:
try:
# Run the agent
result = await self.agent.run(user_message)
- observations = result.output
+ observations = result.data # type: ignore[attr-defined]
self.logger.info("Observations generated", length=len(observations))
- return observations
+ return observations # type: ignore[no-any-return]
except Exception as e:
self.logger.error("Observation generation failed", error=str(e))
diff --git a/src/agents/tool_selector.py b/src/agents/tool_selector.py
index 7137906f762786d228bef1a5691712627ee0ff38..dd3aac43006bbd614115b7a687ee2f84e25b5d79 100644
--- a/src/agents/tool_selector.py
+++ b/src/agents/tool_selector.py
@@ -117,7 +117,7 @@ HISTORY OF ACTIONS, FINDINGS AND THOUGHTS:
try:
# Run the agent
result = await self.agent.run(user_message)
- selection_plan = result.output
+ selection_plan = result.data
self.logger.info(
"Tool selection complete",
diff --git a/src/agents/writer.py b/src/agents/writer.py
index 73690f15f415bea42b8dfeb3d681b786b50b59a5..418a5105527b7ae5a7d91f53e2aed1ac7a0b83df 100644
--- a/src/agents/writer.py
+++ b/src/agents/writer.py
@@ -136,7 +136,7 @@ FINDINGS:
try:
# Run the agent
result = await self.agent.run(user_message)
- report = result.output
+ report = result.data # type: ignore[attr-defined]
# Validate output
if not report or not report.strip():
@@ -145,7 +145,7 @@ FINDINGS:
self.logger.info("Report written", length=len(report), attempt=attempt + 1)
- return report
+ return report # type: ignore[no-any-return]
except (TimeoutError, ConnectionError) as e:
# Transient errors - retry
diff --git a/src/app.py b/src/app.py
index 7275673a06bb7f938288ff5a2b95e9d50927229e..d88d931d1dc17d0a85fc47160d58752f00d5824d 100644
--- a/src/app.py
+++ b/src/app.py
@@ -172,20 +172,29 @@ def event_to_chat_message(event: AgentEvent) -> dict[str, Any]:
"content": event.message,
}
- # Build metadata for accordion
+ # Build metadata for accordion according to Gradio ChatMessage spec
+ # Metadata keys: title (str), status ("pending"|"done"), log (str), duration (float)
+ # See: https://www.gradio.app/guides/agents-and-tool-usage
metadata: dict[str, Any] = {}
+
+ # Title is required for accordion display - must be string
if config["title"]:
- metadata["title"] = config["title"]
+ metadata["title"] = str(config["title"])
# Set status (pending shows spinner, done is collapsed)
+ # Must be exactly "pending" or "done" per Gradio spec
if config["status"] == "pending":
metadata["status"] = "pending"
+ elif config["status"] == "done":
+ metadata["status"] = "done"
- # Add duration if available in data
+ # Add duration if available in data (must be float)
if event.data and isinstance(event.data, dict) and "duration" in event.data:
- metadata["duration"] = event.data["duration"]
+ duration = event.data["duration"]
+ if isinstance(duration, int | float):
+ metadata["duration"] = float(duration)
- # Add log info (iteration number, etc.)
+ # Add log info (iteration number, etc.) - must be string
log_parts: list[str] = []
if event.iteration > 0:
log_parts.append(f"Iteration {event.iteration}")
@@ -198,12 +207,22 @@ def event_to_chat_message(event: AgentEvent) -> dict[str, Any]:
metadata["log"] = " | ".join(log_parts)
# Return as dict format for Gradio Chatbot compatibility
- # Gradio Chatbot expects dict format, not gr.ChatMessage objects
+ # According to Gradio docs: https://www.gradio.app/guides/agents-and-tool-usage
+ # ChatMessage format: {"role": "assistant", "content": "...", "metadata": {...}}
+ # Metadata must have "title" key for accordion display
+ # Valid metadata keys: title (str), status ("pending"|"done"), log (str), duration (float)
result: dict[str, Any] = {
"role": "assistant",
"content": event.message,
}
- if metadata:
+ # Only add metadata if it has a title (required for accordion display)
+ # Ensure metadata values match Gradio's expected types
+ if metadata and metadata.get("title"):
+ # Ensure status is valid if present
+ if "status" in metadata:
+ status = metadata["status"]
+ if status not in ("pending", "done"):
+ metadata["status"] = "done" # Default to "done" if invalid
result["metadata"] = metadata
return result
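Concretely, a message produced by `event_to_chat_message()` follows the shape below; the values are invented for illustration, and per the Gradio guide cited in the comments, `title` is what makes the accordion render:

```python
# Example of the message shape event_to_chat_message() returns
# (values are illustrative).
message = {
    "role": "assistant",
    "content": "Searching PubMed for metformin trials...",
    "metadata": {
        "title": "🔍 Search",      # str, required for the accordion
        "status": "pending",       # "pending" shows a spinner, "done" collapses
        "log": "Iteration 2",      # str
        "duration": 1.3,           # float, seconds
    },
}
```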
@@ -455,10 +474,11 @@ async def research_agent(
yield msg
except Exception as e:
+ # Return error message without metadata to avoid issues during example caching
+ # Metadata can cause validation errors when Gradio caches examples
yield {
"role": "assistant",
- "content": f"❌ **Error**: {e!s}",
- "metadata": {"title": "❌ Error", "status": "done"},
+ "content": f"❌ **Error**: {e!s}\n\n*Please check your configuration and try again.*",
}
@@ -681,9 +701,21 @@ def create_demo() -> gr.Blocks:
"**Sign in with HuggingFace** above to access premium models and providers."
),
examples=[
- ["What drugs could be repurposed for Alzheimer's disease?", "simple"],
- ["Is metformin effective for treating cancer?", "simple"],
- ["What medications show promise for Long COVID treatment?", "simple"],
+ # When additional_inputs are provided, examples must be lists of lists
+ # Each inner list: [message, mode, hf_model, hf_provider]
+ [
+ "What drugs could be repurposed for Alzheimer's disease?",
+ "iterative",
+ None,
+ None,
+ ],
+ ["Is metformin effective for treating cancer?", "iterative", None, None],
+ [
+ "What medications show promise for Long COVID treatment?",
+ "iterative",
+ None,
+ None,
+ ],
],
additional_inputs_accordion=gr.Accordion(label="⚙️ Settings", open=False),
additional_inputs=[
diff --git a/src/orchestrator/planner_agent.py b/src/orchestrator/planner_agent.py
index ea560afab9eab23c8fbe22bcd551b63a9b0f4398..110c1bade21c99a72f1c2b3b6d9cb96c2a2b89a5 100644
--- a/src/orchestrator/planner_agent.py
+++ b/src/orchestrator/planner_agent.py
@@ -109,7 +109,7 @@ class PlannerAgent:
try:
# Run the agent
result = await self.agent.run(user_message)
- report_plan = result.output
+ report_plan = result.data
# Validate report plan
if not report_plan.report_outline:
diff --git a/src/services/llamaindex_rag.py b/src/services/llamaindex_rag.py
index 322a6bc4afaa6534431daa96814ef48ee4731f31..00a6da967e191f1f4b2d0dfc4a29fb82f71feed4 100644
--- a/src/services/llamaindex_rag.py
+++ b/src/services/llamaindex_rag.py
@@ -136,7 +136,8 @@ class LlamaIndexRAGService:
}
except ImportError as e:
raise ImportError(
- "LlamaIndex dependencies not installed. Run: uv sync --extra modal"
+ "LlamaIndex dependencies not installed. Required packages: chromadb, llama-index, "
+ "and their dependencies. If rpds is missing, try: uv pip install rpds-py"
) from e
def _configure_embeddings(
diff --git a/src/services/statistical_analyzer.py b/src/services/statistical_analyzer.py
index d43cfa1056ca2248807d54ae50dc7e4dcb025118..09d446c3ba7ebac75c97b6ac8cfa5f606998ea51 100644
--- a/src/services/statistical_analyzer.py
+++ b/src/services/statistical_analyzer.py
@@ -135,7 +135,7 @@ Generate executable Python code to analyze this evidence."""
# Generate code
agent = self._get_agent()
code_result = await agent.run(prompt)
- generated_code = code_result.output
+ generated_code = code_result.data # type: ignore[attr-defined]
# Execute in Modal sandbox
loop = asyncio.get_running_loop()
diff --git a/tests/integration/test_rag_integration.py b/tests/integration/test_rag_integration.py
index 38d3f6ec09900ec3645af00bb181054577c6ce51..6a525bb1ab565c096a04d63316ae393c8ea19421 100644
--- a/tests/integration/test_rag_integration.py
+++ b/tests/integration/test_rag_integration.py
@@ -121,9 +121,12 @@ class TestRAGServiceIntegration:
assert len(response) > 0
assert "python" in response.lower()
except Exception as e:
- # If model is not available (404), skip the test
- if "404" in str(e) or "Not Found" in str(e):
+ # If model is not available (404) or authentication required (401), skip the test
+ error_str = str(e)
+ if "404" in error_str or "Not Found" in error_str:
pytest.skip(f"HuggingFace model not available via inference API: {e}")
+ if "401" in error_str or "Unauthorized" in error_str or "Invalid username or password" in error_str:
+ pytest.skip(f"HuggingFace authentication required but not available: {e}")
raise
# Cleanup
diff --git a/tests/unit/agent_factory/test_judges.py b/tests/unit/agent_factory/test_judges.py
index c2075cdaa3b0d103d5a6b5f5fedb4c0c876356ce..342aa68997467d1fe88d1d31e579ff848d2d3528 100644
--- a/tests/unit/agent_factory/test_judges.py
+++ b/tests/unit/agent_factory/test_judges.py
@@ -34,6 +34,7 @@ class TestJudgeHandler:
# Mock the PydanticAI agent
mock_result = MagicMock()
+ type(mock_result).data = mock_assessment # pydantic-ai uses .data for structured output
mock_result.output = mock_assessment
with (
@@ -88,7 +89,8 @@ class TestJudgeHandler:
)
mock_result = MagicMock()
- mock_result.output = mock_assessment
+ mock_result.data = mock_assessment
+ mock_result.output = mock_assessment # Some code may use .output
with (
patch("src.agent_factory.judges.get_model") as mock_get_model,
diff --git a/tests/unit/agents/test_hypothesis_agent.py b/tests/unit/agents/test_hypothesis_agent.py
index be9b8768b5fed19198ee3d721fff50dd9de8c44c..69772bb1397600b170891a9629499f3aba41bd89 100644
--- a/tests/unit/agents/test_hypothesis_agent.py
+++ b/tests/unit/agents/test_hypothesis_agent.py
@@ -28,18 +28,17 @@ def sample_evidence():
@pytest.fixture
def mock_assessment():
+ primary_hyp = MechanismHypothesis(
+ drug="Metformin",
+ target="AMPK",
+ pathway="mTOR inhibition",
+ effect="Reduced cancer cell proliferation",
+ confidence=0.75,
+ search_suggestions=["metformin AMPK cancer", "mTOR cancer therapy"],
+ )
return HypothesisAssessment(
- hypotheses=[
- MechanismHypothesis(
- drug="Metformin",
- target="AMPK",
- pathway="mTOR inhibition",
- effect="Reduced cancer cell proliferation",
- confidence=0.75,
- search_suggestions=["metformin AMPK cancer", "mTOR cancer therapy"],
- )
- ],
- primary_hypothesis=None,
+ hypotheses=[primary_hyp],
+ primary_hypothesis=primary_hyp, # Set primary hypothesis
knowledge_gaps=["Clinical trial data needed"],
recommended_searches=["metformin clinical trial cancer"],
)
@@ -54,8 +53,9 @@ async def test_hypothesis_agent_generates_hypotheses(sample_evidence, mock_asses
with patch("src.agents.hypothesis_agent.Agent") as mock_agent_class:
mock_get_model.return_value = MagicMock() # Mock model
mock_result = MagicMock()
+ type(mock_result).data = mock_assessment # pydantic-ai uses .data for structured output
mock_result.output = mock_assessment
- # pydantic-ai Agent returns an object with .output for structured output
+ # pydantic-ai Agent returns an object with .data for structured output
mock_agent_class.return_value.run = AsyncMock(return_value=mock_result)
agent = HypothesisAgent(store)
@@ -94,6 +94,7 @@ async def test_hypothesis_agent_uses_embeddings(sample_evidence, mock_assessment
mock_format.return_value = "Prompt"
mock_result = MagicMock()
+ type(mock_result).data = mock_assessment # pydantic-ai uses .data for structured output
mock_result.output = mock_assessment
mock_agent_class.return_value.run = AsyncMock(return_value=mock_result)
diff --git a/tests/unit/agents/test_input_parser.py b/tests/unit/agents/test_input_parser.py
index fd4f4a240c4c2387bad38b079952f2c5af04a35c..ea2736a9519875e4905399bbbb33659aafe3674f 100644
--- a/tests/unit/agents/test_input_parser.py
+++ b/tests/unit/agents/test_input_parser.py
@@ -18,6 +18,13 @@ def mock_model() -> MagicMock:
return model
+@pytest.fixture(autouse=True)
+def patch_infer_model(mock_model: MagicMock):
+ """Auto-patch infer_model for all tests to avoid OpenAI API key requirements."""
+ with patch("pydantic_ai.models.infer_model", return_value=mock_model):
+ yield
+
+
@pytest.fixture
def mock_parsed_query_iterative() -> ParsedQuery:
"""Create a mock ParsedQuery for iterative mode."""
@@ -51,7 +58,9 @@ def mock_agent_result_iterative(
mock_parsed_query_iterative: ParsedQuery,
) -> RunResult[ParsedQuery]:
"""Create a mock agent result for iterative mode."""
- result = MagicMock(spec=RunResult)
+ result = MagicMock()
+ # Configure the mock to return the actual output when .data is accessed
+ type(result).data = mock_parsed_query_iterative
result.output = mock_parsed_query_iterative
return result
@@ -61,7 +70,9 @@ def mock_agent_result_deep(
mock_parsed_query_deep: ParsedQuery,
) -> RunResult[ParsedQuery]:
"""Create a mock agent result for deep mode."""
- result = MagicMock(spec=RunResult)
+ result = MagicMock()
+ # Configure the mock to return the actual output when .data is accessed
+ type(result).data = mock_parsed_query_deep
result.output = mock_parsed_query_deep
return result
@@ -72,33 +83,52 @@ def input_parser_agent(mock_model: MagicMock) -> InputParserAgent:
return InputParserAgent(model=mock_model)
class TestInputParserAgentInit:
"""Test InputParserAgent initialization."""
- def test_input_parser_agent_init_with_model(self, mock_model: MagicMock) -> None:
+ @patch("pydantic_ai.models.infer_model")
+ def test_input_parser_agent_init_with_model(
+ self, mock_infer_model: MagicMock, mock_model: MagicMock
+ ) -> None:
"""Test InputParserAgent initialization with provided model."""
+ mock_infer_model.return_value = mock_model
agent = InputParserAgent(model=mock_model)
assert agent.model == mock_model
assert agent.agent is not None
@patch("src.agents.input_parser.get_model")
+ @patch("pydantic_ai.models.infer_model")
def test_input_parser_agent_init_without_model(
- self, mock_get_model: MagicMock, mock_model: MagicMock
+ self,
+ mock_infer_model: MagicMock,
+ mock_get_model: MagicMock,
+ mock_model: MagicMock,
) -> None:
"""Test InputParserAgent initialization without model (uses default)."""
mock_get_model.return_value = mock_model
+ mock_infer_model.return_value = mock_model
agent = InputParserAgent()
assert agent.model == mock_model
mock_get_model.assert_called_once()
+ @patch("pydantic_ai.models.infer_model")
def test_input_parser_agent_has_correct_system_prompt(
- self, input_parser_agent: InputParserAgent
+ self, mock_infer_model: MagicMock, mock_model: MagicMock
) -> None:
"""Test that InputParserAgent has correct system prompt."""
+ mock_infer_model.return_value = mock_model
+ agent = InputParserAgent(model=mock_model)
# System prompt should contain key instructions
# In pydantic_ai, system_prompt is a property that returns the prompt string
# For mocked agents, we check that the agent was created with a system prompt
- assert input_parser_agent.agent is not None
+ assert agent.agent is not None
# The actual system prompt is set during agent creation
# We verify the agent exists and was properly initialized
# Note: Direct access to system_prompt may not work with mocks
diff --git a/tests/unit/agents/test_long_writer.py b/tests/unit/agents/test_long_writer.py
index 771c27e66b4fa8618cd0a80cdb7d63fdf6d447c7..a2d4bee0c5b62a2c82ef6fd970f4b76b7b676938 100644
--- a/tests/unit/agents/test_long_writer.py
+++ b/tests/unit/agents/test_long_writer.py
@@ -17,6 +17,13 @@ def mock_model() -> MagicMock:
return model
+@pytest.fixture(autouse=True)
+def patch_infer_model(mock_model: MagicMock):
+ """Auto-patch infer_model for all tests to avoid OpenAI API key requirements."""
+ with patch("pydantic_ai.models.infer_model", return_value=mock_model):
+ yield
+
+
@pytest.fixture
def mock_long_writer_output() -> LongWriterOutput:
"""Create a mock LongWriterOutput."""
@@ -31,7 +38,9 @@ def mock_agent_result(
mock_long_writer_output: LongWriterOutput,
) -> RunResult[LongWriterOutput]:
"""Create a mock agent result."""
- result = MagicMock(spec=RunResult)
+ result = MagicMock()
+ # Configure the mock to return the actual output when .data is accessed
+ type(result).data = mock_long_writer_output
result.output = mock_long_writer_output
return result
@@ -340,9 +349,11 @@ class TestWriteReport:
references=["[1] https://example.com/2"],
)
- result1 = MagicMock(spec=RunResult)
+ result1 = MagicMock()
+ type(result1).data = output1 # pydantic-ai uses .data for structured output
result1.output = output1
- result2 = MagicMock(spec=RunResult)
+ result2 = MagicMock()
+ type(result2).data = output2 # pydantic-ai uses .data for structured output
result2.output = output2
results = [result1, result2]
long_writer_agent.agent.run = AsyncMock(side_effect=results)
diff --git a/tests/unit/agents/test_proofreader.py b/tests/unit/agents/test_proofreader.py
index bb21aa50c6c04981ad6ecd55462ba87a9a675cf3..18eab15d3137732f8a9409b681ceb917d5886917 100644
--- a/tests/unit/agents/test_proofreader.py
+++ b/tests/unit/agents/test_proofreader.py
@@ -18,6 +18,13 @@ def mock_model() -> MagicMock:
return model
+@pytest.fixture(autouse=True)
+def patch_infer_model(mock_model: MagicMock):
+ """Auto-patch infer_model for all tests to avoid OpenAI API key requirements."""
+ with patch("pydantic_ai.models.infer_model", return_value=mock_model):
+ yield
+
+
@pytest.fixture
def mock_agent_result() -> RunResult[Any]:
"""Create a mock agent result."""
diff --git a/tests/unit/agents/test_report_agent.py b/tests/unit/agents/test_report_agent.py
index 4121dd22c7f389e661dd8d4aa2f85eac5a8b33c5..426519371021df25461219ba8a99559805bb3627 100644
--- a/tests/unit/agents/test_report_agent.py
+++ b/tests/unit/agents/test_report_agent.py
@@ -102,6 +102,7 @@ async def test_report_agent_generates_report(
):
mock_get_model.return_value = MagicMock()
mock_result = MagicMock()
+ type(mock_result).data = mock_report # pydantic-ai uses .data for structured output
mock_result.output = mock_report
mock_agent_class.return_value.run = AsyncMock(return_value=mock_result)
diff --git a/tests/unit/agents/test_writer.py b/tests/unit/agents/test_writer.py
index 752ca923c294acc6649cd50a5e788ccc856f12c4..975f7a19bad715319ed1058746454d950f338aeb 100644
--- a/tests/unit/agents/test_writer.py
+++ b/tests/unit/agents/test_writer.py
@@ -18,6 +18,13 @@ def mock_model() -> MagicMock:
return model
+@pytest.fixture(autouse=True)
+def patch_infer_model(mock_model: MagicMock):
+ """Auto-patch infer_model for all tests to avoid OpenAI API key requirements."""
+ with patch("pydantic_ai.models.infer_model", return_value=mock_model):
+ yield
+
+
@pytest.fixture
def mock_agent_result() -> RunResult[Any]:
"""Create a mock agent result."""
diff --git a/tests/unit/middleware/__init__.py b/tests/unit/middleware/__init__.py
index 6471c91c5166c60c60a919b6cdf145781acfca7d..65aa34a2f83ea7450a83f3ed81fd3fa659075b85 100644
--- a/tests/unit/middleware/__init__.py
+++ b/tests/unit/middleware/__init__.py
@@ -7,3 +7,5 @@
+
+
diff --git a/tests/unit/middleware/test_budget_tracker_phase7.py b/tests/unit/middleware/test_budget_tracker_phase7.py
index 8d881e807fd1aec37400f2f6f0244d8fff7efab2..1821ace8788cf1c2e6ec3410d37a072823a676bb 100644
--- a/tests/unit/middleware/test_budget_tracker_phase7.py
+++ b/tests/unit/middleware/test_budget_tracker_phase7.py
@@ -165,3 +165,5 @@ class TestIterationTokenTracking:
+
+
diff --git a/tests/unit/middleware/test_state_machine.py b/tests/unit/middleware/test_state_machine.py
index 730efc03904c9fbaffe584c041abeafec0eda1e3..b014e65fc2134be57bdbb0dbbc50a7d3da392046 100644
--- a/tests/unit/middleware/test_state_machine.py
+++ b/tests/unit/middleware/test_state_machine.py
@@ -362,3 +362,5 @@ class TestContextVarIsolation:
+
+
diff --git a/tests/unit/middleware/test_workflow_manager.py b/tests/unit/middleware/test_workflow_manager.py
index ebfef154f343e86ce748c301d6a795c2d17e8bd8..8df1c7af664918acd8b9abb6630cf69cf4aac966 100644
--- a/tests/unit/middleware/test_workflow_manager.py
+++ b/tests/unit/middleware/test_workflow_manager.py
@@ -292,3 +292,5 @@ class TestWorkflowManager:
+
+
diff --git a/tests/unit/orchestrator/__init__.py b/tests/unit/orchestrator/__init__.py
index ea3c8051b5f54fa058738b93afd3c2268613d964..f4189d35b5a34adaedb86cd55366a755e2b421ff 100644
--- a/tests/unit/orchestrator/__init__.py
+++ b/tests/unit/orchestrator/__init__.py
@@ -7,3 +7,5 @@
+
+
diff --git a/tests/unit/orchestrator/test_graph_orchestrator.py b/tests/unit/orchestrator/test_graph_orchestrator.py
index 4136663f577f8a52cdbc22b888f8719322d8547e..3aa33202203294d53d986d6e9aa3b4d548bcee3c 100644
--- a/tests/unit/orchestrator/test_graph_orchestrator.py
+++ b/tests/unit/orchestrator/test_graph_orchestrator.py
@@ -209,10 +209,12 @@ class TestGraphOrchestrator:
from src.orchestrator.research_flow import IterativeResearchFlow
# Create flow and patch its run method to raise exception
- original_flow = IterativeResearchFlow(
- max_iterations=2,
- max_time_minutes=5,
- )
+ mock_judge = MagicMock()
+ with patch("src.orchestrator.research_flow.create_judge_handler", return_value=mock_judge):
+ original_flow = IterativeResearchFlow(
+ max_iterations=2,
+ max_time_minutes=5,
+ )
orchestrator._iterative_flow = original_flow
with patch.object(original_flow, "run", side_effect=Exception("Test error")):
diff --git a/tests/unit/orchestrator/test_planner_agent.py b/tests/unit/orchestrator/test_planner_agent.py
index 9d479e7a4af0bf3866b500d6468e07897d275fb0..00142f885da873abfa5ae1741347a683c2612d3b 100644
--- a/tests/unit/orchestrator/test_planner_agent.py
+++ b/tests/unit/orchestrator/test_planner_agent.py
@@ -20,7 +20,7 @@ class TestPlannerAgent:
def mock_agent_run_result(self):
"""Create a mock agent run result."""
mock_result = MagicMock()
- mock_result.output = ReportPlan(
+ report_plan = ReportPlan(
background_context="Python is a programming language.",
report_outline=[
ReportPlanSection(
@@ -34,6 +34,8 @@ class TestPlannerAgent:
],
report_title="Python Programming Language Overview",
)
+ type(mock_result).data = report_plan # pydantic-ai uses .data for structured output
+ mock_result.output = report_plan
return mock_result
@pytest.mark.asyncio
@@ -63,11 +65,13 @@ class TestPlannerAgent:
async def test_planner_agent_handles_empty_outline(self, mock_model):
"""PlannerAgent should return fallback plan when outline is empty."""
mock_result = MagicMock()
- mock_result.output = ReportPlan(
+ report_plan = ReportPlan(
background_context="Some context",
report_outline=[], # Empty outline
report_title="Test Report",
)
+ type(mock_result).data = report_plan # pydantic-ai uses .data for structured output
+ mock_result.output = report_plan
mock_agent = AsyncMock()
mock_agent.run = AsyncMock(return_value=mock_result)
diff --git a/tests/unit/orchestrator/test_research_flow.py b/tests/unit/orchestrator/test_research_flow.py
index c9a8f407ec2027feadddc0df68d615197001b3b4..1d5a11d6b7a06728b3dca1afe451ef4008f78196 100644
--- a/tests/unit/orchestrator/test_research_flow.py
+++ b/tests/unit/orchestrator/test_research_flow.py
@@ -1,6 +1,6 @@
"""Unit tests for ResearchFlow classes."""
-from unittest.mock import AsyncMock, patch
+from unittest.mock import AsyncMock, MagicMock, patch
import pytest
@@ -31,6 +31,27 @@ class TestIterativeResearchFlow:
@pytest.fixture
def flow(self, mock_agents):
"""Create an IterativeResearchFlow with mocked agents."""
+ from src.utils.models import JudgeAssessment, AssessmentDetails
+
+ mock_judge = MagicMock()
+ # Mock judge assessment - default to insufficient so loops continue
+ default_assessment = JudgeAssessment(
+ details=AssessmentDetails(
+ mechanism_score=5,
+ mechanism_reasoning="Test reasoning for mechanism assessment",
+ clinical_evidence_score=5,
+ clinical_reasoning="Test reasoning for clinical evidence assessment",
+ drug_candidates=[],
+ key_findings=[],
+ ),
+ sufficient=False,
+ confidence=0.5,
+ recommendation="continue",
+ next_search_queries=[],
+ reasoning="Test assessment for research flow testing purposes",
+ )
+ mock_judge.assess = AsyncMock(return_value=default_assessment)
+
with (
patch("src.orchestrator.research_flow.create_knowledge_gap_agent") as mock_kg,
patch("src.orchestrator.research_flow.create_tool_selector_agent") as mock_ts,
@@ -38,14 +59,18 @@ class TestIterativeResearchFlow:
patch("src.orchestrator.research_flow.create_writer_agent") as mock_writer,
patch("src.orchestrator.research_flow.execute_tool_tasks") as mock_execute,
patch("src.orchestrator.research_flow.get_rag_service") as mock_rag,
+ patch("src.orchestrator.research_flow.create_judge_handler", return_value=mock_judge),
):
mock_kg.return_value = mock_agents["knowledge_gap"]
mock_ts.return_value = mock_agents["tool_selector"]
mock_thinking.return_value = mock_agents["thinking"]
mock_writer.return_value = mock_agents["writer"]
- mock_execute.return_value = {
- "task_1": ToolAgentOutput(output="Finding 1", sources=["url1"]),
- }
+ # execute_tool_tasks is async, so make the mock async
+ async def mock_execute_async(*args, **kwargs):
+ return {
+ "task_1": ToolAgentOutput(output="Finding 1", sources=["url1"]),
+ }
+ mock_execute.side_effect = mock_execute_async
# Mock RAG service to return None to avoid ChromaDB initialization
mock_rag.return_value = None
@@ -54,6 +79,26 @@ class TestIterativeResearchFlow:
@pytest.mark.asyncio
async def test_iterative_flow_completes_when_research_complete(self, flow, mock_agents):
"""IterativeResearchFlow should complete when research is marked complete."""
+ from src.utils.models import JudgeAssessment, AssessmentDetails
+
+ # Mock judge to return sufficient=True so loop completes
+ sufficient_assessment = JudgeAssessment(
+ details=AssessmentDetails(
+ mechanism_score=8,
+ mechanism_reasoning="Strong evidence for mechanism of action",
+ clinical_evidence_score=7,
+ clinical_reasoning="Good support from clinical studies",
+ drug_candidates=["TestDrug"],
+ key_findings=["Finding 1"],
+ ),
+ sufficient=True,
+ confidence=0.9,
+ recommendation="synthesize",
+ next_search_queries=[],
+ reasoning="Evidence is sufficient",
+ )
+ flow.judge_handler.assess = AsyncMock(return_value=sufficient_assessment)
+
# Mock knowledge gap agent to return complete
mock_agents["knowledge_gap"].evaluate = AsyncMock(
return_value=KnowledgeGapOutput(
@@ -202,10 +247,32 @@ class TestDeepResearchFlow:
@pytest.fixture
def flow(self, mock_agents):
"""Create a DeepResearchFlow with mocked agents."""
+ from src.utils.models import JudgeAssessment, AssessmentDetails
+
+ mock_judge = MagicMock()
+ # Mock judge assessment - default to insufficient so loops continue
+ default_assessment = JudgeAssessment(
+ details=AssessmentDetails(
+ mechanism_score=5,
+ mechanism_reasoning="Test reasoning for mechanism assessment",
+ clinical_evidence_score=5,
+ clinical_reasoning="Test reasoning for clinical evidence assessment",
+ drug_candidates=[],
+ key_findings=[],
+ ),
+ sufficient=False,
+ confidence=0.5,
+ recommendation="continue",
+ next_search_queries=[],
+ reasoning="Test assessment for research flow testing purposes",
+ )
+ mock_judge.assess = AsyncMock(return_value=default_assessment)
+
with (
patch("src.orchestrator.research_flow.create_planner_agent") as mock_planner,
patch("src.orchestrator.research_flow.create_long_writer_agent") as mock_long_writer,
patch("src.orchestrator.research_flow.create_proofreader_agent") as mock_proofreader,
+ patch("src.orchestrator.research_flow.create_judge_handler", return_value=mock_judge),
):
mock_planner.return_value = mock_agents["planner"]
mock_long_writer.return_value = mock_agents["long_writer"]
diff --git a/tests/unit/services/test_statistical_analyzer.py b/tests/unit/services/test_statistical_analyzer.py
index d5b2e39aad7c8e29a3f72d9d8b90c53e7294b4cd..978397530a5948e0c1faeeb31069374424471506 100644
--- a/tests/unit/services/test_statistical_analyzer.py
+++ b/tests/unit/services/test_statistical_analyzer.py
@@ -54,9 +54,10 @@ class TestStatisticalAnalyzer:
patch.object(analyzer, "_get_code_executor") as mock_executor,
):
# Mock LLM
- mock_agent.return_value.run = AsyncMock(
- return_value=MagicMock(output="print('SUPPORTED')")
- )
+ mock_code_result = MagicMock()
+ type(mock_code_result).data = "print('SUPPORTED')" # pydantic-ai uses .data
+ mock_code_result.output = "print('SUPPORTED')"
+ mock_agent.return_value.run = AsyncMock(return_value=mock_code_result)
# Mock Modal
mock_executor.return_value.execute.return_value = {
diff --git a/tests/unit/test_app_smoke.py b/tests/unit/test_app_smoke.py
index 74e88245814f12c1d80af1975ddf25b5b0dd634f..22fbed5f9ecb80ca7d55ea75a5aad3d2c25a3ee9 100644
--- a/tests/unit/test_app_smoke.py
+++ b/tests/unit/test_app_smoke.py
@@ -28,6 +28,11 @@ class TestAppSmoke:
# OAuth dependencies may not be available in test environment
# This is acceptable - OAuth is optional functionality
+ # Also skip if HF_TOKEN is not set (required for Gradio OAuth mocking)
+ import os
+ if not os.getenv("HF_TOKEN"):
+ pytest.skip("HF_TOKEN not set - required for Gradio OAuth mocking in tests")
+
try:
demo = create_demo()
assert demo is not None
@@ -35,6 +40,10 @@ class TestAppSmoke:
if "oauth" in str(e).lower() or "itsdangerous" in str(e).lower():
pytest.skip(f"OAuth dependencies not available: {e}")
raise
+ except ValueError as e:
+ if "HF_TOKEN" in str(e) or "huggingface-cli login" in str(e):
+ pytest.skip(f"HF authentication not available: {e}")
+ raise
def test_mcp_tools_importable(self) -> None:
"""MCP tool functions should be importable.
diff --git a/uv.lock b/uv.lock
index 8d086d578d5116973daac51e7abfc7f81162796d..797f9971f239a55a0d02a3ccf246dbb9f226f2da 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1032,6 +1032,7 @@ dependencies = [
{ name = "pytest-cov" },
{ name = "python-dotenv" },
{ name = "requests" },
+ { name = "rpds-py" },
{ name = "sentence-transformers" },
{ name = "structlog" },
{ name = "tenacity" },
@@ -1105,6 +1106,7 @@ requires-dist = [
{ name = "python-dotenv", specifier = ">=1.0" },
{ name = "requests", specifier = ">=2.32.5" },
{ name = "respx", marker = "extra == 'dev'", specifier = ">=0.22.0" },
+ { name = "rpds-py", specifier = ">=0.29.0" },
{ name = "ruff", marker = "extra == 'dev'", specifier = ">=0.14.6" },
{ name = "sentence-transformers", specifier = ">=2.2.0" },
{ name = "structlog", specifier = ">=24.1" },