Spaces:

DataQuests
/

DeepCritical

Running

App Files Files Community

Joseph Pollack commited on 10 days ago

Commit

ce644a9

unverified ·

1 Parent(s): ca3a4f7

adds or improves : interface , tests, docs , ci , precommit , build , and demo

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.env copy.example +124 -0
.github/README.md +3 -145
.github/workflows/ci.yml +14 -7
.github/workflows/docs.yml +6 -3
.gitignore +2 -0
.pre-commit-hooks/run_pytest_with_sync.py +109 -3
README.md +3 -2
docs/api/agents.md +3 -0
docs/api/models.md +3 -0
docs/api/orchestrators.md +3 -0
docs/api/services.md +3 -0
docs/api/tools.md +3 -0
docs/architecture/agents.md +3 -0
docs/architecture/middleware.md +3 -0
docs/architecture/services.md +3 -0
docs/architecture/tools.md +3 -0
docs/contributing/code-quality.md +3 -0
docs/contributing/code-style.md +3 -0
docs/contributing/error-handling.md +3 -0
docs/contributing/implementation-patterns.md +3 -0
docs/contributing/index.md +3 -0
docs/contributing/prompt-engineering.md +3 -0
docs/contributing/testing.md +3 -0
docs/getting-started/examples.md +3 -0
docs/getting-started/installation.md +3 -0
docs/getting-started/mcp-integration.md +3 -0
docs/getting-started/quick-start.md +3 -0
docs/license.md +3 -0
docs/overview/architecture.md +3 -0
docs/overview/features.md +3 -0
docs/team.md +3 -0
mkdocs.yml +2 -2
pyproject.toml +50 -41
requirements.txt +1 -1
src/agent_factory/judges.py +123 -37
src/agents/hypothesis_agent.py +2 -2
src/agents/input_parser.py +3 -3
src/agents/judge_agent_llm.py +3 -3
src/agents/knowledge_gap.py +3 -3
src/agents/long_writer.py +3 -3
src/agents/report_agent.py +2 -2
src/agents/tool_selector.py +3 -3
src/app.py +223 -18
src/legacy_orchestrator.py +17 -3
src/orchestrator/planner_agent.py +3 -3
src/services/llamaindex_rag.py +1 -1
src/services/statistical_analyzer.py +2 -2
src/utils/config.py +12 -4
src/utils/huggingface_chat_client.py +2 -2
src/utils/inference_models.py +627 -0

.env copy.example ADDED Viewed

	@@ -0,0 +1,124 @@

+# ============== LLM CONFIGURATION ==============
+# Provider: "openai", "anthropic", or "huggingface"
+LLM_PROVIDER=openai
+# API Keys (at least one required for full LLM analysis)
+OPENAI_API_KEY=sk-your-key-here
+ANTHROPIC_API_KEY=sk-ant-your-key-here
+# Model names (optional - sensible defaults set in config.py)
+# OPENAI_MODEL=gpt-5.1
+# ANTHROPIC_MODEL=claude-sonnet-4-5-20250929
+# ============== HUGGINGFACE CONFIGURATION ==============
+# HuggingFace Token - enables gated models and higher rate limits
+# Get yours at: https://huggingface.co/settings/tokens
+#
+# WITHOUT HF_TOKEN: Falls back to ungated models (zephyr-7b-beta, Qwen2-7B)
+# WITH HF_TOKEN: Uses gated models (Llama 3.1, Gemma-2) via inference providers
+#
+# For HuggingFace Spaces deployment:
+#   Set this as a "Secret" in Space Settings -> Variables and secrets
+#   Users/judges don't need their own token - the Space secret is used
+#
+HF_TOKEN=hf_your-token-here
+# Alternative: HUGGINGFACE_API_KEY (same as HF_TOKEN)
+# Default HuggingFace model for inference (gated, requires auth)
+# Can be overridden in UI dropdown
+# Latest reasoning models: Qwen3-Next-80B-A3B-Thinking, Qwen3-Next-80B-A3B-Instruct, Llama-3.3-70B-Instruct
+HUGGINGFACE_MODEL=Qwen/Qwen3-Next-80B-A3B-Thinking
+# Fallback models for HuggingFace Inference API (comma-separated)
+# Models are tried in order until one succeeds
+# Format: model1,model2,model3
+# Latest reasoning models first, then reliable fallbacks
+# Reasoning models: Qwen3-Next (thinking/instruct), Llama-3.3-70B, Qwen3-235B
+# Fallbacks: Llama-3.1-8B, Zephyr-7B (ungated), Qwen2-7B (ungated)
+HF_FALLBACK_MODELS=Qwen/Qwen3-Next-80B-A3B-Thinking,Qwen/Qwen3-Next-80B-A3B-Instruct,meta-llama/Llama-3.3-70B-Instruct,meta-llama/Llama-3.1-8B-Instruct,HuggingFaceH4/zephyr-7b-beta,Qwen/Qwen2-7B-Instruct
+# Override model/provider selection (optional, usually set via UI)
+# HF_MODEL=Qwen/Qwen3-Next-80B-A3B-Thinking
+# HF_PROVIDER=hyperbolic
+# ============== EMBEDDING CONFIGURATION ==============
+# Embedding Provider: "openai", "local", or "huggingface"
+# Default: "local" (no API key required)
+EMBEDDING_PROVIDER=local
+# OpenAI Embedding Model (used if EMBEDDING_PROVIDER=openai)
+OPENAI_EMBEDDING_MODEL=text-embedding-3-small
+# Local Embedding Model (sentence-transformers, used if EMBEDDING_PROVIDER=local)
+# BAAI/bge-small-en-v1.5 is newer, faster, and better than all-MiniLM-L6-v2
+LOCAL_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5
+# HuggingFace Embedding Model (used if EMBEDDING_PROVIDER=huggingface)
+HUGGINGFACE_EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
+# ============== AGENT CONFIGURATION ==============
+MAX_ITERATIONS=10
+SEARCH_TIMEOUT=30
+LOG_LEVEL=INFO
+# Graph-based execution (experimental)
+# USE_GRAPH_EXECUTION=false
+# Budget & Rate Limiting
+# DEFAULT_TOKEN_LIMIT=100000
+# DEFAULT_TIME_LIMIT_MINUTES=10
+# DEFAULT_ITERATIONS_LIMIT=10
+# ============== WEB SEARCH CONFIGURATION ==============
+# Web Search Provider: "serper", "searchxng", "brave", "tavily", or "duckduckgo"
+# Default: "duckduckgo" (no API key required)
+WEB_SEARCH_PROVIDER=duckduckgo
+# Serper API Key (for Google search via Serper)
+# SERPER_API_KEY=your-serper-key-here
+# SearchXNG Host URL (for self-hosted search)
+# SEARCHXNG_HOST=http://localhost:8080
+# Brave Search API Key
+# BRAVE_API_KEY=your-brave-key-here
+# Tavily API Key
+# TAVILY_API_KEY=your-tavily-key-here
+# ============== EXTERNAL SERVICES ==============
+# PubMed (optional - higher rate limits: 10 req/sec vs 3 req/sec)
+NCBI_API_KEY=your-ncbi-key-here
+# Modal (optional - for secure code execution sandbox)
+# MODAL_TOKEN_ID=your-modal-token-id
+# MODAL_TOKEN_SECRET=your-modal-token-secret
+# ============== VECTOR DATABASE (ChromaDB) ==============
+# ChromaDB storage path
+CHROMA_DB_PATH=./chroma_db
+# Persist ChromaDB to disk (default: true)
+# CHROMA_DB_PERSIST=true
+# Remote ChromaDB server (optional)
+# CHROMA_DB_HOST=localhost
+# CHROMA_DB_PORT=8000
+# ============== RAG SERVICE CONFIGURATION ==============
+# ChromaDB collection name for RAG
+# RAG_COLLECTION_NAME=deepcritical_evidence
+# Number of top results to retrieve from RAG
+# RAG_SIMILARITY_TOP_K=5
+# Automatically ingest evidence into RAG
+# RAG_AUTO_INGEST=true

.github/README.md CHANGED Viewed

@@ -28,29 +28,11 @@ tags:
 [![GitHub](https://img.shields.io/github/stars/DeepCritical/GradioDemo?style=for-the-badge&logo=github&logoColor=white&label=🐙%20GitHub&labelColor=181717&color=181717)](https://github.com/DeepCritical/GradioDemo)
 [![Documentation](https://img.shields.io/badge/📚%20Docs-0080FF?style=for-the-badge&logo=readthedocs&logoColor=white&labelColor=0080FF&color=0080FF)](docs/index.md)
 [![Demo](https://img.shields.io/badge/🚀%20Demo-FFD21E?style=for-the-badge&logo=huggingface&logoColor=white&labelColor=FFD21E&color=FFD21E)](https://huggingface.co/spaces/DataQuests/DeepCritical)
-[![CodeCov](https://img.shields.io/badge/📊%20Coverage-F01F7A?style=for-the-badge&logo=codecov&logoColor=white&labelColor=F01F7A&color=F01F7A)](https://codecov.io/gh/DeepCritical/GradioDemo)
 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP)
 </div>
-# DeepCritical
-## Intro
-## Features
-- **Multi-Source Search**: PubMed, ClinicalTrials.gov, bioRxiv/medRxiv
-- **MCP Integration**: Use our tools from Claude Desktop or any MCP client
-- **HuggingFace OAuth**: Sign in with your HuggingFace account to automatically use your API token
-- **Modal Sandbox**: Secure execution of AI-generated statistical code
-- **LlamaIndex RAG**: Semantic search and evidence synthesis
-- **HuggingfaceInference**: Free tier support with automatic fallback
-- **HuggingfaceMCP Custom Config To Use Community Tools**:
-- **Strongly Typed Composable Graphs**:
-- **Specialized Research Teams of Agents**:
 ## Quick Start
 ### 1. Environment Setup
@@ -60,14 +42,14 @@ tags:
 pip install uv
 # Sync dependencies
-uv sync
 ```
 ### 2. Run the UI
 ```bash
 # Start the Gradio app
-uv run gradio run src/app.py
 ```
 Open your browser to `http://localhost:7860`.
@@ -80,11 +62,6 @@ Open your browser to `http://localhost:7860`.
 - No need to manually enter API keys when logged in
 - OAuth token is used only for the current session and never stored
-**Manual API Key (BYOK)**:
-- You can still provide your own API key in the Settings accordion
-- Supports HuggingFace, OpenAI, or Anthropic API keys
-- Manual keys take priority over OAuth tokens
 ### 4. Connect via MCP
 This application exposes a Model Context Protocol (MCP) server, allowing you to use its search tools directly from Claude Desktop or other MCP clients.
@@ -102,122 +79,3 @@ Add this to your `claude_desktop_config.json`:
   }
 }
 ```
-**Available Tools**:
-- `search_pubmed`: Search peer-reviewed biomedical literature.
-- `search_clinical_trials`: Search ClinicalTrials.gov.
-- `search_biorxiv`: Search bioRxiv/medRxiv preprints.
-- `search_all`: Search all sources simultaneously.
-- `analyze_hypothesis`: Secure statistical analysis using Modal sandboxes.
-## Architecture
-DeepCritical uses a Vertical Slice Architecture:
-1.  **Search Slice**: Retrieving evidence from PubMed, ClinicalTrials.gov, and bioRxiv.
-2.  **Judge Slice**: Evaluating evidence quality using LLMs.
-3.  **Orchestrator Slice**: Managing the research loop and UI.
-- iterativeResearch
-- deepResearch
-- researchTeam
-### Iterative Research
-```mermaid
-sequenceDiagram
-    participant IterativeFlow
-    participant ThinkingAgent
-    participant KnowledgeGapAgent
-    participant ToolSelector
-    participant ToolExecutor
-    participant JudgeHandler
-    participant WriterAgent
-    IterativeFlow->>IterativeFlow: run(query)
-    loop Until complete or max_iterations
-        IterativeFlow->>ThinkingAgent: generate_observations()
-        ThinkingAgent-->>IterativeFlow: observations
-        IterativeFlow->>KnowledgeGapAgent: evaluate_gaps()
-        KnowledgeGapAgent-->>IterativeFlow: KnowledgeGapOutput
-        alt Research complete
-            IterativeFlow->>WriterAgent: create_final_report()
-            WriterAgent-->>IterativeFlow: final_report
-        else Gaps remain
-            IterativeFlow->>ToolSelector: select_agents(gap)
-            ToolSelector-->>IterativeFlow: AgentSelectionPlan
-            IterativeFlow->>ToolExecutor: execute_tool_tasks()
-            ToolExecutor-->>IterativeFlow: ToolAgentOutput[]
-            IterativeFlow->>JudgeHandler: assess_evidence()
-            JudgeHandler-->>IterativeFlow: should_continue
-        end
-    end
-```
-### Deep Research
-```mermaid
-sequenceDiagram
-    actor User
-    participant GraphOrchestrator
-    participant InputParser
-    participant GraphBuilder
-    participant GraphExecutor
-    participant Agent
-    participant BudgetTracker
-    participant WorkflowState
-    User->>GraphOrchestrator: run(query)
-    GraphOrchestrator->>InputParser: detect_research_mode(query)
-    InputParser-->>GraphOrchestrator: mode (iterative/deep)
-    GraphOrchestrator->>GraphBuilder: build_graph(mode)
-    GraphBuilder-->>GraphOrchestrator: ResearchGraph
-    GraphOrchestrator->>WorkflowState: init_workflow_state()
-    GraphOrchestrator->>BudgetTracker: create_budget()
-    GraphOrchestrator->>GraphExecutor: _execute_graph(graph)
-    loop For each node in graph
-        GraphExecutor->>Agent: execute_node(agent_node)
-        Agent->>Agent: process_input
-        Agent-->>GraphExecutor: result
-        GraphExecutor->>WorkflowState: update_state(result)
-        GraphExecutor->>BudgetTracker: add_tokens(used)
-        GraphExecutor->>BudgetTracker: check_budget()
-        alt Budget exceeded
-            GraphExecutor->>GraphOrchestrator: emit(error_event)
-        else Continue
-            GraphExecutor->>GraphOrchestrator: emit(progress_event)
-        end
-    end
-    GraphOrchestrator->>User: AsyncGenerator[AgentEvent]
-```
-### Research Team
-Critical Deep Research Agent
-## Development
-### Run Tests
-```bash
-uv run pytest
-```
-### Run Checks
-```bash
-make check
-```
-## Links
-- [GitHub Repository](https://github.com/DeepCritical/GradioDemo)

 [![GitHub](https://img.shields.io/github/stars/DeepCritical/GradioDemo?style=for-the-badge&logo=github&logoColor=white&label=🐙%20GitHub&labelColor=181717&color=181717)](https://github.com/DeepCritical/GradioDemo)
 [![Documentation](https://img.shields.io/badge/📚%20Docs-0080FF?style=for-the-badge&logo=readthedocs&logoColor=white&labelColor=0080FF&color=0080FF)](docs/index.md)
 [![Demo](https://img.shields.io/badge/🚀%20Demo-FFD21E?style=for-the-badge&logo=huggingface&logoColor=white&labelColor=FFD21E&color=FFD21E)](https://huggingface.co/spaces/DataQuests/DeepCritical)
+[![codecov](https://codecov.io/gh/DeepCritical/GradioDemo/graph/badge.svg?token=B1f05RCGpz)](https://codecov.io/gh/DeepCritical/GradioDemo)
 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP)
 </div>
 ## Quick Start
 ### 1. Environment Setup
 pip install uv
 # Sync dependencies
+uv sync --all-extras
 ```
 ### 2. Run the UI
 ```bash
 # Start the Gradio app
+uv run gradio gradio src/app.py
 ```
 Open your browser to `http://localhost:7860`.
 - No need to manually enter API keys when logged in
 - OAuth token is used only for the current session and never stored
 ### 4. Connect via MCP
 This application exposes a Model Context Protocol (MCP) server, allowing you to use its search tools directly from Claude Desktop or other MCP clients.
   }
 }
 ```

.github/workflows/ci.yml CHANGED Viewed

@@ -2,9 +2,9 @@ name: CI
 on:
   push:
-    branches: [main, develop]
   pull_request:
-    branches: [main, develop]
 jobs:
   test:
@@ -28,7 +28,7 @@ jobs:
       - name: Install dependencies
         run: |
-          uv sync --dev
       - name: Lint with ruff
         run: |
@@ -43,25 +43,32 @@ jobs:
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
-          uv run pytest tests/unit/ -v -m "not openai and not embedding_provider" --tb=short -p no:logfire
       - name: Run local embeddings tests
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
-          uv run pytest tests/ -v -m "local_embeddings" --tb=short -p no:logfire || true
         continue-on-error: true  # Allow failures if dependencies not available
       - name: Run HuggingFace integration tests
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
-          uv run pytest tests/integration/ -v -m "huggingface and not embedding_provider" --tb=short -p no:logfire || true
         continue-on-error: true  # Allow failures if HF_TOKEN not set
       - name: Run non-OpenAI integration tests (excluding embedding providers)
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
-          uv run pytest tests/integration/ -v -m "integration and not openai and not embedding_provider" --tb=short -p no:logfire || true
         continue-on-error: true  # Allow failures if dependencies not available

 on:
   push:
+    branches: [main, dev]
   pull_request:
+    branches: [main, dev]
 jobs:
   test:
       - name: Install dependencies
         run: |
+          uv sync --extra dev
       - name: Lint with ruff
         run: |
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
+          uv run pytest tests/unit/ -v -m "not openai and not embedding_provider" --tb=short -p no:logfire --cov --cov-branch --cov-report=xml
       - name: Run local embeddings tests
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
+          uv run pytest tests/ -v -m "local_embeddings" --tb=short -p no:logfire --cov --cov-branch --cov-report=xml --cov-append || true
         continue-on-error: true  # Allow failures if dependencies not available
       - name: Run HuggingFace integration tests
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
+          uv run pytest tests/integration/ -v -m "huggingface and not embedding_provider" --tb=short -p no:logfire --cov --cov-branch --cov-report=xml --cov-append || true
         continue-on-error: true  # Allow failures if HF_TOKEN not set
       - name: Run non-OpenAI integration tests (excluding embedding providers)
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
+          uv run pytest tests/integration/ -v -m "integration and not openai and not embedding_provider" --tb=short -p no:logfire --cov --cov-branch --cov-report=xml --cov-append || true
         continue-on-error: true  # Allow failures if dependencies not available
+      - name: Upload coverage reports to Codecov
+        uses: codecov/codecov-action@v5
+        continue-on-error: true
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          slug: DeepCritical/GradioDemo

.github/workflows/docs.yml CHANGED Viewed

@@ -32,12 +32,13 @@ jobs:
           python-version: '3.11'
       - name: Install uv
-        run: |
-          pip install uv
       - name: Install dependencies
         run: |
-          uv sync --all-extras --dev
       - name: Build documentation
         run: |
@@ -49,7 +50,9 @@ jobs:
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
           publish_dir: ./site
           cname: false

           python-version: '3.11'
       - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          version: "latest"
       - name: Install dependencies
         run: |
+          uv sync --extra dev
       - name: Build documentation
         run: |
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
           publish_dir: ./site
+          publish_branch: gh-pages
           cname: false
+          keep_files: false

.gitignore CHANGED Viewed

@@ -72,6 +72,8 @@ logs/
 .pytest_cache/
 .mypy_cache/
 .coverage
 htmlcov/
 # Database files

 .pytest_cache/
 .mypy_cache/
 .coverage
+.coverage.*
+coverage.xml
 htmlcov/
 # Database files

.pre-commit-hooks/run_pytest_with_sync.py CHANGED Viewed

@@ -1,8 +1,109 @@
 #!/usr/bin/env python3
 """Cross-platform pytest runner that syncs dependencies before running tests."""
 import subprocess
 import sys
 def run_command(
@@ -28,7 +129,6 @@ def run_command(
 def main() -> int:
     """Main entry point."""
     import os
-    from pathlib import Path
     # Get the project root (where pyproject.toml is)
     script_dir = Path(__file__).parent
@@ -37,6 +137,10 @@ def main() -> int:
     # Change to project root to ensure uv works correctly
     os.chdir(project_root)
     # Check if uv is available
     if run_command(["uv", "--version"], check=False) != 0:
         print("Error: uv not found. Please install uv: https://github.com/astral-sh/uv")
@@ -48,8 +152,8 @@ def main() -> int:
     # Sync dependencies - always include dev
     # Note: embeddings dependencies are now in main dependencies, not optional
-    # So we just sync with --dev for all test types
-    sync_cmd = ["uv", "sync", "--dev"]
     print(f"Syncing dependencies for {test_type} tests...")
     if run_command(sync_cmd, cwd=project_root) != 0:
@@ -65,6 +169,7 @@ def main() -> int:
             "--tb=short",
             "-p",
             "no:logfire",
         ]
     elif test_type == "embeddings":
         pytest_args = [
@@ -75,6 +180,7 @@ def main() -> int:
             "--tb=short",
             "-p",
             "no:logfire",
         ]
     else:
         pytest_args = []

 #!/usr/bin/env python3
 """Cross-platform pytest runner that syncs dependencies before running tests."""
+import shutil
 import subprocess
 import sys
+from pathlib import Path
+def clean_caches(project_root: Path) -> None:
+    """Remove pytest and Python cache directories and files.
+    Only scans specific directories (src/, tests/) to avoid resource
+    exhaustion from scanning large directories like .venv on Windows.
+    """
+    # Directories to scan for caches (only project code, not dependencies)
+    scan_dirs = ["src", "tests", ".pre-commit-hooks"]
+    # Directories to exclude (to avoid resource issues)
+    exclude_dirs = {
+        ".venv",
+        "venv",
+        "ENV",
+        "env",
+        ".git",
+        "node_modules",
+        "dist",
+        "build",
+        ".eggs",
+        "reference_repos",
+        "folder",
+    }
+    cache_patterns = [
+        ".pytest_cache",
+        "__pycache__",
+        "*.pyc",
+        "*.pyo",
+        "*.pyd",
+        ".mypy_cache",
+        ".ruff_cache",
+    ]
+    def should_exclude(path: Path) -> bool:
+        """Check if a path should be excluded from cache cleanup."""
+        # Check if any parent directory is in exclude list
+        for parent in path.parents:
+            if parent.name in exclude_dirs:
+                return True
+        # Check if the path itself is excluded
+        if path.name in exclude_dirs:
+            return True
+        return False
+    cleaned = []
+    # Only scan specific directories to avoid resource exhaustion
+    for scan_dir in scan_dirs:
+        scan_path = project_root / scan_dir
+        if not scan_path.exists():
+            continue
+        for pattern in cache_patterns:
+            if "*" in pattern:
+                # Handle glob patterns for files
+                try:
+                    for cache_file in scan_path.rglob(pattern):
+                        if should_exclude(cache_file):
+                            continue
+                        try:
+                            if cache_file.is_file():
+                                cache_file.unlink()
+                                cleaned.append(str(cache_file.relative_to(project_root)))
+                        except OSError:
+                            pass  # Ignore errors (file might be locked or already deleted)
+                except OSError:
+                    pass  # Ignore errors during directory traversal
+            else:
+                # Handle directory patterns
+                try:
+                    for cache_dir in scan_path.rglob(pattern):
+                        if should_exclude(cache_dir):
+                            continue
+                        try:
+                            if cache_dir.is_dir():
+                                shutil.rmtree(cache_dir, ignore_errors=True)
+                                cleaned.append(str(cache_dir.relative_to(project_root)))
+                        except OSError:
+                            pass  # Ignore errors (directory might be locked)
+                except OSError:
+                    pass  # Ignore errors during directory traversal
+    # Also clean root-level caches (like .pytest_cache in project root)
+    for pattern in [".pytest_cache", ".mypy_cache", ".ruff_cache"]:
+        cache_path = project_root / pattern
+        if cache_path.exists() and cache_path.is_dir():
+            try:
+                shutil.rmtree(cache_path, ignore_errors=True)
+                cleaned.append(pattern)
+            except OSError:
+                pass
+    if cleaned:
+        print(f"Cleaned {len(cleaned)} cache items")
+    else:
+        print("No cache files found to clean")
 def run_command(
 def main() -> int:
     """Main entry point."""
     import os
     # Get the project root (where pyproject.toml is)
     script_dir = Path(__file__).parent
     # Change to project root to ensure uv works correctly
     os.chdir(project_root)
+    # Clean caches before running tests
+    print("Cleaning pytest and Python caches...")
+    clean_caches(project_root)
     # Check if uv is available
     if run_command(["uv", "--version"], check=False) != 0:
         print("Error: uv not found. Please install uv: https://github.com/astral-sh/uv")
     # Sync dependencies - always include dev
     # Note: embeddings dependencies are now in main dependencies, not optional
+    # Use --extra dev for [project.optional-dependencies].dev (not --dev which is for [dependency-groups])
+    sync_cmd = ["uv", "sync", "--extra", "dev"]
     print(f"Syncing dependencies for {test_type} tests...")
     if run_command(sync_cmd, cwd=project_root) != 0:
             "--tb=short",
             "-p",
             "no:logfire",
+            "--cache-clear",  # Clear pytest cache before running
         ]
     elif test_type == "embeddings":
         pytest_args = [
             "--tb=short",
             "-p",
             "no:logfire",
+            "--cache-clear",  # Clear pytest cache before running
         ]
     else:
         pytest_args = []

README.md CHANGED Viewed

@@ -35,7 +35,7 @@ tags:
 [![GitHub](https://img.shields.io/github/stars/DeepCritical/GradioDemo?style=for-the-badge&logo=github&logoColor=white&label=🐙%20GitHub&labelColor=181717&color=181717)](https://github.com/DeepCritical/GradioDemo)
 [![Documentation](https://img.shields.io/badge/📚%20Docs-0080FF?style=for-the-badge&logo=readthedocs&logoColor=white&labelColor=0080FF&color=0080FF)](docs/index.md)
 [![Demo](https://img.shields.io/badge/🚀%20Demo-FFD21E?style=for-the-badge&logo=huggingface&logoColor=white&labelColor=FFD21E&color=FFD21E)](https://huggingface.co/spaces/DataQuests/DeepCritical)
-[![CodeCov](https://img.shields.io/badge/📊%20Coverage-F01F7A?style=for-the-badge&logo=codecov&logoColor=white&labelColor=F01F7A&color=F01F7A)](https://codecov.io/gh/DeepCritical/GradioDemo)
 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP)
@@ -76,6 +76,7 @@ For this hackathon we're proposing a simple yet powerful Deep Research Agent tha
 - [] Create Deep Critical Drug Reporposing / Discovery Demo
 - [] Create Deep Critical Literal Review
 - [] Create Deep Critical Hypothesis Generator
 ## Completed
@@ -118,5 +119,5 @@ For this hackathon we're proposing a simple yet powerful Deep Research Agent tha
 [![GitHub](https://img.shields.io/github/stars/DeepCritical/GradioDemo?style=for-the-badge&logo=github&logoColor=white&label=🐙%20GitHub&labelColor=181717&color=181717)](https://github.com/DeepCritical/GradioDemo)
 [![Documentation](https://img.shields.io/badge/📚%20Docs-0080FF?style=for-the-badge&logo=readthedocs&logoColor=white&labelColor=0080FF&color=0080FF)](docs/index.md)
 [![Demo](https://img.shields.io/badge/🚀%20Demo-FFD21E?style=for-the-badge&logo=huggingface&logoColor=white&labelColor=FFD21E&color=FFD21E)](https://huggingface.co/spaces/DataQuests/DeepCritical)
-[![CodeCov](https://img.shields.io/badge/📊%20Coverage-F01F7A?style=for-the-badge&logo=codecov&logoColor=white&labelColor=F01F7A&color=F01F7A)](https://codecov.io/gh/DeepCritical/GradioDemo)
 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP)

 [![GitHub](https://img.shields.io/github/stars/DeepCritical/GradioDemo?style=for-the-badge&logo=github&logoColor=white&label=🐙%20GitHub&labelColor=181717&color=181717)](https://github.com/DeepCritical/GradioDemo)
 [![Documentation](https://img.shields.io/badge/📚%20Docs-0080FF?style=for-the-badge&logo=readthedocs&logoColor=white&labelColor=0080FF&color=0080FF)](docs/index.md)
 [![Demo](https://img.shields.io/badge/🚀%20Demo-FFD21E?style=for-the-badge&logo=huggingface&logoColor=white&labelColor=FFD21E&color=FFD21E)](https://huggingface.co/spaces/DataQuests/DeepCritical)
+[![codecov](https://codecov.io/gh/DeepCritical/GradioDemo/graph/badge.svg?token=B1f05RCGpz)](https://codecov.io/gh/DeepCritical/GradioDemo)
 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP)
 - [] Create Deep Critical Drug Reporposing / Discovery Demo
 - [] Create Deep Critical Literal Review
 - [] Create Deep Critical Hypothesis Generator
+- [] Create PyPi Package
 ## Completed
 [![GitHub](https://img.shields.io/github/stars/DeepCritical/GradioDemo?style=for-the-badge&logo=github&logoColor=white&label=🐙%20GitHub&labelColor=181717&color=181717)](https://github.com/DeepCritical/GradioDemo)
 [![Documentation](https://img.shields.io/badge/📚%20Docs-0080FF?style=for-the-badge&logo=readthedocs&logoColor=white&labelColor=0080FF&color=0080FF)](docs/index.md)
 [![Demo](https://img.shields.io/badge/🚀%20Demo-FFD21E?style=for-the-badge&logo=huggingface&logoColor=white&labelColor=FFD21E&color=FFD21E)](https://huggingface.co/spaces/DataQuests/DeepCritical)
+[![codecov](https://codecov.io/gh/DeepCritical/GradioDemo/graph/badge.svg?token=B1f05RCGpz)](https://codecov.io/gh/DeepCritical/GradioDemo)
 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP)

docs/api/agents.md CHANGED Viewed

	@@ -258,3 +258,6 @@ def create_input_parser_agent(model: Any \| None = None) -> InputParserAgent
258
259
260


258
259
260
261	+
262	+
263	+

docs/api/models.md CHANGED Viewed

	@@ -236,3 +236,6 @@ class BudgetStatus(BaseModel):
236
237
238


236
237
238
239	+
240	+
241	+

docs/api/orchestrators.md CHANGED Viewed

	@@ -183,3 +183,6 @@ Runs Magentic orchestration.
183
184
185


183
184
185
186	+
187	+
188	+

docs/api/services.md CHANGED Viewed

	@@ -189,3 +189,6 @@ Analyzes a hypothesis using statistical methods.
189
190
191


189
190
191
192	+
193	+
194	+

docs/api/tools.md CHANGED Viewed

	@@ -223,3 +223,6 @@ Searches multiple tools in parallel.
223
224
225


223
224
225
226	+
227	+
228	+

docs/architecture/agents.md CHANGED Viewed

	@@ -180,3 +180,6 @@ Factory functions:
180
181
182


180
181
182
183	+
184	+
185	+

docs/architecture/middleware.md CHANGED Viewed

	@@ -130,3 +130,6 @@ All middleware components use `ContextVar` for thread-safe isolation:
130
131
132


130
131
132
133	+
134	+
135	+

docs/architecture/services.md CHANGED Viewed

	@@ -130,3 +130,6 @@ if settings.has_openai_key:
130
131
132


130
131
132
133	+
134	+
135	+

docs/architecture/tools.md CHANGED Viewed

	@@ -163,3 +163,6 @@ search_handler = SearchHandler(
163
164
165


163
164
165
166	+
167	+
168	+

docs/contributing/code-quality.md CHANGED Viewed

	@@ -69,3 +69,6 @@ async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
69
70
71


69
70
71
72	+
73	+
74	+

docs/contributing/code-style.md CHANGED Viewed

	@@ -49,3 +49,6 @@ result = await loop.run_in_executor(None, cpu_bound_function, args)
49
50
51


49
50
51
52	+
53	+
54	+

docs/contributing/error-handling.md CHANGED Viewed

	@@ -57,3 +57,6 @@ except httpx.HTTPError as e:
57
58
59


57
58
59
60	+
61	+
62	+

docs/contributing/implementation-patterns.md CHANGED Viewed

	@@ -72,3 +72,6 @@ def get_embedding_service() -> EmbeddingService:
72
73
74


72
73
74
75	+
76	+
77	+

docs/contributing/index.md CHANGED Viewed

	@@ -151,3 +151,6 @@ Thank you for contributing to DeepCritical!
151
152
153


151
152
153
154	+
155	+
156	+

docs/contributing/prompt-engineering.md CHANGED Viewed

	@@ -57,3 +57,6 @@ This document outlines prompt engineering guidelines and citation validation rul
57
58
59


57
58
59
60	+
61	+
62	+

docs/contributing/testing.md CHANGED Viewed

	@@ -53,3 +53,6 @@ async def test_real_pubmed_search():
53
54
55


53
54
55
56	+
57	+
58	+

docs/getting-started/examples.md CHANGED Viewed

	@@ -197,3 +197,6 @@ USE_GRAPH_EXECUTION=true
197
198
199


197
198
199
200	+
201	+
202	+

docs/getting-started/installation.md CHANGED Viewed

	@@ -136,3 +136,6 @@ uv run pre-commit install
136
137
138


136
137
138
139	+
140	+
141	+

docs/getting-started/mcp-integration.md CHANGED Viewed

	@@ -203,3 +203,6 @@ You can configure multiple DeepCritical instances:
203
204
205


203
204
205
206	+
207	+
208	+

docs/getting-started/quick-start.md CHANGED Viewed

	@@ -107,3 +107,6 @@ What are the active clinical trials investigating Alzheimer's disease treatments
107
108
109


107
108
109
110	+
111	+
112	+

docs/license.md CHANGED Viewed

	@@ -27,3 +27,6 @@ SOFTWARE.
27
28
29


27
28
29
30	+
31	+
32	+

docs/overview/architecture.md CHANGED Viewed

	@@ -184,3 +184,6 @@ The system supports complex research workflows through:
184
185
186


184
185
186
187	+
188	+
189	+

docs/overview/features.md CHANGED Viewed

	@@ -136,3 +136,6 @@ DeepCritical provides a comprehensive set of features for AI-assisted research:
136
137
138


136
137
138
139	+
140	+
141	+

docs/team.md CHANGED Viewed

	@@ -32,3 +32,6 @@ We welcome contributions! See the [Contributing Guide](contributing/index.md) fo
32
33
34


32
33
34
35	+
36	+
37	+

mkdocs.yml CHANGED Viewed

@@ -62,8 +62,8 @@ markdown_extensions:
   - pymdownx.tasklist:
       custom_checkbox: true
   - pymdownx.emoji:
-      emoji_index: !!python/name:material.extensions.emoji.twemoji
-      emoji_generator: !!python/name:material.extensions.emoji.to_svg
   - admonition
   - pymdownx.details
   - pymdownx.superfences

   - pymdownx.tasklist:
       custom_checkbox: true
   - pymdownx.emoji:
+      emoji_generator: !!python/name:pymdownx.emoji.to_svg
+      emoji_index: !!python/name:pymdownx.emoji.twemoji
   - admonition
   - pymdownx.details
   - pymdownx.superfences

pyproject.toml CHANGED Viewed

@@ -5,21 +5,16 @@ description = "AI-Native Drug Repurposing Research Agent"
 readme = "README.md"
 requires-python = ">=3.11"
 dependencies = [
-    # Core
     "pydantic>=2.7",
     "pydantic-settings>=2.2",
     "pydantic-ai>=0.0.16",
-    # AI Providers
     "openai>=1.0.0",
     "anthropic>=0.18.0",
-    # HTTP & Parsing
-    "httpx>=0.27",
-    "beautifulsoup4>=4.12",
-    "xmltodict>=0.13",
-    "huggingface-hub>=0.20.0",
-    # UI
-    "gradio[mcp,oauth]>=6.0.0",
-    # Utils
     "python-dotenv>=1.0", # .env loading
     "tenacity>=8.2", # Retry logic
     "structlog>=24.1", # Structured logging
@@ -40,28 +35,30 @@ dependencies = [
     "modal>=0.63.0",
     "llama-index-llms-openai>=0.6.9",
     "llama-index-embeddings-openai>=0.5.1",
 ]
 [project.optional-dependencies]
 dev = [
-    # Testing
-    "pytest>=8.0",
-    "pytest-asyncio>=0.23",
-    "pytest-sugar>=1.0",
-    "pytest-cov>=5.0",
-    "pytest-mock>=3.12",
-    "respx>=0.21",
-    "typer>=0.9.0",
-    # Quality
-    "ruff>=0.4.0",
-    "mypy>=1.10",
     "pre-commit>=3.7",
-    # Documentation
     "mkdocs>=1.5.0",
-    "mkdocs-material>=9.0.0",
-    "mkdocs-mermaid2-plugin>=1.1.0",
-    "mkdocs-minify-plugin>=0.7.0",
 ]
 [build-system]
@@ -106,6 +103,9 @@ ignore = [
     "RUF100",   # Unused noqa (version differences between local/CI)
 ]
 [tool.ruff.lint.isort]
 known-first-party = ["src"]
@@ -123,6 +123,7 @@ exclude = [
     "^reference_repos/",
     "^examples/",
     "^folder/",
 ]
 # ============== PYTEST CONFIG ==============
@@ -136,6 +137,25 @@ addopts = [
     "-p",
     "no:logfire",
 ]
 markers = [
     "unit: Unit tests (mocked)",
     "integration: Integration tests (real APIs)",
@@ -149,7 +169,10 @@ markers = [
 # ============== COVERAGE CONFIG ==============
 [tool.coverage.run]
 source = ["src"]
-omit = ["*/__init__.py"]
 [tool.coverage.report]
 exclude_lines = [
@@ -157,17 +180,3 @@ exclude_lines = [
     "if TYPE_CHECKING:",
     "raise NotImplementedError",
 ]
-[dependency-groups]
-dev = [
-    "mkdocs-codeinclude-plugin>=0.2.1",
-    "mkdocs-macros-plugin>=1.5.0",
-    "pytest>=9.0.1",
-    "pytest-asyncio>=1.3.0",
-    "pytest-cov>=7.0.0",
-    "pytest-mock>=3.15.1",
-    "pytest-sugar>=1.1.1",
-    "respx>=0.22.0",
-    "structlog>=25.5.0",
-    "ty>=0.0.1a28",
-]

 readme = "README.md"
 requires-python = ">=3.11"
 dependencies = [
     "pydantic>=2.7",
     "pydantic-settings>=2.2",
     "pydantic-ai>=0.0.16",
     "openai>=1.0.0",
     "anthropic>=0.18.0",
+    "httpx>=0.27",
+    "beautifulsoup4>=4.12",
+    "xmltodict>=0.13",
+    "huggingface-hub>=0.20.0",
+    "gradio[mcp,oauth]>=6.0.0",
     "python-dotenv>=1.0", # .env loading
     "tenacity>=8.2", # Retry logic
     "structlog>=24.1", # Structured logging
     "modal>=0.63.0",
     "llama-index-llms-openai>=0.6.9",
     "llama-index-embeddings-openai>=0.5.1",
+    "pydantic-ai-slim[huggingface]>=0.0.18",
+    "pytest>=9.0.1",
+    "pytest-cov>=7.0.0",
 ]
 [project.optional-dependencies]
 dev = [
+    "pytest>=9.0.1",
+    "pytest-asyncio>=1.3.0",
+    "pytest-sugar>=1.1.1",
+    "pytest-cov>=7.0.0",
+    "pytest-mock>=3.15.1",
+    "respx>=0.22.0",
+    "typer>=0.9.0",
+    "ruff>=0.14.6",
+    "mypy>=1.18.2",
     "pre-commit>=3.7",
     "mkdocs>=1.5.0",
+    "mkdocs-material>=9.7.0",
+    "mkdocs-mermaid2-plugin>=1.2.3",
+    "mkdocs-minify-plugin>=0.8.0",
+    "mkdocs-codeinclude-plugin>=0.2.1",
+    "mkdocs-macros-plugin>=1.5.0",
+    "pymdown-extensions>=10.17.2",
 ]
 [build-system]
     "RUF100",   # Unused noqa (version differences between local/CI)
 ]
+[tool.ruff.lint.per-file-ignores]
+"src/app.py" = ["PLR0915"]  # Too many statements (Gradio UI setup is complex)
 [tool.ruff.lint.isort]
 known-first-party = ["src"]
     "^reference_repos/",
     "^examples/",
     "^folder/",
+    "^src/app\\.py$",  # Gradio UI setup - ignore mypy checks
 ]
 # ============== PYTEST CONFIG ==============
     "-p",
     "no:logfire",
 ]
+# Suppress known warnings that don't indicate test failures
+# These are from third-party libraries and don't affect test correctness
+filterwarnings = [
+    # Pydantic deprecation warnings from unittest.mock introspection
+    # These occur when mock tries to introspect Pydantic models
+    "ignore::pydantic.warnings.PydanticDeprecatedSince20",
+    "ignore::pydantic.warnings.PydanticDeprecatedSince211",
+    # Gradio UI warnings (not relevant for unit tests)
+    "ignore::UserWarning:gradio.components.dropdown",
+    "ignore::UserWarning:gradio.oauth",
+    # Pattern-based filters for Pydantic deprecation messages (catch-all)
+    "ignore:The `__fields__` attribute is deprecated.*",
+    "ignore:The `__fields_set__` attribute is deprecated.*",
+    "ignore:Accessing the 'model_computed_fields' attribute.*",
+    "ignore:Accessing the 'model_fields' attribute.*",
+    # Also catch warnings from unittest.mock module
+    "ignore::DeprecationWarning:unittest.mock",
+]
+# Note: pytest only runs test files, so source files don't need exclusion
 markers = [
     "unit: Unit tests (mocked)",
     "integration: Integration tests (real APIs)",
 # ============== COVERAGE CONFIG ==============
 [tool.coverage.run]
 source = ["src"]
+omit = [
+    "*/__init__.py",
+    "src/app.py",  # Exclude Gradio UI from coverage
+]
 [tool.coverage.report]
 exclude_lines = [
     "if TYPE_CHECKING:",
     "raise NotImplementedError",
 ]

requirements.txt CHANGED Viewed

@@ -26,7 +26,7 @@ beautifulsoup4>=4.12
 xmltodict>=0.13
 # UI (Gradio with MCP server support)
-# gradio[mcp]>=6.0.0
 # Utils
 python-dotenv>=1.0

 xmltodict>=0.13
 # UI (Gradio with MCP server support)
+gradio[mcp]>=6.0.0
 # Utils
 python-dotenv>=1.0

src/agent_factory/judges.py CHANGED Viewed

@@ -2,19 +2,34 @@
 import asyncio
 import json
-from typing import Any, ClassVar
 import structlog
 from huggingface_hub import InferenceClient
 from pydantic_ai import Agent
 from pydantic_ai.models.anthropic import AnthropicModel
-from pydantic_ai.models.huggingface import HuggingFaceModel
-from pydantic_ai.models.openai import OpenAIChatModel as OpenAIModel
-from pydantic_ai.providers.anthropic import AnthropicProvider
-from pydantic_ai.providers.huggingface import HuggingFaceProvider
-from pydantic_ai.providers.openai import OpenAIProvider
 from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
 from src.prompts.judge import (
     SYSTEM_PROMPT,
     format_empty_evidence_prompt,
@@ -35,26 +50,43 @@ def get_model() -> Any:
     llm_provider = settings.llm_provider
     if llm_provider == "anthropic":
-        provider = AnthropicProvider(api_key=settings.anthropic_api_key)
-        return AnthropicModel(settings.anthropic_model, provider=provider)
     if llm_provider == "huggingface":
-        # Free tier - uses HF_TOKEN from environment if available
-        model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
-        hf_provider = HuggingFaceProvider(api_key=settings.hf_token)
-        return HuggingFaceModel(model_name, provider=hf_provider)
     if llm_provider == "openai":
-        openai_provider = OpenAIProvider(api_key=settings.openai_api_key)
-        return OpenAIModel(settings.openai_model, provider=openai_provider)
     # Default to HuggingFace if provider is unknown or not specified
     if llm_provider != "huggingface":
         logger.warning("Unknown LLM provider, defaulting to HuggingFace", provider=llm_provider)
-    model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
-    hf_provider = HuggingFaceProvider(api_key=settings.hf_token)
-    return HuggingFaceModel(model_name, provider=hf_provider)
 class JudgeHandler:
@@ -72,9 +104,9 @@ class JudgeHandler:
             model: Optional PydanticAI model. If None, uses config default.
         """
         self.model = model or get_model()
-        self.agent = Agent(
             model=self.model,
-            output_type=JudgeAssessment,
             system_prompt=SYSTEM_PROMPT,
             retries=3,
         )
@@ -112,7 +144,7 @@ class JudgeHandler:
         try:
             # Run the agent with structured output
             result = await self.agent.run(user_prompt)
-            assessment = result.output
             logger.info(
                 "Assessment complete",
@@ -121,7 +153,7 @@ class JudgeHandler:
                 confidence=assessment.confidence,
             )
-            return assessment
         except Exception as e:
             logger.error("Assessment failed", error=str(e))
@@ -167,25 +199,58 @@ class JudgeHandler:
 class HFInferenceJudgeHandler:
     """
     JudgeHandler using HuggingFace Inference API for FREE LLM calls.
-    Defaults to Llama-3.1-8B-Instruct (requires HF_TOKEN) or falls back to public models.
     """
-    FALLBACK_MODELS: ClassVar[list[str]] = [
-        "meta-llama/Llama-3.1-8B-Instruct",  # Primary (Gated)
-        "mistralai/Mistral-7B-Instruct-v0.3",  # Secondary
-        "HuggingFaceH4/zephyr-7b-beta",  # Fallback (Ungated)
-    ]
-    def __init__(self, model_id: str | None = None) -> None:
         """
         Initialize with HF Inference client.
         Args:
             model_id: Optional specific model ID. If None, uses FALLBACK_MODELS chain.
         """
         self.model_id = model_id
-        # Will automatically use HF_TOKEN from env if available
-        self.client = InferenceClient()
         self.call_count = 0
         self.last_question: str | None = None
         self.last_evidence: list[Evidence] | None = None
@@ -209,7 +274,7 @@ class HFInferenceJudgeHandler:
         else:
             user_prompt = format_empty_evidence_prompt(question)
-        models_to_try: list[str] = [self.model_id] if self.model_id else self.FALLBACK_MODELS
         last_error: Exception | None = None
         for model in models_to_try:
@@ -261,14 +326,35 @@ IMPORTANT: Respond with ONLY valid JSON matching this schema:
         ]
         # Use chat_completion (conversational task - supported by all models)
         response = await loop.run_in_executor(
             None,
-            lambda: self.client.chat_completion(
-                messages=messages,
-                model=model,
-                max_tokens=1024,
-                temperature=0.1,
-            ),
         )
         # Extract content from response

 import asyncio
 import json
+import os
+from typing import Any
 import structlog
 from huggingface_hub import InferenceClient
 from pydantic_ai import Agent
 from pydantic_ai.models.anthropic import AnthropicModel
+from pydantic_ai.models.openai import OpenAIModel  # type: ignore[attr-defined]
 from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
+# Try to import HuggingFace support (may not be available in all pydantic-ai versions)
+# According to https://ai.pydantic.dev/models/huggingface/, HuggingFace support requires
+# pydantic-ai with huggingface extra or pydantic-ai-slim[huggingface]
+# There are two ways to use HuggingFace:
+# 1. Inference API: HuggingFaceModel with HuggingFaceProvider (uses AsyncInferenceClient internally)
+# 2. Local models: Would use transformers directly (not via pydantic-ai)
+try:
+    from huggingface_hub import AsyncInferenceClient
+    from pydantic_ai.models.huggingface import HuggingFaceModel
+    from pydantic_ai.providers.huggingface import HuggingFaceProvider
+    _HUGGINGFACE_AVAILABLE = True
+except ImportError:
+    HuggingFaceModel = None  # type: ignore[assignment, misc]
+    HuggingFaceProvider = None  # type: ignore[assignment, misc]
+    AsyncInferenceClient = None  # type: ignore[assignment, misc]
+    _HUGGINGFACE_AVAILABLE = False
 from src.prompts.judge import (
     SYSTEM_PROMPT,
     format_empty_evidence_prompt,
     llm_provider = settings.llm_provider
     if llm_provider == "anthropic":
+        return AnthropicModel(settings.anthropic_model, api_key=settings.anthropic_api_key)  # type: ignore[call-arg]
     if llm_provider == "huggingface":
+        if not _HUGGINGFACE_AVAILABLE:
+            raise ImportError(
+                "HuggingFace models are not available in this version of pydantic-ai. "
+                "Please install with: uv add 'pydantic-ai[huggingface]' or use 'openai'/'anthropic' as the LLM provider."
+            )
+        # Inference API - uses HuggingFace Inference API via AsyncInferenceClient
+        # Per https://ai.pydantic.dev/models/huggingface/#configure-the-provider
+        model_name = settings.huggingface_model or "Qwen/Qwen3-Next-80B-A3B-Thinking"
+        # Create AsyncInferenceClient for inference API
+        hf_client = AsyncInferenceClient(api_key=settings.hf_token)  # type: ignore[misc]
+        # Pass client to HuggingFaceProvider for inference API usage
+        provider = HuggingFaceProvider(hf_client=hf_client)  # type: ignore[misc]
+        return HuggingFaceModel(model_name, provider=provider)  # type: ignore[misc]
     if llm_provider == "openai":
+        return OpenAIModel(settings.openai_model, api_key=settings.openai_api_key)  # type: ignore[call-overload]
     # Default to HuggingFace if provider is unknown or not specified
     if llm_provider != "huggingface":
         logger.warning("Unknown LLM provider, defaulting to HuggingFace", provider=llm_provider)
+    if not _HUGGINGFACE_AVAILABLE:
+        raise ImportError(
+            "HuggingFace models are not available in this version of pydantic-ai. "
+            "Please install with: uv add 'pydantic-ai[huggingface]' or set LLM_PROVIDER to 'openai'/'anthropic'."
+        )
+    # Inference API - uses HuggingFace Inference API via AsyncInferenceClient
+    # Per https://ai.pydantic.dev/models/huggingface/#configure-the-provider
+    model_name = settings.huggingface_model or "Qwen/Qwen3-Next-80B-A3B-Thinking"
+    # Create AsyncInferenceClient for inference API
+    hf_client = AsyncInferenceClient(api_key=settings.hf_token)  # type: ignore[misc]
+    # Pass client to HuggingFaceProvider for inference API usage
+    provider = HuggingFaceProvider(hf_client=hf_client)  # type: ignore[misc]
+    return HuggingFaceModel(model_name, provider=provider)  # type: ignore[misc]
 class JudgeHandler:
             model: Optional PydanticAI model. If None, uses config default.
         """
         self.model = model or get_model()
+        self.agent = Agent(  # type: ignore[call-overload]
             model=self.model,
+            result_type=JudgeAssessment,
             system_prompt=SYSTEM_PROMPT,
             retries=3,
         )
         try:
             # Run the agent with structured output
             result = await self.agent.run(user_prompt)
+            assessment = result.output  # type: ignore[attr-defined]
             logger.info(
                 "Assessment complete",
                 confidence=assessment.confidence,
             )
+            return assessment  # type: ignore[no-any-return]
         except Exception as e:
             logger.error("Assessment failed", error=str(e))
 class HFInferenceJudgeHandler:
     """
     JudgeHandler using HuggingFace Inference API for FREE LLM calls.
+    Models are loaded from environment variable HF_FALLBACK_MODELS (comma-separated)
+    or use defaults based on currently available inference providers:
+    - meta-llama/Llama-3.1-8B-Instruct (gated, multiple providers)
+    - HuggingFaceH4/zephyr-7b-beta (ungated, featherless-ai)
+    - Qwen/Qwen2-7B-Instruct (ungated, featherless-ai)
+    - google/gemma-2-2b-it (gated, nebius)
     """
+    @classmethod
+    def _get_fallback_models(cls) -> list[str]:
+        """Get fallback models from env var or use defaults."""
+        from src.utils.config import settings
+        # Get from env var or settings
+        models_str = os.getenv("HF_FALLBACK_MODELS") or settings.huggingface_fallback_models
+        # Parse comma-separated list
+        models = [m.strip() for m in models_str.split(",") if m.strip()]
+        # Default fallback if empty
+        if not models:
+            models = [
+                "meta-llama/Llama-3.1-8B-Instruct",  # Primary (Gated, multiple providers)
+                "HuggingFaceH4/zephyr-7b-beta",  # Fallback (Ungated, featherless-ai)
+                "Qwen/Qwen2-7B-Instruct",  # Fallback (Ungated, featherless-ai)
+                "google/gemma-2-2b-it",  # Fallback (Gated, nebius)
+            ]
+        return models
+    def __init__(
+        self,
+        model_id: str | None = None,
+        api_key: str | None = None,
+        provider: str | None = None,
+    ) -> None:
         """
         Initialize with HF Inference client.
         Args:
             model_id: Optional specific model ID. If None, uses FALLBACK_MODELS chain.
+            api_key: Optional HuggingFace API key (OAuth token or HF_TOKEN).
+                     If provided, will use authenticated access for gated models.
+            provider: Optional inference provider name (e.g., "novita", "nebius").
+                     If provided, will use that specific provider.
         """
         self.model_id = model_id
+        self.api_key = api_key
+        self.provider = provider
+        # Use provided API key, or fall back to env var, or use no auth
+        self.client = InferenceClient(token=api_key) if api_key else InferenceClient()
         self.call_count = 0
         self.last_question: str | None = None
         self.last_evidence: list[Evidence] | None = None
         else:
             user_prompt = format_empty_evidence_prompt(question)
+        models_to_try: list[str] = [self.model_id] if self.model_id else self._get_fallback_models()
         last_error: Exception | None = None
         for model in models_to_try:
         ]
         # Use chat_completion (conversational task - supported by all models)
+        # HuggingFace Inference Providers format: "model-id:provider" or use provider parameter
+        # According to docs: https://huggingface.co/docs/inference-providers
+        model_to_use = model
+        provider_param = None
+        if self.provider:
+            # Format: model-id:provider for explicit provider selection
+            model_to_use = f"{model}:{self.provider}"
+            # Alternative: pass provider as separate parameter (if client supports it)
+            provider_param = self.provider
+        # Build chat_completion call
+        call_kwargs = {
+            "messages": messages,
+            "model": model_to_use,
+            "max_tokens": 1024,
+            "temperature": 0.1,
+        }
+        # Add provider parameter if client supports it (some clients use this instead of model:provider)
+        if provider_param and hasattr(self.client.chat_completion, "__code__"):
+            # Check if provider parameter is supported
+            try:
+                call_kwargs["provider"] = provider_param
+            except TypeError:
+                # Provider not supported as parameter, use model:provider format
+                pass
         response = await loop.run_in_executor(
             None,
+            lambda: self.client.chat_completion(**call_kwargs),  # type: ignore[call-overload]
         )
         # Extract content from response

src/agents/hypothesis_agent.py CHANGED Viewed

@@ -40,9 +40,9 @@ class HypothesisAgent(BaseAgent):  # type: ignore[misc]
     def _get_agent(self) -> Agent[None, HypothesisAssessment]:
         """Lazy initialization of LLM agent to avoid requiring API keys at import."""
         if self._agent is None:
-            self._agent = Agent(
                 model=get_model(),  # Uses configured LLM (OpenAI/Anthropic)
-                output_type=HypothesisAssessment,
                 system_prompt=SYSTEM_PROMPT,
             )
         return self._agent

     def _get_agent(self) -> Agent[None, HypothesisAssessment]:
         """Lazy initialization of LLM agent to avoid requiring API keys at import."""
         if self._agent is None:
+            self._agent = Agent(  # type: ignore[call-overload]
                 model=get_model(),  # Uses configured LLM (OpenAI/Anthropic)
+                result_type=HypothesisAssessment,
                 system_prompt=SYSTEM_PROMPT,
             )
         return self._agent

src/agents/input_parser.py CHANGED Viewed

@@ -64,9 +64,9 @@ class InputParserAgent:
         self.logger = logger
         # Initialize Pydantic AI Agent
-        self.agent = Agent(
             model=self.model,
-            output_type=ParsedQuery,
             system_prompt=SYSTEM_PROMPT,
             retries=3,
         )
@@ -117,7 +117,7 @@ class InputParserAgent:
                 questions=len(parsed_query.research_questions),
             )
-            return parsed_query
         except Exception as e:
             self.logger.error("Query parsing failed", error=str(e), query=query[:100])

         self.logger = logger
         # Initialize Pydantic AI Agent
+        self.agent = Agent(  # type: ignore[call-overload]
             model=self.model,
+            result_type=ParsedQuery,
             system_prompt=SYSTEM_PROMPT,
             retries=3,
         )
                 questions=len(parsed_query.research_questions),
             )
+            return parsed_query  # type: ignore[no-any-return]
         except Exception as e:
             self.logger.error("Query parsing failed", error=str(e), query=query[:100])

src/agents/judge_agent_llm.py CHANGED Viewed

@@ -16,9 +16,9 @@ class LLMSubIterationJudge:
     def __init__(self) -> None:
         self.model = get_model()
-        self.agent = Agent(
             model=self.model,
-            output_type=JudgeAssessment,
             system_prompt="""You are a strict judge evaluating a research task.
 Evaluate if the result is sufficient to answer the task.
@@ -42,4 +42,4 @@ Evaluate validity and sufficiency."""
         run_result = await self.agent.run(prompt)
         logger.info("LLM judge assessment complete", sufficient=run_result.output.sufficient)
-        return run_result.output

     def __init__(self) -> None:
         self.model = get_model()
+        self.agent = Agent(  # type: ignore[call-overload]
             model=self.model,
+            result_type=JudgeAssessment,
             system_prompt="""You are a strict judge evaluating a research task.
 Evaluate if the result is sufficient to answer the task.
         run_result = await self.agent.run(prompt)
         logger.info("LLM judge assessment complete", sufficient=run_result.output.sufficient)
+        return run_result.output  # type: ignore[no-any-return]

src/agents/knowledge_gap.py CHANGED Viewed

@@ -56,9 +56,9 @@ class KnowledgeGapAgent:
         self.logger = logger
         # Initialize Pydantic AI Agent
-        self.agent = Agent(
             model=self.model,
-            output_type=KnowledgeGapOutput,
             system_prompt=SYSTEM_PROMPT,
             retries=3,
         )
@@ -121,7 +121,7 @@ HISTORY OF ACTIONS, FINDINGS AND THOUGHTS:
                 gaps_count=len(evaluation.outstanding_gaps),
             )
-            return evaluation
         except Exception as e:
             self.logger.error("Knowledge gap evaluation failed", error=str(e))

         self.logger = logger
         # Initialize Pydantic AI Agent
+        self.agent = Agent(  # type: ignore[call-overload]
             model=self.model,
+            result_type=KnowledgeGapOutput,
             system_prompt=SYSTEM_PROMPT,
             retries=3,
         )
                 gaps_count=len(evaluation.outstanding_gaps),
             )
+            return evaluation  # type: ignore[no-any-return]
         except Exception as e:
             self.logger.error("Knowledge gap evaluation failed", error=str(e))

src/agents/long_writer.py CHANGED Viewed

@@ -84,9 +84,9 @@ class LongWriterAgent:
         self.logger = logger
         # Initialize Pydantic AI Agent
-        self.agent = Agent(
             model=self.model,
-            output_type=LongWriterOutput,
             system_prompt=SYSTEM_PROMPT,
             retries=3,
         )
@@ -193,7 +193,7 @@ class LongWriterAgent:
                     attempt=attempt + 1,
                 )
-                return output
             except (TimeoutError, ConnectionError) as e:
                 # Transient errors - retry

         self.logger = logger
         # Initialize Pydantic AI Agent
+        self.agent = Agent(  # type: ignore[call-overload]
             model=self.model,
+            result_type=LongWriterOutput,
             system_prompt=SYSTEM_PROMPT,
             retries=3,
         )
                     attempt=attempt + 1,
                 )
+                return output  # type: ignore[no-any-return]
             except (TimeoutError, ConnectionError) as e:
                 # Transient errors - retry

src/agents/report_agent.py CHANGED Viewed

@@ -41,9 +41,9 @@ class ReportAgent(BaseAgent):  # type: ignore[misc]
     def _get_agent(self) -> Agent[None, ResearchReport]:
         """Lazy initialization of LLM agent to avoid requiring API keys at import."""
         if self._agent is None:
-            self._agent = Agent(
                 model=get_model(),
-                output_type=ResearchReport,
                 system_prompt=SYSTEM_PROMPT,
             )
         return self._agent

     def _get_agent(self) -> Agent[None, ResearchReport]:
         """Lazy initialization of LLM agent to avoid requiring API keys at import."""
         if self._agent is None:
+            self._agent = Agent(  # type: ignore[call-overload]
                 model=get_model(),
+                result_type=ResearchReport,
                 system_prompt=SYSTEM_PROMPT,
             )
         return self._agent

src/agents/tool_selector.py CHANGED Viewed

@@ -68,9 +68,9 @@ class ToolSelectorAgent:
         self.logger = logger
         # Initialize Pydantic AI Agent
-        self.agent = Agent(
             model=self.model,
-            output_type=AgentSelectionPlan,
             system_prompt=SYSTEM_PROMPT,
             retries=3,
         )
@@ -125,7 +125,7 @@ HISTORY OF ACTIONS, FINDINGS AND THOUGHTS:
                 agents=[task.agent for task in selection_plan.tasks],
             )
-            return selection_plan
         except Exception as e:
             self.logger.error("Tool selection failed", error=str(e))

         self.logger = logger
         # Initialize Pydantic AI Agent
+        self.agent = Agent(  # type: ignore[call-overload]
             model=self.model,
+            result_type=AgentSelectionPlan,
             system_prompt=SYSTEM_PROMPT,
             retries=3,
         )
                 agents=[task.agent for task in selection_plan.tasks],
             )
+            return selection_plan  # type: ignore[no-any-return]
         except Exception as e:
             self.logger.error("Tool selection failed", error=str(e))

src/app.py CHANGED Viewed

@@ -5,8 +5,24 @@ from collections.abc import AsyncGenerator
 from typing import Any
 import gradio as gr
-from pydantic_ai.models.huggingface import HuggingFaceModel
-from pydantic_ai.providers.huggingface import HuggingFaceProvider
 from src.agent_factory.judges import HFInferenceJudgeHandler, JudgeHandler, MockJudgeHandler
 from src.orchestrator_factory import create_orchestrator
@@ -15,6 +31,7 @@ from src.tools.europepmc import EuropePMCTool
 from src.tools.pubmed import PubMedTool
 from src.tools.search_handler import SearchHandler
 from src.utils.config import settings
 from src.utils.models import AgentEvent, OrchestratorConfig
@@ -22,6 +39,8 @@ def configure_orchestrator(
     use_mock: bool = False,
     mode: str = "simple",
     oauth_token: str | None = None,
 ) -> tuple[Any, str]:
     """
     Create an orchestrator instance.
@@ -30,6 +49,8 @@ def configure_orchestrator(
         use_mock: If True, use MockJudgeHandler (no API key needed)
         mode: Orchestrator mode ("simple" or "advanced")
         oauth_token: Optional OAuth token from HuggingFace login
     Returns:
         Tuple of (Orchestrator instance, backend_name)
@@ -59,11 +80,27 @@ def configure_orchestrator(
     # Priority: oauth_token > env vars
     effective_api_key = oauth_token
     if effective_api_key or (os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")):
-        model: HuggingFaceModel | None = None
         if effective_api_key:
-            model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
-            hf_provider = HuggingFaceProvider(api_key=effective_api_key)
-            model = HuggingFaceModel(model_name, provider=hf_provider)
             backend_info = "API (HuggingFace OAuth)"
         else:
             backend_info = "API (Env Config)"
@@ -72,8 +109,19 @@ def configure_orchestrator(
     # 3. Free Tier (HuggingFace Inference)
     else:
-        judge_handler = HFInferenceJudgeHandler()
-        backend_info = "Free Tier (Llama 3.1 / Mistral)"
     orchestrator = create_orchestrator(
         search_handler=search_handler,
@@ -332,6 +380,8 @@ async def research_agent(
     message: str,
     history: list[dict[str, Any]],
     mode: str = "simple",
     request: gr.Request | None = None,
 ) -> AsyncGenerator[gr.ChatMessage | list[gr.ChatMessage], None]:
     """
@@ -341,6 +391,8 @@ async def research_agent(
         message: User's research question
         history: Chat history (Gradio format)
         mode: Orchestrator mode ("simple" or "advanced")
         request: Gradio request object containing OAuth information
     Yields:
@@ -372,10 +424,13 @@ async def research_agent(
     try:
         # use_mock=False - let configure_orchestrator decide based on available keys
         # It will use: OAuth token > Env vars > HF Inference (free tier)
         orchestrator, backend_name = configure_orchestrator(
             use_mock=False,  # Never use mock in production - HF Inference is the free fallback
             mode=effective_mode,
             oauth_token=oauth_token,
         )
         yield gr.ChatMessage(
@@ -407,7 +462,162 @@ def create_demo() -> gr.Blocks:
         with gr.Row():
             gr.LoginButton()
-        # Chat interface
         gr.ChatInterface(
             fn=research_agent,
             title="🧬 DeepCritical",
@@ -417,7 +627,7 @@ def create_demo() -> gr.Blocks:
                 "---\n"
                 "*Research tool only — not for medical advice.*  \n"
                 "**MCP Server Active**: Connect Claude Desktop to `/gradio_api/mcp/`\n\n"
-                "**Sign in with HuggingFace** above to use your account's API token automatically."
             ),
             examples=[
                 ["What drugs could be repurposed for Alzheimer's disease?", "simple"],
@@ -426,14 +636,9 @@ def create_demo() -> gr.Blocks:
             ],
             additional_inputs_accordion=gr.Accordion(label="⚙️ Settings", open=False),
             additional_inputs=[
-                gr.Radio(
-                    choices=["simple", "advanced"],
-                    value="simple",
-                    label="Orchestrator Mode",
-                    info=(
-                        "Simple: Linear (Free Tier Friendly) | Advanced: Multi-Agent (Requires OpenAI - not available without manual config)"
-                    ),
-                ),
             ],
         )

 from typing import Any
 import gradio as gr
+# Try to import HuggingFace support (may not be available in all pydantic-ai versions)
+# According to https://ai.pydantic.dev/models/huggingface/, HuggingFace support requires
+# pydantic-ai with huggingface extra or pydantic-ai-slim[huggingface]
+# There are two ways to use HuggingFace:
+# 1. Inference API: HuggingFaceModel with HuggingFaceProvider (uses AsyncInferenceClient internally)
+# 2. Local models: Would use transformers directly (not via pydantic-ai)
+try:
+    from huggingface_hub import AsyncInferenceClient
+    from pydantic_ai.models.huggingface import HuggingFaceModel
+    from pydantic_ai.providers.huggingface import HuggingFaceProvider
+    _HUGGINGFACE_AVAILABLE = True
+except ImportError:
+    HuggingFaceModel = None  # type: ignore[assignment, misc]
+    HuggingFaceProvider = None  # type: ignore[assignment, misc]
+    AsyncInferenceClient = None  # type: ignore[assignment, misc]
+    _HUGGINGFACE_AVAILABLE = False
 from src.agent_factory.judges import HFInferenceJudgeHandler, JudgeHandler, MockJudgeHandler
 from src.orchestrator_factory import create_orchestrator
 from src.tools.pubmed import PubMedTool
 from src.tools.search_handler import SearchHandler
 from src.utils.config import settings
+from src.utils.inference_models import get_available_models, get_available_providers
 from src.utils.models import AgentEvent, OrchestratorConfig
     use_mock: bool = False,
     mode: str = "simple",
     oauth_token: str | None = None,
+    hf_model: str | None = None,
+    hf_provider: str | None = None,
 ) -> tuple[Any, str]:
     """
     Create an orchestrator instance.
         use_mock: If True, use MockJudgeHandler (no API key needed)
         mode: Orchestrator mode ("simple" or "advanced")
         oauth_token: Optional OAuth token from HuggingFace login
+        hf_model: Selected HuggingFace model ID
+        hf_provider: Selected inference provider
     Returns:
         Tuple of (Orchestrator instance, backend_name)
     # Priority: oauth_token > env vars
     effective_api_key = oauth_token
     if effective_api_key or (os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")):
+        model: Any | None = None
         if effective_api_key:
+            # Use selected model or fall back to env var/settings
+            model_name = (
+                hf_model
+                or os.getenv("HF_MODEL")
+                or settings.huggingface_model
+                or "Qwen/Qwen3-Next-80B-A3B-Thinking"
+            )
+            if not _HUGGINGFACE_AVAILABLE:
+                raise ImportError(
+                    "HuggingFace models are not available in this version of pydantic-ai. "
+                    "Please install with: uv add 'pydantic-ai[huggingface]' or use 'openai'/'anthropic' as the LLM provider."
+                )
+            # Inference API - uses HuggingFace Inference API via AsyncInferenceClient
+            # Per https://ai.pydantic.dev/models/huggingface/#configure-the-provider
+            # Create AsyncInferenceClient for inference API
+            hf_client = AsyncInferenceClient(api_key=effective_api_key)  # type: ignore[misc]
+            # Pass client to HuggingFaceProvider for inference API usage
+            provider = HuggingFaceProvider(hf_client=hf_client)  # type: ignore[misc]
+            model = HuggingFaceModel(model_name, provider=provider)  # type: ignore[misc]
             backend_info = "API (HuggingFace OAuth)"
         else:
             backend_info = "API (Env Config)"
     # 3. Free Tier (HuggingFace Inference)
     else:
+        # Pass OAuth token if available (even if not in env vars)
+        # This allows OAuth login to work with free tier models
+        # Use selected model and provider if provided
+        judge_handler = HFInferenceJudgeHandler(
+            model_id=hf_model,
+            api_key=oauth_token,
+            provider=hf_provider,
+        )
+        model_display = hf_model.split("/")[-1] if hf_model else "Default"
+        provider_display = hf_provider or "auto"
+        backend_info = f"Free Tier ({model_display} via {provider_display})" + (
+            " (OAuth)" if oauth_token else ""
+        )
     orchestrator = create_orchestrator(
         search_handler=search_handler,
     message: str,
     history: list[dict[str, Any]],
     mode: str = "simple",
+    hf_model: str | None = None,
+    hf_provider: str | None = None,
     request: gr.Request | None = None,
 ) -> AsyncGenerator[gr.ChatMessage | list[gr.ChatMessage], None]:
     """
         message: User's research question
         history: Chat history (Gradio format)
         mode: Orchestrator mode ("simple" or "advanced")
+        hf_model: Selected HuggingFace model ID (from dropdown)
+        hf_provider: Selected inference provider (from dropdown)
         request: Gradio request object containing OAuth information
     Yields:
     try:
         # use_mock=False - let configure_orchestrator decide based on available keys
         # It will use: OAuth token > Env vars > HF Inference (free tier)
+        # hf_model and hf_provider come from dropdown, so they're guaranteed to be valid
         orchestrator, backend_name = configure_orchestrator(
             use_mock=False,  # Never use mock in production - HF Inference is the free fallback
             mode=effective_mode,
             oauth_token=oauth_token,
+            hf_model=hf_model,  # Can be None, will use defaults in configure_orchestrator
+            hf_provider=hf_provider,  # Can be None, will use defaults in configure_orchestrator
         )
         yield gr.ChatMessage(
         with gr.Row():
             gr.LoginButton()
+        # Get initial model/provider lists (no auth by default)
+        # Check if user has auth to determine which model list to use
+        has_auth = bool(os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY"))
+        # Get the appropriate model list based on user's actual auth status
+        # CRITICAL: Use the list that matches the user's auth status to avoid mismatches
+        if has_auth:
+            # User has auth - get models available with auth (includes gated models)
+            initial_models = get_available_models(has_auth=True)
+            # Fallback to unauthenticated models if auth list is empty (shouldn't happen, but be safe)
+            if not initial_models:
+                initial_models = get_available_models(has_auth=False)
+        else:
+            # User doesn't have auth - only get unauthenticated models (ungated only)
+            initial_models = get_available_models(has_auth=False)
+        # Extract available model IDs (first element of tuples) - this is what Gradio uses as values
+        available_model_ids = [m[0] for m in initial_models] if initial_models else []
+        # Prefer latest reasoning models if available, otherwise use fallback
+        preferred_models = [
+            "Qwen/Qwen3-Next-80B-A3B-Thinking",
+            "Qwen/Qwen3-Next-80B-A3B-Instruct",
+            "meta-llama/Llama-3.3-70B-Instruct",
+        ]
+        # Find first available preferred model from the actual available models list
+        # CRITICAL: Only use models that are actually in available_model_ids
+        initial_model_id = None
+        for preferred in preferred_models:
+            if preferred in available_model_ids:
+                initial_model_id = preferred
+                break
+        # Fall back to first available model from the actual list
+        # CRITICAL: Always use a model that's guaranteed to be in available_model_ids
+        if not initial_model_id:
+            if available_model_ids:
+                initial_model_id = available_model_ids[0]  # First model ID from available list
+            else:
+                # No models available - this shouldn't happen, but handle gracefully
+                initial_model_id = None
+        # Final safety check: ensure initial_model_id is actually in the available models
+        # This is the last line of defense - if it's not in the list, use the first available
+        if initial_model_id and initial_model_id not in available_model_ids:
+            if available_model_ids:
+                initial_model_id = available_model_ids[0]
+            else:
+                initial_model_id = None
+        # Get providers for the selected model (only if we have a valid model)
+        initial_providers = []
+        initial_provider = None
+        if initial_model_id:
+            initial_providers = get_available_providers(initial_model_id, has_auth=has_auth)
+            # Ensure we have a valid provider value that's in the choices
+            if initial_providers:
+                initial_provider = initial_providers[0][0]  # Use first provider's ID
+                # Safety check: ensure provider is in the list
+                available_provider_ids = [p[0] for p in initial_providers]
+                if initial_provider not in available_provider_ids:
+                    initial_provider = initial_providers[0][0] if initial_providers else None
+        # Create dropdowns for model and provider selection
+        # Note: Components can be in a hidden row and still work with ChatInterface additional_inputs
+        # The visible=False just hides the row itself, but components are still accessible
+        with gr.Row(visible=False):
+            mode_radio = gr.Radio(
+                choices=["simple", "advanced"],
+                value="simple",
+                label="Orchestrator Mode",
+                info="Simple: Linear | Advanced: Multi-Agent (Requires OpenAI)",
+            )
+            # Final validation: ensure value is in choices before creating dropdown
+            # Gradio requires the value to be exactly one of the choice values (first element of tuples)
+            # CRITICAL: Always default to the first available choice to ensure value is always valid
+            # Extract model IDs from choices (first element of each tuple)
+            model_ids_in_choices = [m[0] for m in initial_models] if initial_models else []
+            # Determine the model value - must be in model_ids_in_choices
+            if initial_models and model_ids_in_choices:
+                # First try to use initial_model_id if it's valid
+                if initial_model_id and initial_model_id in model_ids_in_choices:
+                    model_value = initial_model_id
+                else:
+                    # Fallback to first available model - guarantees a valid value
+                    model_value = model_ids_in_choices[0]
+            else:
+                # No models available - set to None (empty dropdown)
+                model_value = None
+            # Absolute final check: if we have choices but model_value is None or invalid, use first choice
+            if initial_models and model_ids_in_choices:
+                if not model_value or model_value not in model_ids_in_choices:
+                    model_value = model_ids_in_choices[0]
+            hf_model_dropdown = gr.Dropdown(
+                choices=initial_models if initial_models else [],
+                value=model_value,  # Always set to a valid value from choices (or None if empty)
+                label="🤖 Reasoning Model",
+                info="Select AI model for evidence assessment. Sign in to access gated models.",
+                interactive=True,
+                allow_custom_value=False,  # Only allow values from choices
+            )
+            # Final validation for provider: ensure value is in choices
+            # CRITICAL: Always default to the first available choice to ensure value is always valid
+            provider_ids_in_choices = [p[0] for p in initial_providers] if initial_providers else []
+            provider_value = None
+            if initial_providers and provider_ids_in_choices:
+                # First try to use the preferred provider if it's available
+                if initial_provider and initial_provider in provider_ids_in_choices:
+                    provider_value = initial_provider
+                else:
+                    # Fallback to first available provider - this ensures we always have a valid value
+                    provider_value = provider_ids_in_choices[0]
+            # Absolute final check: if we have choices but provider_value is None or invalid, use first choice
+            if initial_providers and provider_ids_in_choices:
+                if not provider_value or provider_value not in provider_ids_in_choices:
+                    provider_value = provider_ids_in_choices[0]
+            hf_provider_dropdown = gr.Dropdown(
+                choices=initial_providers if initial_providers else [],
+                value=provider_value,  # Always set to a valid value from choices (or None if empty)
+                label="⚡ Inference Provider",
+                info="Select provider for model execution. Some require authentication.",
+                interactive=True,
+                allow_custom_value=False,  # Only allow values from choices
+            )
+        # Update providers when model changes
+        def update_providers(model_id: str, request: gr.Request | None = None) -> gr.Dropdown:
+            """Update provider list when model changes."""
+            # Check if user is authenticated
+            oauth_token, _ = extract_oauth_info(request)
+            has_auth = bool(
+                oauth_token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")
+            )
+            providers = get_available_providers(model_id, has_auth=has_auth)
+            if providers:
+                # Always set value to first provider to ensure it's valid
+                return gr.Dropdown(choices=providers, value=providers[0][0])
+            # If no providers, return empty dropdown with no value
+            return gr.Dropdown(choices=[], value=None)
+        hf_model_dropdown.change(
+            fn=update_providers,
+            inputs=[hf_model_dropdown],
+            outputs=[hf_provider_dropdown],
+        )
+        # Chat interface with model/provider selection
         gr.ChatInterface(
             fn=research_agent,
             title="🧬 DeepCritical",
                 "---\n"
                 "*Research tool only — not for medical advice.*  \n"
                 "**MCP Server Active**: Connect Claude Desktop to `/gradio_api/mcp/`\n\n"
+                "**Sign in with HuggingFace** above to access premium models and providers."
             ),
             examples=[
                 ["What drugs could be repurposed for Alzheimer's disease?", "simple"],
             ],
             additional_inputs_accordion=gr.Accordion(label="⚙️ Settings", open=False),
             additional_inputs=[
+                mode_radio,
+                hf_model_dropdown,
+                hf_provider_dropdown,
             ],
         )

src/legacy_orchestrator.py CHANGED Viewed

@@ -101,12 +101,26 @@ class Orchestrator:
             return evidence
         try:
-            # Deduplicate using semantic similarity
-            unique_evidence: list[Evidence] = await embeddings.deduplicate(evidence, threshold=0.85)
             logger.info(
                 "Deduplicated evidence",
                 before=len(evidence),
-                after=len(unique_evidence),
             )
             return unique_evidence
         except Exception as e:

             return evidence
         try:
+            # First, deduplicate by URL (exact duplicates) from current evidence batch
+            # This prevents the same URL from appearing multiple times in one batch
+            seen_urls: set[str] = set()
+            unique_by_url: list[Evidence] = []
+            for e in evidence:
+                if e.citation.url not in seen_urls:
+                    unique_by_url.append(e)
+                    seen_urls.add(e.citation.url)
+            # Then, deduplicate using semantic similarity with stricter threshold
+            # threshold=0.95 means only remove near-identical content (distance < 0.05)
+            # This prevents over-filtering while still removing true duplicates
+            unique_evidence: list[Evidence] = await embeddings.deduplicate(
+                unique_by_url, threshold=0.95
+            )
             logger.info(
                 "Deduplicated evidence",
                 before=len(evidence),
+                after_url=len(unique_by_url),
+                after_semantic=len(unique_evidence),
             )
             return unique_evidence
         except Exception as e:

src/orchestrator/planner_agent.py CHANGED Viewed

@@ -80,9 +80,9 @@ class PlannerAgent:
             raise ConfigurationError("crawl_tool must be callable")
         # Initialize Pydantic AI Agent
-        self.agent = Agent(
             model=self.model,
-            output_type=ReportPlan,
             system_prompt=SYSTEM_PROMPT,
             tools=[self.web_search_tool, self.crawl_tool],
             retries=3,
@@ -136,7 +136,7 @@ class PlannerAgent:
                 has_background=bool(report_plan.background_context),
             )
-            return report_plan
         except Exception as e:
             self.logger.error("Planning failed", error=str(e), query=query[:100])

             raise ConfigurationError("crawl_tool must be callable")
         # Initialize Pydantic AI Agent
+        self.agent = Agent(  # type: ignore[call-overload]
             model=self.model,
+            result_type=ReportPlan,
             system_prompt=SYSTEM_PROMPT,
             tools=[self.web_search_tool, self.crawl_tool],
             retries=3,
                 has_background=bool(report_plan.background_context),
             )
+            return report_plan  # type: ignore[no-any-return]
         except Exception as e:
             self.logger.error("Planning failed", error=str(e), query=query[:100])

src/services/llamaindex_rag.py CHANGED Viewed

@@ -202,7 +202,7 @@ class LlamaIndexRAGService:
     def _configure_llm(self, huggingface_llm: Any, openai_llm: Any) -> None:
         """Configure LLM for query synthesis."""
         if huggingface_llm is not None and (settings.hf_token or settings.huggingface_api_key):
-            model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
             token = settings.hf_token or settings.huggingface_api_key
             # Check if it's HuggingFaceInferenceAPI (API-based) or HuggingFaceLLM (local)

     def _configure_llm(self, huggingface_llm: Any, openai_llm: Any) -> None:
         """Configure LLM for query synthesis."""
         if huggingface_llm is not None and (settings.hf_token or settings.huggingface_api_key):
+            model_name = settings.huggingface_model or "Qwen/Qwen3-Next-80B-A3B-Thinking"
             token = settings.hf_token or settings.huggingface_api_key
             # Check if it's HuggingFaceInferenceAPI (API-based) or HuggingFaceLLM (local)

src/services/statistical_analyzer.py CHANGED Viewed

@@ -71,9 +71,9 @@ class StatisticalAnalyzer:
         """Lazy initialization of LLM agent for code generation."""
         if self._agent is None:
             library_versions = get_sandbox_library_prompt()
-            self._agent = Agent(
                 model=get_model(),
-                output_type=str,
                 system_prompt=f"""You are a biomedical data scientist.
 Generate Python code to analyze research evidence and test hypotheses.

         """Lazy initialization of LLM agent for code generation."""
         if self._agent is None:
             library_versions = get_sandbox_library_prompt()
+            self._agent = Agent(  # type: ignore[call-overload]
                 model=get_model(),
+                result_type=str,
                 system_prompt=f"""You are a biomedical data scientist.
 Generate Python code to analyze research evidence and test hypotheses.

src/utils/config.py CHANGED Viewed

@@ -41,8 +41,9 @@ class Settings(BaseSettings):
         description="OpenAI embedding model (used by LlamaIndex RAG)",
     )
     local_embedding_model: str = Field(
-        default="all-MiniLM-L6-v2",
-        description="Local sentence-transformers model (used by EmbeddingService)",
     )
     embedding_provider: Literal["openai", "local", "huggingface"] = Field(
         default="local",
@@ -58,8 +59,15 @@ class Settings(BaseSettings):
         default=None, description="HuggingFace API token (HF_TOKEN or HUGGINGFACE_API_KEY)"
     )
     huggingface_model: str = Field(
-        default="meta-llama/Llama-3.1-8B-Instruct",
-        description="Default HuggingFace model ID for inference",
     )
     # PubMed Configuration

         description="OpenAI embedding model (used by LlamaIndex RAG)",
     )
     local_embedding_model: str = Field(
+        default="BAAI/bge-small-en-v1.5",
+        description="Local sentence-transformers model (used by EmbeddingService). "
+        "BAAI/bge-small-en-v1.5 is newer, faster, and better than all-MiniLM-L6-v2.",
     )
     embedding_provider: Literal["openai", "local", "huggingface"] = Field(
         default="local",
         default=None, description="HuggingFace API token (HF_TOKEN or HUGGINGFACE_API_KEY)"
     )
     huggingface_model: str = Field(
+        default="Qwen/Qwen3-Next-80B-A3B-Thinking",
+        description="Default HuggingFace model ID for inference (gated, requires auth). "
+        "Latest reasoning model with advanced thinking capabilities.",
+    )
+    huggingface_fallback_models: str = Field(
+        default="Qwen/Qwen3-Next-80B-A3B-Thinking,Qwen/Qwen3-Next-80B-A3B-Instruct,meta-llama/Llama-3.3-70B-Instruct,meta-llama/Llama-3.1-8B-Instruct,HuggingFaceH4/zephyr-7b-beta,Qwen/Qwen2-7B-Instruct",
+        description="Comma-separated list of fallback HuggingFace models for inference API. "
+        "Models are tried in order until one succeeds. "
+        "Default: Latest reasoning models (Qwen3-Next, Llama-3.3) followed by reliable fallbacks.",
     )
     # PubMed Configuration

src/utils/huggingface_chat_client.py CHANGED Viewed

@@ -28,14 +28,14 @@ class HuggingFaceChatClient:
     def __init__(
         self,
-        model_name: str = "meta-llama/Llama-3.1-8B-Instruct",
         api_key: str | None = None,
         provider: str = "auto",
     ) -> None:
         """Initialize HuggingFace chat client.
         Args:
-            model_name: HuggingFace model identifier (e.g., "meta-llama/Llama-3.1-8B-Instruct")
             api_key: Optional HF_TOKEN for gated models. If None, uses environment token.
             provider: Provider name or "auto" for automatic selection.
                      Options: "auto", "cerebras", "together", "sambanova", etc.

     def __init__(
         self,
+        model_name: str = "Qwen/Qwen3-Next-80B-A3B-Thinking",
         api_key: str | None = None,
         provider: str = "auto",
     ) -> None:
         """Initialize HuggingFace chat client.
         Args:
+            model_name: HuggingFace model identifier (e.g., "Qwen/Qwen3-Next-80B-A3B-Thinking")
             api_key: Optional HF_TOKEN for gated models. If None, uses environment token.
             provider: Provider name or "auto" for automatic selection.
                      Options: "auto", "cerebras", "together", "sambanova", etc.

src/utils/inference_models.py ADDED Viewed

	@@ -0,0 +1,627 @@

+"""Configuration for HuggingFace Inference Providers models.
+Based on: https://huggingface.co/inference/models
+This module provides model and provider configurations with verification
+capabilities to ensure models are actually available on selected providers.
+"""
+from typing import TypedDict
+class ModelProvider(TypedDict):
+    """Provider information for a model."""
+    name: str
+    input_cost: float | None  # $/1M tokens
+    output_cost: float | None  # $/1M tokens
+    latency: float | None  # seconds
+    throughput: float | None  # tokens/second
+    supports_tools: bool
+    supports_structured: bool
+    requires_auth: bool  # Whether this provider requires authentication
+class InferenceModel(TypedDict):
+    """Model configuration with available providers."""
+    model_id: str
+    display_name: str
+    providers: dict[str, ModelProvider]
+    requires_auth: bool  # Whether the model itself requires authentication (gated)
+    description: str
+# Latest Reasoning Models from https://huggingface.co/inference/models
+# Updated with latest reasoning models (Qwen3-Next, Qwen3-235B, Llama-3.3, etc.)
+INFERENCE_MODELS: dict[str, InferenceModel] = {
+    # Top-tier reasoning models (latest)
+    "Qwen/Qwen3-Next-80B-A3B-Thinking": {
+        "model_id": "Qwen/Qwen3-Next-80B-A3B-Thinking",
+        "display_name": "Qwen3-Next-80B-A3B-Thinking",
+        "requires_auth": True,  # Gated
+        "description": "Qwen's latest reasoning model - Advanced thinking capabilities, 262K context",
+        "providers": {
+            "together": {
+                "name": "together",
+                "input_cost": 0.15,
+                "output_cost": 1.5,
+                "latency": 0.48,
+                "throughput": 202.0,
+                "supports_tools": True,
+                "supports_structured": True,
+                "requires_auth": True,
+            },
+            "together-fastest": {
+                "name": "together-fastest",
+                "input_cost": 0.15,
+                "output_cost": 1.5,
+                "latency": 0.48,
+                "throughput": 202.0,
+                "supports_tools": True,
+                "supports_structured": True,
+                "requires_auth": True,
+            },
+        },
+    },
+    "Qwen/Qwen3-Next-80B-A3B-Instruct": {
+        "model_id": "Qwen/Qwen3-Next-80B-A3B-Instruct",
+        "display_name": "Qwen3-Next-80B-A3B-Instruct",
+        "requires_auth": True,  # Gated
+        "description": "Qwen's latest instruction model - High performance, 262K context",
+        "providers": {
+            "together": {
+                "name": "together",
+                "input_cost": 0.15,
+                "output_cost": 1.5,
+                "latency": 0.60,
+                "throughput": 153.0,
+                "supports_tools": True,
+                "supports_structured": True,
+                "requires_auth": True,
+            },
+            "together-fastest": {
+                "name": "together-fastest",
+                "input_cost": 0.15,
+                "output_cost": 1.5,
+                "latency": 0.60,
+                "throughput": 153.0,
+                "supports_tools": True,
+                "supports_structured": True,
+                "requires_auth": True,
+            },
+        },
+    },
+    "Qwen/Qwen3-235B-A22B-Instruct-2507": {
+        "model_id": "Qwen/Qwen3-235B-A22B-Instruct-2507",
+        "display_name": "Qwen3-235B-A22B-Instruct",
+        "requires_auth": True,  # Gated
+        "description": "Qwen's massive 235B model - Ultra-high performance, 262K context",
+        "providers": {
+            "cerebras": {
+                "name": "cerebras",
+                "input_cost": 0.6,
+                "output_cost": 1.2,
+                "latency": 0.23,
+                "throughput": 509.0,
+                "supports_tools": True,
+                "supports_structured": False,
+                "requires_auth": True,
+            },
+            "cerebras-fastest": {
+                "name": "cerebras-fastest",
+                "input_cost": 0.6,
+                "output_cost": 1.2,
+                "latency": 0.23,
+                "throughput": 509.0,
+                "supports_tools": True,
+                "supports_structured": False,
+                "requires_auth": True,
+            },
+            "together": {
+                "name": "together",
+                "input_cost": 0.2,
+                "output_cost": 0.6,
+                "latency": 0.39,
+                "throughput": 42.0,
+                "supports_tools": True,
+                "supports_structured": True,
+                "requires_auth": True,
+            },
+        },
+    },
+    "Qwen/Qwen3-235B-A22B-Thinking-2507": {
+        "model_id": "Qwen/Qwen3-235B-A22B-Thinking-2507",
+        "display_name": "Qwen3-235B-A22B-Thinking",
+        "requires_auth": True,  # Gated
+        "description": "Qwen's massive 235B reasoning model - Advanced thinking, 262K context",
+        "providers": {
+            "cerebras": {
+                "name": "cerebras",
+                "input_cost": None,
+                "output_cost": None,
+                "latency": None,
+                "throughput": None,
+                "supports_tools": False,
+                "supports_structured": False,
+                "requires_auth": True,
+            },
+        },
+    },
+    "meta-llama/Llama-3.3-70B-Instruct": {
+        "model_id": "meta-llama/Llama-3.3-70B-Instruct",
+        "display_name": "Llama 3.3 70B Instruct",
+        "requires_auth": True,  # Gated
+        "description": "Meta's latest Llama 3.3 - High performance, tools support",
+        "providers": {
+            "cerebras": {
+                "name": "cerebras",
+                "input_cost": 0.85,
+                "output_cost": 1.2,
+                "latency": 0.35,
+                "throughput": 948.0,
+                "supports_tools": True,
+                "supports_structured": False,
+                "requires_auth": True,
+            },
+            "cerebras-fastest": {
+                "name": "cerebras-fastest",
+                "input_cost": 0.85,
+                "output_cost": 1.2,
+                "latency": 0.35,
+                "throughput": 948.0,
+                "supports_tools": True,
+                "supports_structured": False,
+                "requires_auth": True,
+            },
+        },
+    },
+    "openai/gpt-oss-120b": {
+        "model_id": "openai/gpt-oss-120b",
+        "display_name": "GPT-OSS-120B",
+        "requires_auth": True,  # Gated
+        "description": "OpenAI's open-source 120B model - Ultra-fast inference",
+        "providers": {
+            "cerebras": {
+                "name": "cerebras",
+                "input_cost": 0.25,
+                "output_cost": 0.69,
+                "latency": 0.23,
+                "throughput": 1051.0,
+                "supports_tools": True,
+                "supports_structured": False,
+                "requires_auth": True,
+            },
+            "cerebras-fastest": {
+                "name": "cerebras-fastest",
+                "input_cost": 0.25,
+                "output_cost": 0.69,
+                "latency": 0.23,
+                "throughput": 1051.0,
+                "supports_tools": True,
+                "supports_structured": False,
+                "requires_auth": True,
+            },
+        },
+    },
+    "CohereLabs/command-a-reasoning-08-2025": {
+        "model_id": "CohereLabs/command-a-reasoning-08-2025",
+        "display_name": "Command A Reasoning 08-2025",
+        "requires_auth": True,  # Gated
+        "description": "Cohere's latest reasoning model - Specialized for reasoning tasks",
+        "providers": {
+            "cohere": {
+                "name": "cohere",
+                "input_cost": None,
+                "output_cost": None,
+                "latency": 0.18,
+                "throughput": 94.0,
+                "supports_tools": True,
+                "supports_structured": False,
+                "requires_auth": True,
+            },
+        },
+    },
+    "zai-org/GLM-4.6": {
+        "model_id": "zai-org/GLM-4.6",
+        "display_name": "GLM-4.6",
+        "requires_auth": True,  # Gated
+        "description": "ZAI's GLM-4.6 - High performance reasoning model",
+        "providers": {
+            "cerebras": {
+                "name": "cerebras",
+                "input_cost": None,
+                "output_cost": None,
+                "latency": 0.27,
+                "throughput": 381.0,
+                "supports_tools": True,
+                "supports_structured": False,
+                "requires_auth": True,
+            },
+            "cerebras-fastest": {
+                "name": "cerebras-fastest",
+                "input_cost": None,
+                "output_cost": None,
+                "latency": 0.27,
+                "throughput": 381.0,
+                "supports_tools": True,
+                "supports_structured": False,
+                "requires_auth": True,
+            },
+            "zai-org": {
+                "name": "zai-org",
+                "input_cost": None,
+                "output_cost": None,
+                "latency": 3.08,
+                "throughput": 54.0,
+                "supports_tools": True,
+                "supports_structured": False,
+                "requires_auth": True,
+            },
+        },
+    },
+    "meta-llama/Llama-3.1-8B-Instruct": {
+        "model_id": "meta-llama/Llama-3.1-8B-Instruct",
+        "display_name": "Llama 3.1 8B Instruct",
+        "requires_auth": True,  # Gated
+        "description": "Meta's Llama 3.1 8B - Fast, efficient reasoning",
+        "providers": {
+            "novita": {
+                "name": "novita",
+                "input_cost": 0.02,
+                "output_cost": 0.05,
+                "latency": 0.64,
+                "throughput": 84.0,
+                "supports_tools": False,
+                "supports_structured": False,
+                "requires_auth": True,
+            },
+            "nebius": {
+                "name": "nebius",
+                "input_cost": 0.03,
+                "output_cost": 0.09,
+                "latency": 0.35,
+                "throughput": 194.0,
+                "supports_tools": False,
+                "supports_structured": True,
+                "requires_auth": True,
+            },
+            "cerebras": {
+                "name": "cerebras",
+                "input_cost": 0.1,
+                "output_cost": 0.1,
+                "latency": 0.33,
+                "throughput": 1148.0,
+                "supports_tools": False,
+                "supports_structured": False,
+                "requires_auth": True,
+            },
+            "sambanova": {
+                "name": "sambanova",
+                "input_cost": 0.1,
+                "output_cost": 0.2,
+                "latency": 0.85,
+                "throughput": 527.0,
+                "supports_tools": True,
+                "supports_structured": True,
+                "requires_auth": True,
+            },
+        },
+    },
+    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B": {
+        "model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+        "display_name": "DeepSeek R1 Distill Llama 70B",
+        "requires_auth": True,  # Gated
+        "description": "DeepSeek's reasoning model - Advanced chain-of-thought",
+        "providers": {
+            "novita": {
+                "name": "novita",
+                "input_cost": 0.64,
+                "output_cost": 0.64,
+                "latency": 1.21,
+                "throughput": 31.0,
+                "supports_tools": False,
+                "supports_structured": False,
+                "requires_auth": True,
+            },
+            "sambanova": {
+                "name": "sambanova",
+                "input_cost": 0.7,
+                "output_cost": 1.4,
+                "latency": 2.67,
+                "throughput": 158.0,
+                "supports_tools": False,
+                "supports_structured": False,
+                "requires_auth": True,
+            },
+            "nscale": {
+                "name": "nscale",
+                "input_cost": 0.75,
+                "output_cost": 0.75,
+                "latency": 1.24,
+                "throughput": 16.0,
+                "supports_tools": False,
+                "supports_structured": False,
+                "requires_auth": True,
+            },
+        },
+    },
+    "moonshotai/Kimi-K2-Thinking": {
+        "model_id": "moonshotai/Kimi-K2-Thinking",
+        "display_name": "Kimi K2 Thinking",
+        "requires_auth": True,  # Gated
+        "description": "Moonshot AI's thinking model - Long context reasoning",
+        "providers": {
+            "novita": {
+                "name": "novita",
+                "input_cost": 0.48,
+                "output_cost": 2.0,
+                "latency": 1.60,
+                "throughput": 16.0,
+                "supports_tools": True,
+                "supports_structured": False,
+                "requires_auth": True,
+            },
+            "nebius": {
+                "name": "nebius",
+                "input_cost": 0.6,
+                "output_cost": 2.5,
+                "latency": 0.34,
+                "throughput": 87.0,
+                "supports_tools": True,
+                "supports_structured": True,
+                "requires_auth": True,
+            },
+            "together": {
+                "name": "together",
+                "input_cost": 1.2,
+                "output_cost": 4.0,
+                "latency": 0.86,
+                "throughput": 97.0,
+                "supports_tools": True,
+                "supports_structured": True,
+                "requires_auth": True,
+            },
+        },
+    },
+    "allenai/Olmo-3-7B-Instruct": {
+        "model_id": "allenai/Olmo-3-7B-Instruct",
+        "display_name": "Olmo 3 7B Instruct",
+        "requires_auth": False,  # Ungated
+        "description": "AllenAI's open model - Good reasoning, no auth needed",
+        "providers": {
+            "publicai": {
+                "name": "publicai",
+                "input_cost": None,
+                "output_cost": None,
+                "latency": 1.78,
+                "throughput": 36.0,
+                "supports_tools": True,
+                "supports_structured": True,
+                "requires_auth": False,
+            },
+        },
+    },
+    "Qwen/Qwen2-7B-Instruct": {
+        "model_id": "Qwen/Qwen2-7B-Instruct",
+        "display_name": "Qwen2 7B Instruct",
+        "requires_auth": False,  # Ungated
+        "description": "Qwen's efficient model - Fast, no authentication",
+        "providers": {
+            "featherless-ai": {
+                "name": "featherless-ai",
+                "input_cost": None,
+                "output_cost": None,
+                "latency": None,
+                "throughput": None,
+                "supports_tools": False,
+                "supports_structured": False,
+                "requires_auth": False,
+            },
+        },
+    },
+    "HuggingFaceH4/zephyr-7b-beta": {
+        "model_id": "HuggingFaceH4/zephyr-7b-beta",
+        "display_name": "Zephyr 7B Beta",
+        "requires_auth": False,  # Ungated
+        "description": "HuggingFace's fine-tuned model - Free tier friendly",
+        "providers": {
+            "featherless-ai": {
+                "name": "featherless-ai",
+                "input_cost": None,
+                "output_cost": None,
+                "latency": None,
+                "throughput": None,
+                "supports_tools": False,
+                "supports_structured": False,
+                "requires_auth": False,
+            },
+        },
+    },
+    "google/gemma-2-2b-it": {
+        "model_id": "google/gemma-2-2b-it",
+        "display_name": "Gemma 2 2B IT",
+        "requires_auth": True,  # Gated
+        "description": "Google's compact model - Small but capable",
+        "providers": {
+            "nebius": {
+                "name": "nebius",
+                "input_cost": None,
+                "output_cost": None,
+                "latency": None,
+                "throughput": None,
+                "supports_tools": False,
+                "supports_structured": False,
+                "requires_auth": True,
+            },
+        },
+    },
+    "microsoft/Phi-3-mini-4k-instruct": {
+        "model_id": "microsoft/Phi-3-mini-4k-instruct",
+        "display_name": "Phi-3 Mini 4K Instruct",
+        "requires_auth": False,  # Ungated
+        "description": "Microsoft's efficient model - Fast inference",
+        "providers": {
+            "featherless-ai": {
+                "name": "featherless-ai",
+                "input_cost": None,
+                "output_cost": None,
+                "latency": None,
+                "throughput": None,
+                "supports_tools": False,
+                "supports_structured": False,
+                "requires_auth": False,
+            },
+        },
+    },
+}
+def get_available_models(has_auth: bool = False) -> list[tuple[str, str]]:
+    """
+    Get list of available models based on authentication status.
+    Args:
+        has_auth: Whether user has authentication (OAuth or HF_TOKEN)
+    Returns:
+        List of (model_id, display_name) tuples for dropdown
+    """
+    models = []
+    for model_id, model_info in INFERENCE_MODELS.items():
+        # If no auth, only show ungated models
+        if not has_auth and model_info["requires_auth"]:
+            continue
+        models.append((model_id, model_info["display_name"]))
+    return models
+def get_available_providers(model_id: str, has_auth: bool = False) -> list[tuple[str, str]]:
+    """
+    Get list of available providers for a model based on authentication.
+    This is a convenience wrapper around get_available_providers_verified
+    that doesn't perform async verification.
+    Args:
+        model_id: The model ID
+        has_auth: Whether user has authentication
+    Returns:
+        List of (provider_name, display_name) tuples for dropdown
+    """
+    return get_available_providers_verified(model_id, has_auth=has_auth, verify=False)
+def get_model_info(model_id: str) -> InferenceModel | None:
+    """Get model information."""
+    return INFERENCE_MODELS.get(model_id)
+def get_provider_info(model_id: str, provider_name: str) -> ModelProvider | None:
+    """Get provider information for a model."""
+    model = INFERENCE_MODELS.get(model_id)
+    if not model:
+        return None
+    return model["providers"].get(provider_name)
+def verify_provider_availability(
+    model_id: str,
+    provider_name: str,
+) -> bool:
+    """
+    Verify that a model is available on the specified provider (static check).
+    This function checks the static configuration to see if a provider
+    is listed for the model. For dynamic verification via API calls,
+    use verify_provider_availability_async().
+    Args:
+        model_id: The model ID to verify
+        provider_name: The provider name to verify
+    Returns:
+        True if the model is configured for the provider, False otherwise
+    """
+    model_config = INFERENCE_MODELS.get(model_id)
+    if not model_config:
+        return False
+    providers = model_config.get("providers", {})
+    return provider_name in providers
+async def verify_provider_availability_async(
+    model_id: str,
+    provider_name: str,
+    api_key: str | None = None,
+) -> bool:
+    """
+    Verify that a model is actually available on the specified provider via API.
+    This function attempts to check if the model/provider combination is valid
+    by making a lightweight API call to the HuggingFace Inference API.
+    Note: This is an async function and should be called from an async context.
+    For synchronous checks, use verify_provider_availability().
+    Args:
+        model_id: The model ID to verify
+        provider_name: The provider name to verify
+        api_key: Optional API key for authentication (uses env vars if not provided)
+    Returns:
+        True if the model is available on the provider, False otherwise
+    """
+    # For now, fall back to static check
+    # TODO: Implement actual API verification when needed
+    return verify_provider_availability(model_id, provider_name)
+def get_available_providers_verified(
+    model_id: str,
+    has_auth: bool = False,
+    api_key: str | None = None,
+    verify: bool = False,
+) -> list[tuple[str, str]]:
+    """
+    Get list of available providers for a model with optional verification.
+    Args:
+        model_id: The model ID
+        has_auth: Whether user has authentication
+        api_key: Optional API key for verification
+        verify: Whether to verify provider availability (async, requires api_key)
+    Returns:
+        List of (provider_name, display_name) tuples for dropdown
+    """
+    if model_id not in INFERENCE_MODELS:
+        return []
+    model = INFERENCE_MODELS[model_id]
+    providers = []
+    for provider_name, provider_info in model["providers"].items():
+        # If no auth, only show providers that don't require auth
+        if not has_auth and provider_info["requires_auth"]:
+            continue
+        # Create display name with cost/latency info
+        display_parts = [provider_name]
+        if provider_info["latency"]:
+            display_parts.append(f"{provider_info['latency']:.2f}s")
+        if provider_info["input_cost"]:
+            display_parts.append(f"${provider_info['input_cost']}/1M")
+        if provider_info["supports_tools"]:
+            display_parts.append("🔧")
+        if provider_info["supports_structured"]:
+            display_parts.append("📊")
+        display_name = " | ".join(display_parts)
+        providers.append((provider_name, display_name))
+    # Note: If verify=True, this should be called from an async context
+    # For now, we return static providers. Async verification can be done separately.
+    return providers