Spaces:

MCP-1st-Birthday
/

DETERMINATOR

Running

App Files Files Community

Joseph Pollack commited on 10 days ago

Commit

0467062

unverified ·

1 Parent(s): 1ea3854

adds new features and graphs integration with configuration options

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.env.example +54 -0
WEB_SEARCH_TOOL_ASSESSMENT.md +239 -0
docs/api/agents.md +2 -0
docs/api/models.md +2 -0
docs/api/orchestrators.md +2 -0
docs/api/services.md +2 -0
docs/api/tools.md +2 -0
docs/architecture/agents.md +2 -0
docs/architecture/middleware.md +2 -0
docs/architecture/services.md +2 -0
docs/architecture/tools.md +2 -0
docs/contributing/code-quality.md +2 -0
docs/contributing/code-style.md +2 -0
docs/contributing/error-handling.md +2 -0
docs/contributing/implementation-patterns.md +2 -0
docs/contributing/index.md +2 -0
docs/contributing/prompt-engineering.md +2 -0
docs/contributing/testing.md +2 -0
docs/getting-started/examples.md +2 -0
docs/getting-started/installation.md +2 -0
docs/getting-started/mcp-integration.md +2 -0
docs/getting-started/quick-start.md +2 -0
docs/implementation/IMPLEMENTATION_SUMMARY.md +180 -0
docs/implementation/TOKEN_AUTHENTICATION_REVIEW.md +201 -0
docs/implementation/TTS_MODAL_IMPLEMENTATION.md +134 -0
docs/license.md +2 -0
docs/overview/architecture.md +2 -0
docs/overview/features.md +2 -0
docs/team.md +2 -0
new_env.txt +96 -0
pyproject.toml +10 -6
src/agent_factory/judges.py +16 -7
src/app.py +193 -23
src/mcp_tools.py +78 -0
src/middleware/state_machine.py +2 -0
src/orchestrator/graph_orchestrator.py +102 -3
src/orchestrator_factory.py +2 -0
src/services/audio_processing.py +134 -0
src/services/image_ocr.py +242 -0
src/services/llamaindex_rag.py +16 -3
src/services/multimodal_processing.py +136 -0
src/services/stt_gradio.py +271 -0
src/services/tts_modal.py +260 -0
src/tools/crawl_adapter.py +2 -0
src/tools/rag_tool.py +12 -2
src/tools/search_handler.py +6 -1
src/tools/web_search_adapter.py +2 -0
src/utils/config.py +66 -0
src/utils/llm_factory.py +25 -9
tests/unit/middleware/__init__.py +2 -0

.env.example CHANGED Viewed

@@ -11,6 +11,52 @@ ANTHROPIC_API_KEY=sk-ant-your-key-here
 # ANTHROPIC_MODEL=claude-sonnet-4-5-20250929
 # OPENAI_MODEL=gpt-5.1
 # ============== EMBEDDINGS ==============
 # OpenAI Embedding Model (used if LLM_PROVIDER is openai and performing RAG/Embeddings)
@@ -39,6 +85,14 @@ MAX_ITERATIONS=10
 SEARCH_TIMEOUT=30
 LOG_LEVEL=INFO
 # ============== EXTERNAL SERVICES ==============
 # PubMed (optional - higher rate limits)

 # ANTHROPIC_MODEL=claude-sonnet-4-5-20250929
 # OPENAI_MODEL=gpt-5.1
+# ============================================
+# Audio Processing Configuration (TTS)
+# ============================================
+# Kokoro TTS Model Configuration
+TTS_MODEL=hexgrad/Kokoro-82M
+TTS_VOICE=af_heart
+TTS_SPEED=1.0
+TTS_GPU=T4
+TTS_TIMEOUT=60
+# Available TTS Voices:
+# American English Female: af_heart, af_bella, af_nicole, af_aoede, af_kore, af_sarah, af_nova, af_sky, af_alloy, af_jessica, af_river
+# American English Male: am_michael, am_fenrir, am_puck, am_echo, am_eric, am_liam, am_onyx, am_santa, am_adam
+# Available GPU Types (Modal):
+# T4 - Cheapest, good for testing (default)
+# A10 - Good balance of cost/performance
+# A100 - Fastest, most expensive
+# L4 - NVIDIA L4 GPU
+# L40S - NVIDIA L40S GPU
+# Note: GPU type is set at function definition time. Changes require app restart.
+# ============================================
+# Audio Processing Configuration (STT)
+# ============================================
+# Speech-to-Text API Configuration
+STT_API_URL=nvidia/canary-1b-v2
+STT_SOURCE_LANG=English
+STT_TARGET_LANG=English
+# Available STT Languages:
+# English, Bulgarian, Croatian, Czech, Danish, Dutch, Estonian, Finnish, French, German, Greek, Hungarian, Italian, Latvian, Lithuanian, Maltese, Polish, Portuguese, Romanian, Slovak, Slovenian, Spanish, Swedish, Russian, Ukrainian
+# ============================================
+# Audio Feature Flags
+# ============================================
+ENABLE_AUDIO_INPUT=true
+ENABLE_AUDIO_OUTPUT=true
+# ============================================
+# Image OCR Configuration
+# ============================================
+OCR_API_URL=prithivMLmods/Multimodal-OCR3
+ENABLE_IMAGE_INPUT=true
 # ============== EMBEDDINGS ==============
 # OpenAI Embedding Model (used if LLM_PROVIDER is openai and performing RAG/Embeddings)
 SEARCH_TIMEOUT=30
 LOG_LEVEL=INFO
+# ============================================
+# Modal Configuration (Required for TTS)
+# ============================================
+# Modal credentials are required for TTS (Text-to-Speech) functionality
+# Get your credentials from: https://modal.com/
+MODAL_TOKEN_ID=your_modal_token_id_here
+MODAL_TOKEN_SECRET=your_modal_token_secret_here
 # ============== EXTERNAL SERVICES ==============
 # PubMed (optional - higher rate limits)

WEB_SEARCH_TOOL_ASSESSMENT.md ADDED Viewed

	@@ -0,0 +1,239 @@

+# Web Search Tool Assessment
+## Executive Summary
+The application has **two separate web search implementations** with different readiness levels:
+1. **`WebSearchTool`** (`src/tools/web_search.py`) - **Partially Ready** ⚠️
+   - Functional but **NOT compliant** with `SearchTool` protocol
+   - **NOT integrated** into main search handler
+   - Only used in magentic orchestrator's retrieval agent
+2. **`web_search_adapter`** (`src/tools/web_search_adapter.py`) - **Functional** ✅
+   - Used by tool executor for WebSearchAgent tasks
+   - Relies on legacy `folder/tools/web_search.py` implementation
+## Detailed Analysis
+### 1. WebSearchTool (`src/tools/web_search.py`)
+#### Current Implementation
+- **Location**: `src/tools/web_search.py`
+- **Provider**: DuckDuckGo (no API key required)
+- **Status**: ⚠️ **Partially Ready**
+#### Issues Identified
+**❌ Protocol Non-Compliance:**
+```python
+# Missing required 'name' property
+class WebSearchTool:
+    # Should have: @property def name(self) -> str: return "web"
+    # Wrong return type - should return list[Evidence], not SearchResult
+    async def search(self, query: str, max_results: int = 10) -> SearchResult:
+        # Returns SearchResult instead of list[Evidence]
+```
+**Comparison with other tools:**
+- `PubMedTool` has `@property def name(self) -> str: return "pubmed"`
+- `PubMedTool.search()` returns `list[Evidence]`
+- `WebSearchTool` returns `SearchResult` (contains `evidence` list inside)
+**❌ Not Integrated:**
+- **NOT** included in `SearchHandler` initialization in `src/app.py`:
+  ```python
+  search_handler = SearchHandler(
+      tools=[PubMedTool(), ClinicalTrialsTool(), EuropePMCTool()],
+      # WebSearchTool() is missing!
+  )
+  ```
+**✅ Current Usage:**
+- Used in `src/agents/retrieval_agent.py` (magentic orchestrator):
+  ```python
+  from src.tools.web_search import WebSearchTool
+  _web_search = WebSearchTool()
+  ```
+#### Fix Required
+To make `WebSearchTool` compliant and usable:
+1. **Add `name` property:**
+   ```python
+   @property
+   def name(self) -> str:
+       return "web"
+   ```
+2. **Fix return type:**
+   ```python
+   async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
+       # ... existing code ...
+       return evidence  # Return list[Evidence] directly, not SearchResult
+   ```
+3. **Register in SearchHandler:**
+   ```python
+   from src.tools.web_search import WebSearchTool
+   search_handler = SearchHandler(
+       tools=[
+           PubMedTool(),
+           ClinicalTrialsTool(),
+           EuropePMCTool(),
+           WebSearchTool()  # Add this
+       ],
+   )
+   ```
+---
+### 2. web_search_adapter (`src/tools/web_search_adapter.py`)
+#### Current Implementation
+- **Location**: `src/tools/web_search_adapter.py`
+- **Status**: ✅ **Functional**
+- **Provider**: Uses legacy `folder/tools/web_search.py` (Serper/SearchXNG)
+#### Usage
+- Used by `src/tools/tool_executor.py` for `WebSearchAgent` tasks:
+  ```python
+  if task.agent == "WebSearchAgent":
+      result_text = await web_search(task.query)
+  ```
+- Used by `src/orchestrator/planner_agent.py` for background context
+#### Dependencies
+- Requires `folder/tools/web_search.py` (legacy implementation)
+- Supports Serper API (requires `SERPER_API_KEY`)
+- Supports SearchXNG API (requires `SEARCHXNG_HOST`)
+#### Limitations
+- Returns formatted string (not `Evidence` objects)
+- Not integrated with `SearchHandler` (different execution path)
+- Depends on legacy folder structure
+---
+## Integration Status
+### SearchHandler Integration
+**Current State**: ❌ **NOT Integrated**
+The main `SearchHandler` in `src/app.py` only includes:
+- `PubMedTool()`
+- `ClinicalTrialsTool()`
+- `EuropePMCTool()`
+**WebSearchTool is missing from the main search flow.**
+### Tool Executor Integration
+**Current State**: ✅ **Integrated**
+`web_search_adapter` is used via `tool_executor.py`:
+- Executes when `AgentTask.agent == "WebSearchAgent"`
+- Used in iterative/deep research flows
+- Returns formatted text (not Evidence objects)
+### Magentic Orchestrator Integration
+**Current State**: ✅ **Integrated**
+`WebSearchTool` is used in `retrieval_agent.py`:
+- Direct instantiation: `_web_search = WebSearchTool()`
+- Used via `search_web()` function
+- Updates workflow state with evidence
+---
+## Can It Be Used?
+### WebSearchTool (`src/tools/web_search.py`)
+**Status**: ⚠️ **Can be used, but with limitations**
+**Can be used:**
+- ✅ In magentic orchestrator (already working)
+- ✅ As standalone tool (functional)
+**Cannot be used:**
+- ❌ In `SearchHandler` (protocol non-compliance)
+- ❌ In parallel search flows (not registered)
+**To make fully usable:**
+1. Fix protocol compliance (add `name`, fix return type)
+2. Register in `SearchHandler`
+3. Test integration
+### web_search_adapter
+**Status**: ✅ **Can be used**
+**Can be used:**
+- ✅ Via `tool_executor` for WebSearchAgent tasks
+- ✅ In planner agent for background context
+- ✅ In iterative/deep research flows
+**Limitations:**
+- Returns string format (not Evidence objects)
+- Requires legacy folder dependencies
+- Different execution path than SearchHandler
+---
+## Recommendations
+### Priority 1: Fix WebSearchTool Protocol Compliance
+Make `WebSearchTool` fully compliant with `SearchTool` protocol:
+1. Add `name` property
+2. Change return type from `SearchResult` to `list[Evidence]`
+3. Update all callers if needed
+### Priority 2: Integrate into SearchHandler
+Add `WebSearchTool` to main search flow:
+```python
+from src.tools.web_search import WebSearchTool
+search_handler = SearchHandler(
+    tools=[
+        PubMedTool(),
+        ClinicalTrialsTool(),
+        EuropePMCTool(),
+        WebSearchTool()  # Add web search
+    ],
+)
+```
+### Priority 3: Consolidate Implementations
+Consider consolidating the two implementations:
+- Keep `WebSearchTool` as the main implementation
+- Deprecate or migrate `web_search_adapter` usage
+- Remove dependency on `folder/tools/web_search.py`
+### Priority 4: Testing
+Add tests for:
+- Protocol compliance
+- SearchHandler integration
+- Error handling
+- Rate limiting (if needed)
+---
+## Summary Table
+| Component | Status | Protocol Compliant | Integrated | Can Be Used |
+|-----------|--------|-------------------|------------|-------------|
+| `WebSearchTool` | ⚠️ Partial | ❌ No | ❌ No | ⚠️ Limited |
+| `web_search_adapter` | ✅ Functional | N/A | ✅ Yes (tool_executor) | ✅ Yes |
+---
+## Conclusion
+The web search functionality exists in two forms:
+1. **`WebSearchTool`** is functional but needs protocol fixes to be fully integrated
+2. **`web_search_adapter`** is working but uses a different execution path
+**Recommendation**: Fix `WebSearchTool` protocol compliance and integrate it into `SearchHandler` for unified search capabilities across all orchestrators.

docs/api/agents.md CHANGED Viewed

	@@ -268,3 +268,5 @@ def create_input_parser_agent(model: Any \| None = None) -> InputParserAgent
268
269
270


268
269
270
271	+
272	+

docs/api/models.md CHANGED Viewed

	@@ -246,3 +246,5 @@ class BudgetStatus(BaseModel):
246
247
248


246
247
248
249	+
250	+

docs/api/orchestrators.md CHANGED Viewed

	@@ -193,3 +193,5 @@ Runs Magentic orchestration.
193
194
195


193
194
195
196	+
197	+

docs/api/services.md CHANGED Viewed

	@@ -199,3 +199,5 @@ Analyzes a hypothesis using statistical methods.
199
200
201


199
200
201
202	+
203	+

docs/api/tools.md CHANGED Viewed

	@@ -233,3 +233,5 @@ Searches multiple tools in parallel.
233
234
235


233
234
235
236	+
237	+

docs/architecture/agents.md CHANGED Viewed

	@@ -190,3 +190,5 @@ Factory functions:
190
191
192


190
191
192
193	+
194	+

docs/architecture/middleware.md CHANGED Viewed

	@@ -140,3 +140,5 @@ All middleware components use `ContextVar` for thread-safe isolation:
140
141
142


140
141
142
143	+
144	+

docs/architecture/services.md CHANGED Viewed

	@@ -140,3 +140,5 @@ if settings.has_openai_key:
140
141
142


140
141
142
143	+
144	+

docs/architecture/tools.md CHANGED Viewed

	@@ -173,3 +173,5 @@ search_handler = SearchHandler(
173
174
175


173
174
175
176	+
177	+

docs/contributing/code-quality.md CHANGED Viewed

	@@ -79,3 +79,5 @@ async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
79
80
81


79
80
81
82	+
83	+

docs/contributing/code-style.md CHANGED Viewed

	@@ -59,3 +59,5 @@ result = await loop.run_in_executor(None, cpu_bound_function, args)
59
60
61


59
60
61
62	+
63	+

docs/contributing/error-handling.md CHANGED Viewed

	@@ -67,3 +67,5 @@ except httpx.HTTPError as e:
67
68
69


67
68
69
70	+
71	+

docs/contributing/implementation-patterns.md CHANGED Viewed

	@@ -82,3 +82,5 @@ def get_embedding_service() -> EmbeddingService:
82
83
84


82
83
84
85	+
86	+

docs/contributing/index.md CHANGED Viewed

	@@ -161,3 +161,5 @@ Thank you for contributing to DeepCritical!
161
162
163


161
162
163
164	+
165	+

docs/contributing/prompt-engineering.md CHANGED Viewed

	@@ -67,3 +67,5 @@ This document outlines prompt engineering guidelines and citation validation rul
67
68
69


67
68
69
70	+
71	+

docs/contributing/testing.md CHANGED Viewed

	@@ -63,3 +63,5 @@ async def test_real_pubmed_search():
63
64
65


63
64
65
66	+
67	+

docs/getting-started/examples.md CHANGED Viewed

	@@ -207,3 +207,5 @@ USE_GRAPH_EXECUTION=true
207
208
209


207
208
209
210	+
211	+

docs/getting-started/installation.md CHANGED Viewed

	@@ -146,3 +146,5 @@ uv run pre-commit install
146
147
148


146
147
148
149	+
150	+

docs/getting-started/mcp-integration.md CHANGED Viewed

	@@ -213,3 +213,5 @@ You can configure multiple DeepCritical instances:
213
214
215


213
214
215
216	+
217	+

docs/getting-started/quick-start.md CHANGED Viewed

	@@ -117,3 +117,5 @@ What are the active clinical trials investigating Alzheimer's disease treatments
117
118
119


117
118
119
120	+
121	+

docs/implementation/IMPLEMENTATION_SUMMARY.md ADDED Viewed

	@@ -0,0 +1,180 @@

+# Multimodal Audio & Image Integration - Implementation Summary
+## ✅ Completed Implementation
+### 1. Configuration System (`src/utils/config.py`)
+- ✅ Added audio configuration fields:
+  - `tts_model`, `tts_voice`, `tts_speed`, `tts_gpu`, `tts_timeout`
+  - `stt_api_url`, `stt_source_lang`, `stt_target_lang`
+  - `enable_audio_input`, `enable_audio_output`
+- ✅ Added image OCR configuration:
+  - `ocr_api_url`, `enable_image_input`
+- ✅ Added property methods: `audio_available`, `image_ocr_available`
+### 2. STT Service (`src/services/stt_gradio.py`)
+- ✅ Gradio Client integration for nvidia/canary-1b-v2
+- ✅ Supports file and numpy array audio input
+- ✅ Async transcription with error handling
+- ✅ Singleton factory pattern
+### 3. TTS Service (`src/services/tts_modal.py`)
+- ✅ **Modal GPU function implementation** following Modal documentation
+- ✅ Kokoro 82M integration via Modal GPU
+- ✅ Module-level function definition with lazy initialization
+- ✅ GPU configuration (T4, A10, A100, L4, L40S)
+- ✅ Async wrapper for TTS synthesis
+- ✅ Error handling and graceful degradation
+### 4. Image OCR Service (`src/services/image_ocr.py`)
+- ✅ Gradio Client integration for prithivMLmods/Multimodal-OCR3
+- ✅ Supports image files and PIL/numpy arrays
+- ✅ Text extraction from API results
+- ✅ Singleton factory pattern
+### 5. Unified Services
+- ✅ `src/services/audio_processing.py` - Audio service layer
+- ✅ `src/services/multimodal_processing.py` - Multimodal service layer
+### 6. ChatInterface Integration (`src/app.py`)
+- ✅ Enabled `multimodal=True` for MultimodalTextbox
+- ✅ Added Audio output component
+- ✅ Integrated STT/TTS/OCR into research flow
+- ✅ Multimodal input processing (text + images + audio)
+- ✅ TTS output generation for final responses
+- ✅ **Configuration UI in Settings Accordion**:
+  - Voice dropdown (20+ Kokoro voices)
+  - Speed slider (0.5x to 2.0x)
+  - GPU dropdown (T4, A10, A100, L4, L40S) - read-only, requires restart
+  - Enable audio output checkbox
+- ✅ Configuration values passed from UI to TTS service
+### 7. MCP Integration (`src/mcp_tools.py`)
+- ✅ Added `extract_text_from_image` MCP tool
+- ✅ Added `transcribe_audio_file` MCP tool
+- ✅ Enabled MCP server in app launch
+### 8. Dependencies (`pyproject.toml`)
+- ✅ Added audio dependencies (gradio-client, soundfile, Pillow)
+- ✅ Added TTS optional dependencies (torch, transformers)
+- ✅ Installed via `uv add --optional`
+## 🔧 Modal GPU Implementation Details
+### Function Definition Pattern
+The Modal GPU function is defined using Modal's recommended pattern:
+```python
+@app.function(
+    image=tts_image,  # Image with Kokoro dependencies
+    gpu="T4",  # GPU type from settings.tts_gpu
+    timeout=60,  # Timeout from settings.tts_timeout
+)
+def kokoro_tts_function(text: str, voice: str, speed: float) -> tuple[int, np.ndarray]:
+    """Modal GPU function for Kokoro TTS."""
+    from kokoro import KModel, KPipeline
+    import torch
+    model = KModel().to("cuda").eval()
+    pipeline = KPipeline(lang_code=voice[0])
+    pack = pipeline.load_voice(voice)
+    for _, ps, _ in pipeline(text, voice, speed):
+        ref_s = pack[len(ps) - 1]
+        audio = model(ps, ref_s, speed)
+        return (24000, audio.numpy())
+```
+### Key Implementation Points
+1. **Module-Level Definition**: Function defined inside `_setup_modal_function()` but attached to app instance
+2. **Lazy Initialization**: Function set up on first use
+3. **GPU Configuration**: Set at function definition time (requires restart to change)
+4. **Runtime Parameters**: Voice and speed can be changed at runtime via UI
+## 🔗 Configuration Flow
+### Settings → Implementation
+1. `settings.tts_voice` → Default voice (used if UI not configured)
+2. `settings.tts_speed` → Default speed (used if UI not configured)
+3. `settings.tts_gpu` → GPU type (set at function definition, requires restart)
+4. `settings.tts_timeout` → Timeout (set at function definition)
+### UI → Implementation
+1. Voice dropdown → `tts_voice` parameter → `AudioService.generate_audio_output()`
+2. Speed slider → `tts_speed` parameter → `AudioService.generate_audio_output()`
+3. GPU dropdown → Informational only (changes require restart)
+4. Enable checkbox → `settings.enable_audio_output` → Controls TTS generation
+### Implementation → Modal
+1. `TTSService.synthesize_async()` → Calls Modal GPU function
+2. Modal function executes on GPU → Returns audio tuple
+3. Audio tuple → Gradio Audio component → User hears response
+## 📋 Configuration Points in UI
+### Settings Accordion Components
+Located in `src/app.py` lines 667-712:
+1. **Voice Dropdown** (`tts_voice_dropdown`)
+   - 20+ Kokoro voices
+   - Default: `settings.tts_voice`
+   - Connected to `research_agent()` function
+2. **Speed Slider** (`tts_speed_slider`)
+   - Range: 0.5 to 2.0
+   - Step: 0.1
+   - Default: `settings.tts_speed`
+   - Connected to `research_agent()` function
+3. **GPU Dropdown** (`tts_gpu_dropdown`)
+   - Choices: T4, A10, A100, L4, L40S
+   - Default: `settings.tts_gpu or "T4"`
+   - Read-only (interactive=False)
+   - Note: Changes require app restart
+4. **Enable Audio Output** (`enable_audio_output_checkbox`)
+   - Default: `settings.enable_audio_output`
+   - Controls whether TTS is generated
+## 🎯 Usage Flow
+1. User opens Settings accordion
+2. Configures TTS voice and speed (optional)
+3. Submits query (text, image, or audio)
+4. Research agent processes query
+5. Final response generated
+6. If audio output enabled:
+   - `AudioService.generate_audio_output()` called
+   - Uses UI-configured voice/speed or settings defaults
+   - Modal GPU function synthesizes audio
+   - Audio displayed in Audio component
+## 📝 Notes
+- **GPU Changes**: GPU type is set at Modal function definition time. Changes to `settings.tts_gpu` or UI dropdown require app restart.
+- **Voice/Speed Changes**: Can be changed at runtime via UI - no restart required.
+- **Graceful Degradation**: If TTS fails, application continues with text-only response.
+- **Modal Credentials**: Required for TTS. If not configured, TTS service unavailable (graceful fallback).
+## ✅ Verification Checklist
+- [x] Modal GPU function correctly defined with `@app.function` decorator
+- [x] GPU parameter set from `settings.tts_gpu`
+- [x] Timeout parameter set from `settings.tts_timeout`
+- [x] Voice parameter passed from UI dropdown
+- [x] Speed parameter passed from UI slider
+- [x] Configuration UI elements in Settings accordion
+- [x] Configuration values connected to implementation
+- [x] Dependencies installed via uv
+- [x] Error handling and graceful degradation
+- [x] MCP tools added for audio/image processing
+## 🚀 Next Steps
+1. Test TTS with Modal credentials configured
+2. Verify GPU function execution on Modal
+3. Test voice and speed changes at runtime
+4. Add unit tests for services
+5. Add integration tests for Modal TTS

docs/implementation/TOKEN_AUTHENTICATION_REVIEW.md ADDED Viewed

	@@ -0,0 +1,201 @@

+# Token Authentication Review - Gradio & HuggingFace
+## Summary
+This document reviews the implementation of token authentication for Gradio Client API calls and HuggingFace API usage to ensure tokens are always passed correctly.
+## ✅ Implementation Status
+### 1. Gradio Client Services
+#### STT Service (`src/services/stt_gradio.py`)
+- ✅ **Token Support**: Service accepts `hf_token` parameter in `__init__` and methods
+- ✅ **Client Initialization**: `Client` is created with `hf_token` parameter when token is available
+- ✅ **Token Priority**: Method-level token > instance-level token
+- ✅ **Token Updates**: Client is recreated if token changes
+**Implementation Pattern:**
+```python
+async def _get_client(self, hf_token: str | None = None) -> Client:
+    token = hf_token or self.hf_token
+    if token:
+        self.client = Client(self.api_url, hf_token=token)
+    else:
+        self.client = Client(self.api_url)
+```
+#### Image OCR Service (`src/services/image_ocr.py`)
+- ✅ **Token Support**: Service accepts `hf_token` parameter in `__init__` and methods
+- ✅ **Client Initialization**: `Client` is created with `hf_token` parameter when token is available
+- ✅ **Token Priority**: Method-level token > instance-level token
+- ✅ **Token Updates**: Client is recreated if token changes
+**Same pattern as STT Service**
+### 2. Service Layer Integration
+#### Audio Service (`src/services/audio_processing.py`)
+- ✅ **Token Passthrough**: `process_audio_input()` accepts `hf_token` and passes to STT service
+- ✅ **Token Flow**: `audio_service.process_audio_input(audio, hf_token=token)`
+#### Multimodal Service (`src/services/multimodal_processing.py`)
+- ✅ **Token Passthrough**: `process_multimodal_input()` accepts `hf_token` and passes to both audio and OCR services
+- ✅ **Token Flow**: `multimodal_service.process_multimodal_input(..., hf_token=token)`
+### 3. Application Layer (`src/app.py`)
+#### Token Extraction
+- ✅ **OAuth Token**: Extracted from `gr.OAuthToken` via `oauth_token.token`
+- ✅ **Fallback**: Uses `HF_TOKEN` or `HUGGINGFACE_API_KEY` from environment
+- ✅ **Token Priority**: `oauth_token > HF_TOKEN > HUGGINGFACE_API_KEY`
+**Implementation:**
+```python
+token_value: str | None = None
+if oauth_token is not None:
+    token_value = oauth_token.token if hasattr(oauth_token, "token") else None
+# Fallback to env vars
+effective_token = token_value or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")
+```
+#### Token Usage in Services
+- ✅ **Multimodal Processing**: Token passed to `process_multimodal_input(..., hf_token=token_value)`
+- ✅ **Consistent Usage**: Token is extracted once and passed through all service layers
+### 4. HuggingFace API Integration
+#### LLM Factory (`src/utils/llm_factory.py`)
+- ✅ **Token Priority**: `oauth_token > settings.hf_token > settings.huggingface_api_key`
+- ✅ **Provider Usage**: `HuggingFaceProvider(api_key=effective_hf_token)`
+- ✅ **Model Usage**: `HuggingFaceModel(model_name, provider=provider)`
+#### Judge Handler (`src/agent_factory/judges.py`)
+- ✅ **Token Priority**: `oauth_token > settings.hf_token > settings.huggingface_api_key`
+- ✅ **InferenceClient**: `InferenceClient(api_key=api_key)` when token provided
+- ✅ **Fallback**: Uses `HF_TOKEN` from environment if no token provided
+**Implementation:**
+```python
+effective_hf_token = oauth_token or settings.hf_token or settings.huggingface_api_key
+hf_provider = HuggingFaceProvider(api_key=effective_hf_token)
+```
+### 5. MCP Tools (`src/mcp_tools.py`)
+#### Image OCR Tool
+- ✅ **Token Support**: `extract_text_from_image()` accepts `hf_token` parameter
+- ✅ **Token Fallback**: Uses `settings.hf_token` or `settings.huggingface_api_key` if not provided
+- ✅ **Service Integration**: Passes token to `ImageOCRService.extract_text()`
+#### Audio Transcription Tool
+- ✅ **Token Support**: `transcribe_audio_file()` accepts `hf_token` parameter
+- ✅ **Token Fallback**: Uses `settings.hf_token` or `settings.huggingface_api_key` if not provided
+- ✅ **Service Integration**: Passes token to `STTService.transcribe_file()`
+## Token Flow Diagram
+```
+User Login (OAuth)
+    ↓
+oauth_token.token
+    ↓
+app.py: token_value
+    ↓
+┌─────────────────────────────────────┐
+│  Service Layer                       │
+├─────────────────────────────────────┤
+│  MultimodalService                   │
+│    ↓ hf_token=token_value            │
+│  AudioService                        │
+│    ↓ hf_token=token_value            │
+│  STTService / ImageOCRService        │
+│    ↓ hf_token=token_value            │
+│  Gradio Client(hf_token=token)       │
+└─────────────────────────────────────┘
+Alternative: Environment Variables
+    ↓
+HF_TOKEN or HUGGINGFACE_API_KEY
+    ↓
+settings.hf_token or settings.huggingface_api_key
+    ↓
+Same service flow as above
+```
+## Verification Checklist
+- [x] STT Service accepts and uses `hf_token` parameter
+- [x] Image OCR Service accepts and uses `hf_token` parameter
+- [x] Audio Service passes token to STT service
+- [x] Multimodal Service passes token to both audio and OCR services
+- [x] App.py extracts OAuth token correctly
+- [x] App.py passes token to multimodal service
+- [x] HuggingFace API calls use token via `HuggingFaceProvider`
+- [x] HuggingFace API calls use token via `InferenceClient`
+- [x] MCP tools accept and use token parameter
+- [x] Token priority is consistent: OAuth > Env Vars
+- [x] Fallback to environment variables when OAuth not available
+## Token Parameter Naming
+All services consistently use `hf_token` parameter name:
+- `STTService.transcribe_audio(..., hf_token=...)`
+- `STTService.transcribe_file(..., hf_token=...)`
+- `ImageOCRService.extract_text(..., hf_token=...)`
+- `ImageOCRService.extract_text_from_image(..., hf_token=...)`
+- `AudioService.process_audio_input(..., hf_token=...)`
+- `MultimodalService.process_multimodal_input(..., hf_token=...)`
+- `extract_text_from_image(..., hf_token=...)` (MCP tool)
+- `transcribe_audio_file(..., hf_token=...)` (MCP tool)
+## Gradio Client API Usage
+According to Gradio documentation, the `Client` constructor accepts:
+```python
+Client(space_name, hf_token=None)
+```
+Our implementation correctly uses:
+```python
+Client(self.api_url, hf_token=token)  # When token available
+Client(self.api_url)  # When no token (public Space)
+```
+## HuggingFace API Usage
+### HuggingFaceProvider
+```python
+HuggingFaceProvider(api_key=effective_hf_token)
+```
+✅ Correctly passes token as `api_key` parameter
+### InferenceClient
+```python
+InferenceClient(api_key=api_key)  # When token provided
+InferenceClient()  # Falls back to HF_TOKEN env var
+```
+✅ Correctly passes token as `api_key` parameter
+## Edge Cases Handled
+1. **No Token Available**: Services work without token (public Gradio Spaces)
+2. **Token Changes**: Client is recreated when token changes
+3. **OAuth vs Env**: OAuth token takes priority over environment variables
+4. **Multiple Token Sources**: Consistent priority across all services
+5. **MCP Tools**: Support both explicit token and fallback to settings
+## Recommendations
+✅ **All implementations are correct and consistent**
+The token authentication is properly implemented throughout:
+- Gradio Client services accept and use tokens
+- Service layer passes tokens through correctly
+- Application layer extracts and passes OAuth tokens
+- HuggingFace API calls use tokens via correct parameters
+- MCP tools support token authentication
+- Token priority is consistent across all layers
+No changes needed - implementation follows best practices.

docs/implementation/TTS_MODAL_IMPLEMENTATION.md ADDED Viewed

	@@ -0,0 +1,134 @@

+# TTS Modal GPU Implementation
+## Overview
+The TTS (Text-to-Speech) service uses Kokoro 82M model running on Modal's GPU infrastructure. This document describes the implementation details and configuration.
+## Implementation Details
+### Modal GPU Function Pattern
+The implementation follows Modal's recommended pattern for GPU functions:
+1. **Module-Level Function Definition**: Modal functions must be defined at module level and attached to an app instance
+2. **Lazy Initialization**: The function is set up on first use via `_setup_modal_function()`
+3. **GPU Configuration**: GPU type is set at function definition time (requires app restart to change)
+### Key Files
+- `src/services/tts_modal.py` - Modal GPU executor for Kokoro TTS
+- `src/services/audio_processing.py` - Unified audio service wrapper
+- `src/utils/config.py` - Configuration settings
+- `src/app.py` - UI integration with settings accordion
+### Configuration Options
+All TTS configuration is available in `src/utils/config.py`:
+```python
+tts_model: str = "hexgrad/Kokoro-82M"  # Model ID
+tts_voice: str = "af_heart"  # Voice ID
+tts_speed: float = 1.0  # Speed multiplier (0.5-2.0)
+tts_gpu: str = "T4"  # GPU type (T4, A10, A100, etc.)
+tts_timeout: int = 60  # Timeout in seconds
+enable_audio_output: bool = True  # Enable/disable TTS
+```
+### UI Configuration
+TTS settings are available in the Settings accordion:
+- **Voice Dropdown**: Select from 20+ Kokoro voices (af_heart, af_bella, am_michael, etc.)
+- **Speed Slider**: Adjust speech speed (0.5x to 2.0x)
+- **GPU Dropdown**: Select GPU type (T4, A10, A100, L4, L40S) - visible only if Modal credentials configured
+- **Enable Audio Output**: Toggle TTS generation
+### Modal Function Implementation
+The Modal GPU function is defined as:
+```python
+@app.function(
+    image=tts_image,  # Image with Kokoro dependencies
+    gpu="T4",  # GPU type (from settings.tts_gpu)
+    timeout=60,  # Timeout (from settings.tts_timeout)
+)
+def kokoro_tts_function(text: str, voice: str, speed: float) -> tuple[int, np.ndarray]:
+    """Modal GPU function for Kokoro TTS."""
+    from kokoro import KModel, KPipeline
+    import torch
+    model = KModel().to("cuda").eval()
+    pipeline = KPipeline(lang_code=voice[0])
+    pack = pipeline.load_voice(voice)
+    for _, ps, _ in pipeline(text, voice, speed):
+        ref_s = pack[len(ps) - 1]
+        audio = model(ps, ref_s, speed)
+        return (24000, audio.numpy())
+```
+### Usage Flow
+1. User submits query with audio output enabled
+2. Research agent processes query and generates text response
+3. `AudioService.generate_audio_output()` is called with:
+   - Response text
+   - Voice (from UI dropdown or settings default)
+   - Speed (from UI slider or settings default)
+4. `TTSService.synthesize_async()` calls Modal GPU function
+5. Modal executes Kokoro TTS on GPU
+6. Audio tuple `(sample_rate, audio_array)` is returned
+7. Audio is displayed in Gradio Audio component
+### Dependencies
+Installed via `uv add --optional`:
+- `gradio-client>=1.0.0` - For STT/OCR API calls
+- `soundfile>=0.12.0` - For audio file I/O
+- `Pillow>=10.0.0` - For image processing
+Kokoro is installed in Modal image from source:
+- `git+https://github.com/hexgrad/kokoro.git`
+### GPU Types
+Modal supports various GPU types:
+- **T4**: Cheapest, good for testing (default)
+- **A10**: Good balance of cost/performance
+- **A100**: Fastest, most expensive
+- **L4**: NVIDIA L4 GPU
+- **L40S**: NVIDIA L40S GPU
+**Note**: GPU type is set at function definition time. Changes to `settings.tts_gpu` require app restart.
+### Error Handling
+- If Modal credentials not configured: TTS service unavailable (graceful degradation)
+- If Kokoro import fails: ConfigurationError raised
+- If synthesis fails: Returns None, logs warning, continues without audio
+- If GPU unavailable: Modal will queue or fail with clear error message
+### Configuration Connection
+1. **Settings → Implementation**: `settings.tts_voice`, `settings.tts_speed` used as defaults
+2. **UI → Implementation**: UI dropdowns/sliders passed to `research_agent()` function
+3. **Implementation → Modal**: Voice and speed passed to Modal GPU function
+4. **GPU Configuration**: Set at function definition time (requires restart to change)
+### Testing
+To test TTS:
+1. Ensure Modal credentials configured (`MODAL_TOKEN_ID`, `MODAL_TOKEN_SECRET`)
+2. Enable audio output in settings
+3. Submit a query
+4. Check audio output component for generated speech
+### References
+- [Kokoro TTS Space](https://huggingface.co/spaces/hexgrad/Kokoro-TTS) - Reference implementation
+- [Modal GPU Documentation](https://modal.com/docs/guide/gpu) - Modal GPU usage
+- [Kokoro GitHub](https://github.com/hexgrad/kokoro) - Source code

docs/license.md CHANGED Viewed

	@@ -37,3 +37,5 @@ SOFTWARE.
37
38
39


37
38
39
40	+
41	+

docs/overview/architecture.md CHANGED Viewed

	@@ -194,3 +194,5 @@ The system supports complex research workflows through:
194
195
196


194
195
196
197	+
198	+

docs/overview/features.md CHANGED Viewed

	@@ -146,3 +146,5 @@ DeepCritical provides a comprehensive set of features for AI-assisted research:
146
147
148


146
147
148
149	+
150	+

docs/team.md CHANGED Viewed

	@@ -42,3 +42,5 @@ We welcome contributions! See the [Contributing Guide](contributing/index.md) fo
42
43
44


42
43
44
45	+
46	+

new_env.txt ADDED Viewed

	@@ -0,0 +1,96 @@

+# ============================================
+# DeepCritical - New Environment Variables
+# ============================================
+# Add these to your .env file for multimodal audio/image support
+# ============================================
+# ============================================
+# Audio Processing Configuration (TTS)
+# ============================================
+# Kokoro TTS Model Configuration
+TTS_MODEL=hexgrad/Kokoro-82M
+TTS_VOICE=af_heart
+TTS_SPEED=1.0
+TTS_GPU=T4
+TTS_TIMEOUT=60
+# Available TTS Voices:
+# American English Female: af_heart, af_bella, af_nicole, af_aoede, af_kore, af_sarah, af_nova, af_sky, af_alloy, af_jessica, af_river
+# American English Male: am_michael, am_fenrir, am_puck, am_echo, am_eric, am_liam, am_onyx, am_santa, am_adam
+# Available GPU Types (Modal):
+# T4 - Cheapest, good for testing (default)
+# A10 - Good balance of cost/performance
+# A100 - Fastest, most expensive
+# L4 - NVIDIA L4 GPU
+# L40S - NVIDIA L40S GPU
+# Note: GPU type is set at function definition time. Changes require app restart.
+# ============================================
+# Audio Processing Configuration (STT)
+# ============================================
+# Speech-to-Text API Configuration
+STT_API_URL=nvidia/canary-1b-v2
+STT_SOURCE_LANG=English
+STT_TARGET_LANG=English
+# Available STT Languages:
+# English, Bulgarian, Croatian, Czech, Danish, Dutch, Estonian, Finnish, French, German, Greek, Hungarian, Italian, Latvian, Lithuanian, Maltese, Polish, Portuguese, Romanian, Slovak, Slovenian, Spanish, Swedish, Russian, Ukrainian
+# ============================================
+# Audio Feature Flags
+# ============================================
+ENABLE_AUDIO_INPUT=true
+ENABLE_AUDIO_OUTPUT=true
+# ============================================
+# Image OCR Configuration
+# ============================================
+OCR_API_URL=prithivMLmods/Multimodal-OCR3
+ENABLE_IMAGE_INPUT=true
+# ============================================
+# Modal Configuration (Required for TTS)
+# ============================================
+# Modal credentials are required for TTS (Text-to-Speech) functionality
+# Get your credentials from: https://modal.com/
+MODAL_TOKEN_ID=your_modal_token_id_here
+MODAL_TOKEN_SECRET=your_modal_token_secret_here
+# ============================================
+# Existing Environment Variables (for reference)
+# ============================================
+# These are already documented elsewhere, but included for completeness:
+# LLM API Keys (for research agent)
+# OPENAI_API_KEY=your_openai_key
+# ANTHROPIC_API_KEY=your_anthropic_key
+# HF_TOKEN=your_huggingface_token
+# HUGGINGFACE_API_KEY=your_huggingface_key
+# Embedding Configuration
+# OPENAI_EMBEDDING_MODEL=text-embedding-3-small
+# LOCAL_EMBEDDING_MODEL=all-MiniLM-L6-v2
+# EMBEDDING_PROVIDER=local
+# Search Configuration
+# WEB_SEARCH_PROVIDER=duckduckgo
+# SERPER_API_KEY=your_serper_key
+# BRAVE_API_KEY=your_brave_key
+# TAVILY_API_KEY=your_tavily_key
+# PubMed Configuration
+# NCBI_API_KEY=your_ncbi_key
+# ============================================
+# Usage Instructions
+# ============================================
+# 1. Copy the variables you need to your .env file
+# 2. Replace placeholder values (your_modal_token_id_here, etc.) with actual credentials
+# 3. For TTS to work, you MUST configure MODAL_TOKEN_ID and MODAL_TOKEN_SECRET
+# 4. STT and OCR work without additional API keys (use public Gradio Spaces)
+# 5. GPU type changes require app restart to take effect
+# 6. Voice and speed can be changed at runtime via UI Settings accordion

pyproject.toml CHANGED Viewed

@@ -5,21 +5,16 @@ description = "AI-Native Drug Repurposing Research Agent"
 readme = "README.md"
 requires-python = ">=3.11"
 dependencies = [
-    # Core
     "pydantic>=2.7",
     "pydantic-settings>=2.2", # For BaseSettings (config)
     "pydantic-ai>=0.0.16", # Agent framework
-    # AI Providers
     "openai>=1.0.0",
     "anthropic>=0.18.0",
-    # HTTP & Parsing
     "httpx>=0.27", # Async HTTP client (PubMed)
     "beautifulsoup4>=4.12", # HTML parsing
     "xmltodict>=0.13", # PubMed XML -> dict
     "huggingface-hub>=0.20.0", # Hugging Face Inference API
-    # UI
     "gradio[mcp]>=6.0.0", # Chat interface with MCP server support (6.0 required for css in launch())
-    # Utils
     "python-dotenv>=1.0", # .env loading
     "tenacity>=8.2", # Retry logic
     "structlog>=24.1", # Structured logging
@@ -31,6 +26,15 @@ dependencies = [
     "llama-index-llms-huggingface-api>=0.6.1",
     "llama-index-vector-stores-chroma>=0.5.3",
     "llama-index>=0.14.8",
 ]
 [project.optional-dependencies]
@@ -66,7 +70,7 @@ embeddings = [
 ]
 modal = [
     # Mario's Modal code execution + LlamaIndex RAG
-    "modal>=0.63.0",
     "llama-index>=0.11.0",
     "llama-index-llms-openai",
     "llama-index-embeddings-openai",

 readme = "README.md"
 requires-python = ">=3.11"
 dependencies = [
     "pydantic>=2.7",
     "pydantic-settings>=2.2", # For BaseSettings (config)
     "pydantic-ai>=0.0.16", # Agent framework
     "openai>=1.0.0",
     "anthropic>=0.18.0",
     "httpx>=0.27", # Async HTTP client (PubMed)
     "beautifulsoup4>=4.12", # HTML parsing
     "xmltodict>=0.13", # PubMed XML -> dict
     "huggingface-hub>=0.20.0", # Hugging Face Inference API
     "gradio[mcp]>=6.0.0", # Chat interface with MCP server support (6.0 required for css in launch())
     "python-dotenv>=1.0", # .env loading
     "tenacity>=8.2", # Retry logic
     "structlog>=24.1", # Structured logging
     "llama-index-llms-huggingface-api>=0.6.1",
     "llama-index-vector-stores-chroma>=0.5.3",
     "llama-index>=0.14.8",
+    # Audio/Image processing
+    "gradio-client>=1.0.0", # For STT/OCR API calls
+    "soundfile>=0.12.0", # For audio file I/O
+    "pillow>=10.0.0", # For image processing
+    # TTS dependencies (for Modal GPU TTS)
+    "torch>=2.0.0", # Required by Kokoro TTS
+    "transformers>=4.30.0", # Required by Kokoro TTS
+    "modal>=0.63.0", # Required for TTS GPU execution
+    # Note: Kokoro is installed in Modal image from: git+https://github.com/hexgrad/kokoro.git
 ]
 [project.optional-dependencies]
 ]
 modal = [
     # Mario's Modal code execution + LlamaIndex RAG
+    # Note: modal>=0.63.0 is now in main dependencies for TTS support
     "llama-index>=0.11.0",
     "llama-index-llms-openai",
     "llama-index-embeddings-openai",

src/agent_factory/judges.py CHANGED Viewed

@@ -26,22 +26,28 @@ from src.utils.models import AssessmentDetails, Evidence, JudgeAssessment
 logger = structlog.get_logger()
-def get_model() -> Any:
     """Get the LLM model based on configuration.
     Explicitly passes API keys from settings to avoid requiring
     users to export environment variables manually.
     """
     llm_provider = settings.llm_provider
     if llm_provider == "anthropic":
         provider = AnthropicProvider(api_key=settings.anthropic_api_key)
         return AnthropicModel(settings.anthropic_model, provider=provider)
     if llm_provider == "huggingface":
-        # Free tier - uses HF_TOKEN from environment if available
         model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
-        hf_provider = HuggingFaceProvider(api_key=settings.hf_token)
         return HuggingFaceModel(model_name, provider=hf_provider)
     if llm_provider == "openai":
@@ -53,7 +59,7 @@ def get_model() -> Any:
         logger.warning("Unknown LLM provider, defaulting to HuggingFace", provider=llm_provider)
     model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
-    hf_provider = HuggingFaceProvider(api_key=settings.hf_token)
     return HuggingFaceModel(model_name, provider=hf_provider)
@@ -176,16 +182,19 @@ class HFInferenceJudgeHandler:
         "HuggingFaceH4/zephyr-7b-beta",  # Fallback (Ungated)
     ]
-    def __init__(self, model_id: str | None = None) -> None:
         """
         Initialize with HF Inference client.
         Args:
             model_id: Optional specific model ID. If None, uses FALLBACK_MODELS chain.
         """
         self.model_id = model_id
-        # Will automatically use HF_TOKEN from env if available
-        self.client = InferenceClient()
         self.call_count = 0
         self.last_question: str | None = None
         self.last_evidence: list[Evidence] | None = None

 logger = structlog.get_logger()
+def get_model(oauth_token: str | None = None) -> Any:
     """Get the LLM model based on configuration.
     Explicitly passes API keys from settings to avoid requiring
     users to export environment variables manually.
+    Args:
+        oauth_token: Optional OAuth token from HuggingFace login (takes priority over env vars)
     """
     llm_provider = settings.llm_provider
+    # Priority: oauth_token > env vars
+    effective_hf_token = oauth_token or settings.hf_token or settings.huggingface_api_key
     if llm_provider == "anthropic":
         provider = AnthropicProvider(api_key=settings.anthropic_api_key)
         return AnthropicModel(settings.anthropic_model, provider=provider)
     if llm_provider == "huggingface":
+        # Free tier - uses OAuth token or HF_TOKEN from environment if available
         model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
+        hf_provider = HuggingFaceProvider(api_key=effective_hf_token)
         return HuggingFaceModel(model_name, provider=hf_provider)
     if llm_provider == "openai":
         logger.warning("Unknown LLM provider, defaulting to HuggingFace", provider=llm_provider)
     model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
+    hf_provider = HuggingFaceProvider(api_key=effective_hf_token)
     return HuggingFaceModel(model_name, provider=hf_provider)
         "HuggingFaceH4/zephyr-7b-beta",  # Fallback (Ungated)
     ]
+    def __init__(
+        self, model_id: str | None = None, api_key: str | None = None
+    ) -> None:
         """
         Initialize with HF Inference client.
         Args:
             model_id: Optional specific model ID. If None, uses FALLBACK_MODELS chain.
+            api_key: Optional HuggingFace API key/token. If None, uses HF_TOKEN from env.
         """
         self.model_id = model_id
+        # Pass api_key to InferenceClient if provided, otherwise it will use HF_TOKEN from env
+        self.client = InferenceClient(api_key=api_key) if api_key else InferenceClient()
         self.call_count = 0
         self.last_question: str | None = None
         self.last_evidence: list[Evidence] | None = None

src/app.py CHANGED Viewed

@@ -5,6 +5,8 @@ from collections.abc import AsyncGenerator
 from typing import Any
 import gradio as gr
 # Try to import HuggingFace support (may not be available in all pydantic-ai versions)
 # According to https://ai.pydantic.dev/models/huggingface/, HuggingFace support requires
@@ -26,6 +28,8 @@ except ImportError:
 from src.agent_factory.judges import HFInferenceJudgeHandler, JudgeHandler, MockJudgeHandler
 from src.orchestrator_factory import create_orchestrator
 from src.tools.clinicaltrials import ClinicalTrialsTool
 from src.tools.europepmc import EuropePMCTool
 from src.tools.pubmed import PubMedTool
@@ -40,16 +44,20 @@ def configure_orchestrator(
     oauth_token: str | None = None,
     hf_model: str | None = None,
     hf_provider: str | None = None,
 ) -> tuple[Any, str]:
     """
     Create an orchestrator instance.
     Args:
         use_mock: If True, use MockJudgeHandler (no API key needed)
-        mode: Orchestrator mode ("simple" or "advanced")
         oauth_token: Optional OAuth token from HuggingFace login
         hf_model: Selected HuggingFace model ID
         hf_provider: Selected inference provider
     Returns:
         Tuple of (Orchestrator instance, backend_name)
@@ -60,10 +68,14 @@ def configure_orchestrator(
         max_results_per_tool=10,
     )
-    # Create search tools
     search_handler = SearchHandler(
         tools=[PubMedTool(), ClinicalTrialsTool(), EuropePMCTool()],
         timeout=config.search_timeout,
     )
     # Create judge (mock, real, or free tier)
@@ -109,22 +121,30 @@ def configure_orchestrator(
     # 3. Free Tier (HuggingFace Inference) - NO API KEY AVAILABLE
     else:
         # No API key available - use HFInferenceJudgeHandler with public models
-        # Don't use third-party providers (novita, groq, etc.) as they require their own API keys
-        # Use HuggingFace's own inference API with public/ungated models
-        # Pass empty provider to use HuggingFace's default (not third-party providers)
         judge_handler = HFInferenceJudgeHandler(
-            model_id=hf_model,
-            api_key=None,  # No API key - will use public models only
-            provider=None,  # Don't specify provider - use HuggingFace's default
         )
         model_display = hf_model.split("/")[-1] if hf_model else "Default (Public Models)"
         backend_info = f"Free Tier ({model_display} - Public Models Only)"
     orchestrator = create_orchestrator(
         search_handler=search_handler,
         judge_handler=judge_handler,
         config=config,
-        mode=mode,  # type: ignore
     )
     return orchestrator, backend_info
@@ -405,19 +425,23 @@ async def handle_orchestrator_events(
 async def research_agent(
-    message: str,
     history: list[dict[str, Any]],
     mode: str = "simple",
     hf_model: str | None = None,
     hf_provider: str | None = None,
     oauth_token: gr.OAuthToken | None = None,
     oauth_profile: gr.OAuthProfile | None = None,
-) -> AsyncGenerator[dict[str, Any] | list[dict[str, Any]], None]:
     """
     Gradio chat function that runs the research agent.
     Args:
-        message: User's research question
         history: Chat history (Gradio format)
         mode: Orchestrator mode ("simple" or "advanced")
         hf_model: Selected HuggingFace model ID (from dropdown)
@@ -426,8 +450,12 @@ async def research_agent(
         oauth_profile: Gradio OAuth profile (None if user not logged in)
     Yields:
-        ChatMessage objects with metadata for accordion display
     """
     # REQUIRE LOGIN BEFORE USE
     # Extract OAuth token and username using Gradio's OAuth types
     # According to Gradio docs: OAuthToken and OAuthProfile are None if user not logged in
@@ -465,14 +493,37 @@ async def research_agent(
                 "before using this application.\n\n"
                 "The login button is required to access the AI models and research tools."
             ),
-        }
         return
-    if not message.strip():
         yield {
             "role": "assistant",
-            "content": "Please enter a research question.",
-        }
         return
     # Check available keys (use token_value instead of oauth_token)
@@ -501,6 +552,8 @@ async def research_agent(
             oauth_token=token_value,  # Use extracted token value
             hf_model=model_id,  # None will use defaults in configure_orchestrator
             hf_provider=provider_name,  # None will use defaults in configure_orchestrator
         )
         yield {
@@ -508,9 +561,41 @@ async def research_agent(
             "content": f"🧠 **Backend**: {backend_name}\n\n",
         }
-        # Handle orchestrator events
-        async for msg in handle_orchestrator_events(orchestrator, message):
-            yield msg
     except Exception as e:
         # Return error message without metadata to avoid issues during example caching
@@ -521,7 +606,7 @@ async def research_agent(
         yield {
             "role": "assistant",
             "content": f"Error: {error_msg}. Please check your configuration and try again.",
-        }
 def create_demo() -> gr.Blocks:
@@ -566,6 +651,72 @@ def create_demo() -> gr.Blocks:
             ),
         )
         # Hidden text components for model/provider (not dropdowns to avoid value mismatch)
         # These will be empty by default and use defaults in configure_orchestrator
         with gr.Row(visible=False):
@@ -581,11 +732,18 @@ def create_demo() -> gr.Blocks:
                 visible=False,  # Hidden from UI
             )
-        # Chat interface with model/provider selection
         # Examples are provided but will NOT run at startup (cache_examples=False)
         # Users must log in first before using examples or submitting queries
         gr.ChatInterface(
             fn=research_agent,
             title="🧬 DeepCritical",
             description=(
                 "*AI-Powered Drug Repurposing Agent — searches PubMed, "
@@ -593,6 +751,7 @@ def create_demo() -> gr.Blocks:
                 "---\n"
                 "*Research tool only — not for medical advice.*  \n"
                 "**MCP Server Active**: Connect Claude Desktop to `/gradio_api/mcp/`\n\n"
                 "**⚠️ Authentication Required**: Please **sign in with HuggingFace** above before using this application."
             ),
             examples=[
@@ -606,18 +765,24 @@ def create_demo() -> gr.Blocks:
                     "simple",
                     "Qwen/Qwen3-Next-80B-A3B-Thinking",
                     "",
                 ],
                 [
                     "Is metformin effective for treating cancer? Investigate mechanism of action.",
                     "iterative",
                     "Qwen/Qwen3-235B-A22B-Instruct-2507",
                     "",
                 ],
                 [
                     "Create a comprehensive report on Long COVID treatments including clinical trials, mechanisms, and safety.",
                     "deep",
                     "zai-org/GLM-4.5-Air",
                     "nebius",
                 ],
             ],
             cache_examples=False,  # CRITICAL: Disable example caching to prevent examples from running at startup
@@ -627,9 +792,14 @@ def create_demo() -> gr.Blocks:
                 mode_radio,
                 hf_model_dropdown,
                 hf_provider_dropdown,
                 # Note: gr.OAuthToken and gr.OAuthProfile are automatically passed as function parameters
                 # when user is logged in - they should NOT be added to additional_inputs
             ],
         )
     return demo  # type: ignore[no-any-return]
@@ -642,7 +812,7 @@ def main() -> None:
         # server_name="0.0.0.0",
         # server_port=7860,
         # share=False,
-        mcp_server=False,
         ssr_mode=False,  # Fix for intermittent loading/hydration issues in HF Spaces
     )

 from typing import Any
 import gradio as gr
+import numpy as np
+from gradio.components.multimodal_textbox import MultimodalPostprocess
 # Try to import HuggingFace support (may not be available in all pydantic-ai versions)
 # According to https://ai.pydantic.dev/models/huggingface/, HuggingFace support requires
 from src.agent_factory.judges import HFInferenceJudgeHandler, JudgeHandler, MockJudgeHandler
 from src.orchestrator_factory import create_orchestrator
+from src.services.audio_processing import get_audio_service
+from src.services.multimodal_processing import get_multimodal_service
 from src.tools.clinicaltrials import ClinicalTrialsTool
 from src.tools.europepmc import EuropePMCTool
 from src.tools.pubmed import PubMedTool
     oauth_token: str | None = None,
     hf_model: str | None = None,
     hf_provider: str | None = None,
+    graph_mode: str | None = None,
+    use_graph: bool = True,
 ) -> tuple[Any, str]:
     """
     Create an orchestrator instance.
     Args:
         use_mock: If True, use MockJudgeHandler (no API key needed)
+        mode: Orchestrator mode ("simple", "advanced", "iterative", "deep", or "auto")
         oauth_token: Optional OAuth token from HuggingFace login
         hf_model: Selected HuggingFace model ID
         hf_provider: Selected inference provider
+        graph_mode: Graph research mode ("iterative", "deep", or "auto") - used when mode is graph-based
+        use_graph: Whether to use graph execution (True) or agent chains (False)
     Returns:
         Tuple of (Orchestrator instance, backend_name)
         max_results_per_tool=10,
     )
+    # Create search tools with RAG enabled
+    # Pass OAuth token to SearchHandler so it can be used by RAG service
     search_handler = SearchHandler(
         tools=[PubMedTool(), ClinicalTrialsTool(), EuropePMCTool()],
         timeout=config.search_timeout,
+        include_rag=True,
+        auto_ingest_to_rag=True,
+        oauth_token=oauth_token,
     )
     # Create judge (mock, real, or free tier)
     # 3. Free Tier (HuggingFace Inference) - NO API KEY AVAILABLE
     else:
         # No API key available - use HFInferenceJudgeHandler with public models
+        # HFInferenceJudgeHandler will use HF_TOKEN from env if available, otherwise public models
+        # Note: OAuth token should have been caught in effective_api_key check above
+        # If we reach here, we truly have no API key, so use public models
         judge_handler = HFInferenceJudgeHandler(
+            model_id=hf_model if hf_model else None,
+            api_key=None,  # Will use HF_TOKEN from env if available, otherwise public models
         )
         model_display = hf_model.split("/")[-1] if hf_model else "Default (Public Models)"
         backend_info = f"Free Tier ({model_display} - Public Models Only)"
+    # Determine effective mode
+    # If mode is already iterative/deep/auto, use it directly
+    # If mode is "graph" or "simple", use graph_mode if provided
+    effective_mode = mode
+    if mode in ("graph", "simple") and graph_mode:
+        effective_mode = graph_mode
+    elif mode == "graph" and not graph_mode:
+        effective_mode = "auto"  # Default to auto if graph mode but no graph_mode specified
     orchestrator = create_orchestrator(
         search_handler=search_handler,
         judge_handler=judge_handler,
         config=config,
+        mode=effective_mode,  # type: ignore
     )
     return orchestrator, backend_info
 async def research_agent(
+    message: str | MultimodalPostprocess,
     history: list[dict[str, Any]],
     mode: str = "simple",
     hf_model: str | None = None,
     hf_provider: str | None = None,
+    graph_mode: str = "auto",
+    use_graph: bool = True,
+    tts_voice: str = "af_heart",
+    tts_speed: float = 1.0,
     oauth_token: gr.OAuthToken | None = None,
     oauth_profile: gr.OAuthProfile | None = None,
+) -> AsyncGenerator[dict[str, Any] | tuple[dict[str, Any], tuple[int, np.ndarray] | None], None]:
     """
     Gradio chat function that runs the research agent.
     Args:
+        message: User's research question (str or MultimodalPostprocess with text/files)
         history: Chat history (Gradio format)
         mode: Orchestrator mode ("simple" or "advanced")
         hf_model: Selected HuggingFace model ID (from dropdown)
         oauth_profile: Gradio OAuth profile (None if user not logged in)
     Yields:
+        ChatMessage objects with metadata for accordion display, optionally with audio output
     """
+    import structlog
+    logger = structlog.get_logger()
     # REQUIRE LOGIN BEFORE USE
     # Extract OAuth token and username using Gradio's OAuth types
     # According to Gradio docs: OAuthToken and OAuthProfile are None if user not logged in
                 "before using this application.\n\n"
                 "The login button is required to access the AI models and research tools."
             ),
+        }, None
         return
+    # Process multimodal input (text + images + audio)
+    processed_text = ""
+    audio_input_data: tuple[int, np.ndarray] | None = None
+    if isinstance(message, dict):
+        # MultimodalPostprocess format: {"text": str, "files": list[FileData]}
+        processed_text = message.get("text", "") or ""
+        files = message.get("files", [])
+        # Process multimodal input (images, audio files)
+        if files and settings.enable_image_input:
+            try:
+                multimodal_service = get_multimodal_service()
+                processed_text = await multimodal_service.process_multimodal_input(
+                    processed_text, files=files, hf_token=token_value
+                )
+            except Exception as e:
+                logger.warning("multimodal_processing_failed", error=str(e))
+                # Continue with text-only input
+    else:
+        # Plain string message
+        processed_text = str(message) if message else ""
+    if not processed_text.strip():
         yield {
             "role": "assistant",
+            "content": "Please enter a research question or provide an image/audio input.",
+        }, None
         return
     # Check available keys (use token_value instead of oauth_token)
             oauth_token=token_value,  # Use extracted token value
             hf_model=model_id,  # None will use defaults in configure_orchestrator
             hf_provider=provider_name,  # None will use defaults in configure_orchestrator
+            graph_mode=graph_mode if graph_mode else None,
+            use_graph=use_graph,
         )
         yield {
             "content": f"🧠 **Backend**: {backend_name}\n\n",
         }
+        # Handle orchestrator events and generate audio output
+        audio_output_data: tuple[int, np.ndarray] | None = None
+        final_message = ""
+        async for msg in handle_orchestrator_events(orchestrator, processed_text):
+            # Track final message for TTS
+            if isinstance(msg, dict) and msg.get("role") == "assistant":
+                content = msg.get("content", "")
+                metadata = msg.get("metadata", {})
+                # This is the main response (not an accordion) if no title in metadata
+                if content and not metadata.get("title"):
+                    final_message = content
+            # Yield without audio for intermediate messages
+            yield msg, None
+        # Generate audio output for final response
+        if final_message and settings.enable_audio_output:
+            try:
+                audio_service = get_audio_service()
+                # Use UI-configured voice and speed, fallback to settings defaults
+                audio_output_data = await audio_service.generate_audio_output(
+                    final_message,
+                    voice=tts_voice or settings.tts_voice,
+                    speed=tts_speed if tts_speed else settings.tts_speed,
+                )
+            except Exception as e:
+                logger.warning("audio_synthesis_failed", error=str(e))
+                # Continue without audio output
+        # If we have audio output, we need to yield it with the final message
+        # Note: The final message was already yielded above, so we yield None, audio_output_data
+        # This will update the audio output component
+        if audio_output_data is not None:
+            yield None, audio_output_data
     except Exception as e:
         # Return error message without metadata to avoid issues during example caching
         yield {
             "role": "assistant",
             "content": f"Error: {error_msg}. Please check your configuration and try again.",
+        }, None
 def create_demo() -> gr.Blocks:
             ),
         )
+        # Graph mode selection
+        graph_mode_radio = gr.Radio(
+            choices=["iterative", "deep", "auto"],
+            value="auto",
+            label="Graph Research Mode",
+            info="Iterative: Single loop | Deep: Parallel sections | Auto: Detect from query",
+        )
+        # Graph execution toggle
+        use_graph_checkbox = gr.Checkbox(
+            value=True,
+            label="Use Graph Execution",
+            info="Enable graph-based workflow execution",
+        )
+        # TTS Configuration (in Settings accordion)
+        with gr.Accordion("🎤 Audio Settings", open=False, visible=settings.enable_audio_output):
+            tts_voice_dropdown = gr.Dropdown(
+                choices=[
+                    "af_heart",
+                    "af_bella",
+                    "af_nicole",
+                    "af_aoede",
+                    "af_kore",
+                    "af_sarah",
+                    "af_nova",
+                    "af_sky",
+                    "af_alloy",
+                    "af_jessica",
+                    "af_river",
+                    "am_michael",
+                    "am_fenrir",
+                    "am_puck",
+                    "am_echo",
+                    "am_eric",
+                    "am_liam",
+                    "am_onyx",
+                    "am_santa",
+                    "am_adam",
+                ],
+                value=settings.tts_voice,
+                label="Voice",
+                info="Select TTS voice (American English voices: af_*, am_*)",
+            )
+            tts_speed_slider = gr.Slider(
+                minimum=0.5,
+                maximum=2.0,
+                value=settings.tts_speed,
+                step=0.1,
+                label="Speech Speed",
+                info="Adjust TTS speech speed (0.5x to 2.0x)",
+            )
+            tts_gpu_dropdown = gr.Dropdown(
+                choices=["T4", "A10", "A100", "L4", "L40S"],
+                value=settings.tts_gpu or "T4",
+                label="GPU Type",
+                info="Modal GPU type for TTS (T4 is cheapest, A100 is fastest). Note: GPU changes require app restart.",
+                visible=settings.modal_available,
+                interactive=False,  # GPU type set at function definition time, requires restart
+            )
+            enable_audio_output_checkbox = gr.Checkbox(
+                value=settings.enable_audio_output,
+                label="Enable Audio Output",
+                info="Generate audio responses using TTS",
+            )
         # Hidden text components for model/provider (not dropdowns to avoid value mismatch)
         # These will be empty by default and use defaults in configure_orchestrator
         with gr.Row(visible=False):
                 visible=False,  # Hidden from UI
             )
+        # Audio output component (for TTS response)
+        audio_output = gr.Audio(
+            label="🔊 Audio Response",
+            visible=settings.enable_audio_output,
+        )
+        # Chat interface with multimodal support
         # Examples are provided but will NOT run at startup (cache_examples=False)
         # Users must log in first before using examples or submitting queries
         gr.ChatInterface(
             fn=research_agent,
+            multimodal=True,  # Enable multimodal input (text + images + audio)
             title="🧬 DeepCritical",
             description=(
                 "*AI-Powered Drug Repurposing Agent — searches PubMed, "
                 "---\n"
                 "*Research tool only — not for medical advice.*  \n"
                 "**MCP Server Active**: Connect Claude Desktop to `/gradio_api/mcp/`\n\n"
+                "**🎤 Multimodal Support**: Upload images (OCR), record audio (STT), or type text.\n\n"
                 "**⚠️ Authentication Required**: Please **sign in with HuggingFace** above before using this application."
             ),
             examples=[
                     "simple",
                     "Qwen/Qwen3-Next-80B-A3B-Thinking",
                     "",
+                    "auto",
+                    True,
                 ],
                 [
                     "Is metformin effective for treating cancer? Investigate mechanism of action.",
                     "iterative",
                     "Qwen/Qwen3-235B-A22B-Instruct-2507",
                     "",
+                    "iterative",
+                    True,
                 ],
                 [
                     "Create a comprehensive report on Long COVID treatments including clinical trials, mechanisms, and safety.",
                     "deep",
                     "zai-org/GLM-4.5-Air",
                     "nebius",
+                    "deep",
+                    True,
                 ],
             ],
             cache_examples=False,  # CRITICAL: Disable example caching to prevent examples from running at startup
                 mode_radio,
                 hf_model_dropdown,
                 hf_provider_dropdown,
+                graph_mode_radio,
+                use_graph_checkbox,
+                tts_voice_dropdown,
+                tts_speed_slider,
                 # Note: gr.OAuthToken and gr.OAuthProfile are automatically passed as function parameters
                 # when user is logged in - they should NOT be added to additional_inputs
             ],
+            additional_outputs=[audio_output],  # Add audio output for TTS
         )
     return demo  # type: ignore[no-any-return]
         # server_name="0.0.0.0",
         # server_port=7860,
         # share=False,
+        mcp_server=True,  # Enable MCP server for Claude Desktop integration
         ssr_mode=False,  # Fix for intermittent loading/hydration issues in HF Spaces
     )

src/mcp_tools.py CHANGED Viewed

@@ -223,3 +223,81 @@ async def analyze_hypothesis(
 **Executed in Modal Sandbox** - Isolated, secure, reproducible.
 """

 **Executed in Modal Sandbox** - Isolated, secure, reproducible.
 """
+async def extract_text_from_image(
+    image_path: str, model: str | None = None, hf_token: str | None = None
+) -> str:
+    """Extract text from an image using OCR.
+    Uses the Multimodal-OCR3 Gradio Space to extract text from images.
+    Supports various image formats (PNG, JPG, etc.) and can extract text
+    from scanned documents, screenshots, and other image types.
+    Args:
+        image_path: Path to image file
+        model: Optional model selection (default: None, uses API default)
+    Returns:
+        Extracted text from the image
+    """
+    from src.services.image_ocr import get_image_ocr_service
+    from src.utils.config import settings
+    try:
+        ocr_service = get_image_ocr_service()
+        # Use provided token or fallback to env vars
+        token = hf_token or settings.hf_token or settings.huggingface_api_key
+        extracted_text = await ocr_service.extract_text(image_path, model=model, hf_token=token)
+        if not extracted_text:
+            return f"No text found in image: {image_path}"
+        return f"## Extracted Text from Image\n\n{extracted_text}"
+    except Exception as e:
+        return f"Error extracting text from image: {e}"
+async def transcribe_audio_file(
+    audio_path: str,
+    source_lang: str | None = None,
+    target_lang: str | None = None,
+    hf_token: str | None = None,
+) -> str:
+    """Transcribe audio file to text using speech-to-text.
+    Uses the NVIDIA Canary Gradio Space to transcribe audio files.
+    Supports various audio formats (WAV, MP3, etc.) and multiple languages.
+    Args:
+        audio_path: Path to audio file
+        source_lang: Source language (default: "English")
+        target_lang: Target language (default: "English")
+    Returns:
+        Transcribed text from the audio file
+    """
+    from src.services.stt_gradio import get_stt_service
+    from src.utils.config import settings
+    try:
+        stt_service = get_stt_service()
+        # Use provided token or fallback to env vars
+        token = hf_token or settings.hf_token or settings.huggingface_api_key
+        transcribed_text = await stt_service.transcribe_file(
+            audio_path,
+            source_lang=source_lang,
+            target_lang=target_lang,
+            hf_token=token,
+        )
+        if not transcribed_text:
+            return f"No transcription found in audio: {audio_path}"
+        return f"## Audio Transcription\n\n{transcribed_text}"
+    except Exception as e:
+        return f"Error transcribing audio: {e}"

src/middleware/state_machine.py CHANGED Viewed

	@@ -131,3 +131,5 @@ def get_workflow_state() -> WorkflowState:
131
132
133


131
132
133
134	+
135	+

src/orchestrator/graph_orchestrator.py CHANGED Viewed

@@ -28,6 +28,7 @@ from src.agent_factory.graph_builder import (
     create_deep_graph,
     create_iterative_graph,
 )
 from src.middleware.budget_tracker import BudgetTracker
 from src.middleware.state_machine import WorkflowState, init_workflow_state
 from src.orchestrator.research_flow import DeepResearchFlow, IterativeResearchFlow
@@ -121,6 +122,8 @@ class GraphOrchestrator:
         max_iterations: int = 5,
         max_time_minutes: int = 10,
         use_graph: bool = True,
     ) -> None:
         """
         Initialize graph orchestrator.
@@ -130,11 +133,15 @@ class GraphOrchestrator:
             max_iterations: Maximum iterations per loop
             max_time_minutes: Maximum time per loop
             use_graph: Whether to use graph execution (True) or agent chains (False)
         """
         self.mode = mode
         self.max_iterations = max_iterations
         self.max_time_minutes = max_time_minutes
         self.use_graph = use_graph
         self.logger = logger
         # Initialize flows (for backward compatibility)
@@ -248,6 +255,7 @@ class GraphOrchestrator:
                 self._iterative_flow = IterativeResearchFlow(
                     max_iterations=self.max_iterations,
                     max_time_minutes=self.max_time_minutes,
                 )
             try:
@@ -278,6 +286,8 @@ class GraphOrchestrator:
             )
             if self._deep_flow is None:
                 self._deep_flow = DeepResearchFlow(
                     max_iterations=self.max_iterations,
                     max_time_minutes=self.max_time_minutes,
@@ -640,6 +650,34 @@ class GraphOrchestrator:
             tokens = result.usage.total_tokens if hasattr(result.usage, "total_tokens") else 0
             context.budget_tracker.add_tokens("graph_execution", tokens)
         return output
     async def _execute_state_node(
@@ -650,6 +688,7 @@ class GraphOrchestrator:
         Special handling for deep research state nodes:
         - "store_plan": Stores ReportPlan in context for parallel loops
         - "collect_drafts": Stores section drafts in context for synthesizer
         Args:
             node: The state node
@@ -659,6 +698,58 @@ class GraphOrchestrator:
         Returns:
             State update result
         """
         # Get previous result for state update
         # For "store_plan", get from planner node
         # For "collect_drafts", get from parallel_loops node
@@ -797,8 +888,10 @@ class GraphOrchestrator:
             sections=len(report_plan.report_outline),
         )
-        # Create judge handler for iterative flows
-        judge_handler = create_judge_handler()
         # Create and execute iterative research flows for each section
         async def run_section_research(section_index: int) -> str:
@@ -812,7 +905,7 @@ class GraphOrchestrator:
                     max_time_minutes=self.max_time_minutes,
                     verbose=False,  # Less verbose in parallel execution
                     use_graph=False,  # Use agent chains for section research
-                    judge_handler=judge_handler,
                 )
                 # Run research for this section
@@ -953,6 +1046,8 @@ def create_graph_orchestrator(
     max_iterations: int = 5,
     max_time_minutes: int = 10,
     use_graph: bool = True,
 ) -> GraphOrchestrator:
     """
     Factory function to create a graph orchestrator.
@@ -962,6 +1057,8 @@ def create_graph_orchestrator(
         max_iterations: Maximum iterations per loop
         max_time_minutes: Maximum time per loop
         use_graph: Whether to use graph execution (True) or agent chains (False)
     Returns:
         Configured GraphOrchestrator instance
@@ -971,4 +1068,6 @@ def create_graph_orchestrator(
         max_iterations=max_iterations,
         max_time_minutes=max_time_minutes,
         use_graph=use_graph,
     )

     create_deep_graph,
     create_iterative_graph,
 )
+from src.legacy_orchestrator import JudgeHandlerProtocol, SearchHandlerProtocol
 from src.middleware.budget_tracker import BudgetTracker
 from src.middleware.state_machine import WorkflowState, init_workflow_state
 from src.orchestrator.research_flow import DeepResearchFlow, IterativeResearchFlow
         max_iterations: int = 5,
         max_time_minutes: int = 10,
         use_graph: bool = True,
+        search_handler: SearchHandlerProtocol | None = None,
+        judge_handler: JudgeHandlerProtocol | None = None,
     ) -> None:
         """
         Initialize graph orchestrator.
             max_iterations: Maximum iterations per loop
             max_time_minutes: Maximum time per loop
             use_graph: Whether to use graph execution (True) or agent chains (False)
+            search_handler: Optional search handler for tool execution
+            judge_handler: Optional judge handler for evidence assessment
         """
         self.mode = mode
         self.max_iterations = max_iterations
         self.max_time_minutes = max_time_minutes
         self.use_graph = use_graph
+        self.search_handler = search_handler
+        self.judge_handler = judge_handler
         self.logger = logger
         # Initialize flows (for backward compatibility)
                 self._iterative_flow = IterativeResearchFlow(
                     max_iterations=self.max_iterations,
                     max_time_minutes=self.max_time_minutes,
+                    judge_handler=self.judge_handler,
                 )
             try:
             )
             if self._deep_flow is None:
+                # DeepResearchFlow creates its own judge_handler internally
+                # The judge_handler is passed to IterativeResearchFlow in parallel loops
                 self._deep_flow = DeepResearchFlow(
                     max_iterations=self.max_iterations,
                     max_time_minutes=self.max_time_minutes,
             tokens = result.usage.total_tokens if hasattr(result.usage, "total_tokens") else 0
             context.budget_tracker.add_tokens("graph_execution", tokens)
+        # Special handling for knowledge_gap node: optionally call judge_handler
+        if node.node_id == "knowledge_gap" and self.judge_handler:
+            # Get evidence from workflow state
+            evidence = context.state.evidence
+            if evidence:
+                try:
+                    from src.utils.models import JudgeAssessment
+                    # Call judge handler to assess evidence
+                    judge_assessment: JudgeAssessment = await self.judge_handler.assess(
+                        question=query, evidence=evidence
+                    )
+                    # Store assessment in context for decision node to use
+                    context.set_node_result("judge_assessment", judge_assessment)
+                    self.logger.info(
+                        "Judge assessment completed",
+                        sufficient=judge_assessment.sufficient,
+                        confidence=judge_assessment.confidence,
+                        recommendation=judge_assessment.recommendation,
+                    )
+                except Exception as e:
+                    self.logger.warning(
+                        "Judge handler assessment failed",
+                        error=str(e),
+                        node_id=node.node_id,
+                    )
+                    # Continue without judge assessment
         return output
     async def _execute_state_node(
         Special handling for deep research state nodes:
         - "store_plan": Stores ReportPlan in context for parallel loops
         - "collect_drafts": Stores section drafts in context for synthesizer
+        - "execute_tools": Executes search using search_handler
         Args:
             node: The state node
         Returns:
             State update result
         """
+        # Special handling for execute_tools node
+        if node.node_id == "execute_tools":
+            # Get AgentSelectionPlan from tool_selector node result
+            tool_selector_result = context.get_node_result("tool_selector")
+            from src.utils.models import AgentSelectionPlan, SearchResult
+            # Extract query from context or use original query
+            search_query = query
+            if tool_selector_result and isinstance(tool_selector_result, AgentSelectionPlan):
+                # Use the gap or query from the selection plan
+                if tool_selector_result.tasks:
+                    # Use the first task's query if available
+                    first_task = tool_selector_result.tasks[0]
+                    if hasattr(first_task, "query") and first_task.query:
+                        search_query = first_task.query
+                    elif hasattr(first_task, "tool_input") and isinstance(
+                        first_task.tool_input, str
+                    ):
+                        search_query = first_task.tool_input
+            # Execute search using search_handler
+            if self.search_handler:
+                try:
+                    search_result: SearchResult = await self.search_handler.execute(
+                        query=search_query, max_results_per_tool=10
+                    )
+                    # Add evidence to workflow state (add_evidence expects a list)
+                    context.state.add_evidence(search_result.evidence)
+                    # Store evidence list in context for next nodes
+                    context.set_node_result(node.node_id, search_result.evidence)
+                    self.logger.info(
+                        "Tools executed via search_handler",
+                        query=search_query[:100],
+                        evidence_count=len(search_result.evidence),
+                    )
+                    return search_result.evidence
+                except Exception as e:
+                    self.logger.error(
+                        "Search handler execution failed",
+                        error=str(e),
+                        query=search_query[:100],
+                    )
+                    # Return empty list on error to allow graph to continue
+                    return []
+            else:
+                # Fallback: log warning and return empty list
+                self.logger.warning(
+                    "Search handler not available for execute_tools node",
+                    node_id=node.node_id,
+                )
+                return []
         # Get previous result for state update
         # For "store_plan", get from planner node
         # For "collect_drafts", get from parallel_loops node
             sections=len(report_plan.report_outline),
         )
+        # Use judge handler from GraphOrchestrator if available, otherwise create new one
+        judge_handler = self.judge_handler
+        if judge_handler is None:
+            judge_handler = create_judge_handler()
         # Create and execute iterative research flows for each section
         async def run_section_research(section_index: int) -> str:
                     max_time_minutes=self.max_time_minutes,
                     verbose=False,  # Less verbose in parallel execution
                     use_graph=False,  # Use agent chains for section research
+                    judge_handler=self.judge_handler or judge_handler,
                 )
                 # Run research for this section
     max_iterations: int = 5,
     max_time_minutes: int = 10,
     use_graph: bool = True,
+    search_handler: SearchHandlerProtocol | None = None,
+    judge_handler: JudgeHandlerProtocol | None = None,
 ) -> GraphOrchestrator:
     """
     Factory function to create a graph orchestrator.
         max_iterations: Maximum iterations per loop
         max_time_minutes: Maximum time per loop
         use_graph: Whether to use graph execution (True) or agent chains (False)
+        search_handler: Optional search handler for tool execution
+        judge_handler: Optional judge handler for evidence assessment
     Returns:
         Configured GraphOrchestrator instance
         max_iterations=max_iterations,
         max_time_minutes=max_time_minutes,
         use_graph=use_graph,
+        search_handler=search_handler,
+        judge_handler=judge_handler,
     )

src/orchestrator_factory.py CHANGED Viewed

@@ -81,6 +81,8 @@ def create_orchestrator(
             max_iterations=config.max_iterations if config else 5,
             max_time_minutes=10,
             use_graph=True,
         )
     # Simple mode requires handlers

             max_iterations=config.max_iterations if config else 5,
             max_time_minutes=10,
             use_graph=True,
+            search_handler=search_handler,
+            judge_handler=judge_handler,
         )
     # Simple mode requires handlers

src/services/audio_processing.py ADDED Viewed

	@@ -0,0 +1,134 @@

+"""Unified audio processing service for STT and TTS integration."""
+from functools import lru_cache
+from typing import Any
+import numpy as np
+import structlog
+from src.services.stt_gradio import STTService, get_stt_service
+from src.utils.config import settings
+from src.utils.exceptions import ConfigurationError
+logger = structlog.get_logger(__name__)
+# Type stub for TTS service (will be imported when available)
+try:
+    from src.services.tts_modal import TTSService, get_tts_service
+    _TTS_AVAILABLE = True
+except ImportError:
+    _TTS_AVAILABLE = False
+    TTSService = None  # type: ignore[assignment, misc]
+    get_tts_service = None  # type: ignore[assignment, misc]
+class AudioService:
+    """Unified audio processing service."""
+    def __init__(
+        self,
+        stt_service: STTService | None = None,
+        tts_service: Any | None = None,
+    ) -> None:
+        """Initialize audio service with STT and TTS.
+        Args:
+            stt_service: STT service instance (default: get_stt_service())
+            tts_service: TTS service instance (default: get_tts_service() if available)
+        """
+        self.stt = stt_service or get_stt_service()
+        # TTS is optional (requires Modal)
+        if tts_service is not None:
+            self.tts = tts_service
+        elif _TTS_AVAILABLE and settings.modal_available:
+            try:
+                self.tts = get_tts_service()  # type: ignore[misc]
+            except Exception as e:
+                logger.warning("tts_service_unavailable", error=str(e))
+                self.tts = None
+        else:
+            self.tts = None
+    async def process_audio_input(
+        self,
+        audio_input: tuple[int, np.ndarray] | None,
+        hf_token: str | None = None,
+    ) -> str | None:
+        """Process audio input and return transcribed text.
+        Args:
+            audio_input: Tuple of (sample_rate, audio_array) or None
+            hf_token: HuggingFace token for authenticated Gradio Spaces
+        Returns:
+            Transcribed text string or None if no audio input
+        """
+        if audio_input is None:
+            return None
+        try:
+            transcribed_text = await self.stt.transcribe_audio(audio_input, hf_token=hf_token)
+            logger.info("audio_input_processed", text_length=len(transcribed_text))
+            return transcribed_text
+        except Exception as e:
+            logger.error("audio_input_processing_failed", error=str(e))
+            # Return None on failure (graceful degradation)
+            return None
+    async def generate_audio_output(
+        self,
+        text: str,
+        voice: str | None = None,
+        speed: float | None = None,
+    ) -> tuple[int, np.ndarray] | None:
+        """Generate audio output from text.
+        Args:
+            text: Text to synthesize
+            voice: Voice ID (default: settings.tts_voice)
+            speed: Speech speed (default: settings.tts_speed)
+        Returns:
+            Tuple of (sample_rate, audio_array) or None if TTS unavailable
+        """
+        if self.tts is None:
+            logger.warning("tts_unavailable", message="TTS service not available")
+            return None
+        if not text or not text.strip():
+            logger.warning("empty_text_for_tts")
+            return None
+        try:
+            # Use provided voice/speed or fallback to settings defaults
+            voice = voice if voice else settings.tts_voice
+            speed = speed if speed is not None else settings.tts_speed
+            audio_output = await self.tts.synthesize_async(text, voice, speed)  # type: ignore[misc]
+            if audio_output:
+                logger.info(
+                    "audio_output_generated",
+                    text_length=len(text),
+                    sample_rate=audio_output[0],
+                )
+            return audio_output
+        except Exception as e:
+            logger.error("audio_output_generation_failed", error=str(e))
+            # Return None on failure (graceful degradation)
+            return None
+@lru_cache(maxsize=1)
+def get_audio_service() -> AudioService:
+    """Get or create singleton audio service instance.
+    Returns:
+        AudioService instance
+    """
+    return AudioService()

src/services/image_ocr.py ADDED Viewed

	@@ -0,0 +1,242 @@

+"""Image-to-text service using Gradio Client API (Multimodal-OCR3)."""
+import asyncio
+import tempfile
+from functools import lru_cache
+from pathlib import Path
+from typing import Any
+import numpy as np
+import structlog
+from gradio_client import Client, handle_file
+from PIL import Image
+from src.utils.config import settings
+from src.utils.exceptions import ConfigurationError
+logger = structlog.get_logger(__name__)
+class ImageOCRService:
+    """Image OCR service using prithivMLmods/Multimodal-OCR3 Gradio Space."""
+    def __init__(self, api_url: str | None = None, hf_token: str | None = None) -> None:
+        """Initialize Image OCR service.
+        Args:
+            api_url: Gradio Space URL (default: settings.ocr_api_url)
+            hf_token: HuggingFace token for authenticated Spaces (default: None)
+        Raises:
+            ConfigurationError: If API URL not configured
+        """
+        self.api_url = api_url or settings.ocr_api_url
+        if not self.api_url:
+            raise ConfigurationError("OCR API URL not configured")
+        self.hf_token = hf_token
+        self.client: Client | None = None
+    async def _get_client(self, hf_token: str | None = None) -> Client:
+        """Get or create Gradio Client (lazy initialization).
+        Args:
+            hf_token: HuggingFace token for authenticated Spaces (overrides instance token)
+        Returns:
+            Gradio Client instance
+        """
+        # Use provided token or instance token
+        token = hf_token or self.hf_token
+        # If client exists but token changed, recreate it
+        if self.client is not None and token != self.hf_token:
+            self.client = None
+        if self.client is None:
+            loop = asyncio.get_running_loop()
+            # Pass token to Client for authenticated Spaces
+            if token:
+                self.client = await loop.run_in_executor(
+                    None,
+                    lambda: Client(self.api_url, hf_token=token),
+                )
+            else:
+                self.client = await loop.run_in_executor(
+                    None,
+                    lambda: Client(self.api_url),
+                )
+            # Update instance token for future use
+            self.hf_token = token
+        return self.client
+    async def extract_text(
+        self,
+        image_path: str,
+        model: str | None = None,
+        hf_token: str | None = None,
+    ) -> str:
+        """Extract text from image using Gradio API.
+        Args:
+            image_path: Path to image file
+            model: Optional model selection (default: None, uses API default)
+        Returns:
+            Extracted text string
+        Raises:
+            ConfigurationError: If OCR extraction fails
+        """
+        client = await self._get_client(hf_token=hf_token)
+        logger.info(
+            "extracting_text_from_image",
+            image_path=image_path,
+            model=model,
+        )
+        try:
+            # Call /Multimodal_OCR3_generate_image API endpoint
+            # According to the MCP tool description, this yields raw text and Markdown-formatted text
+            loop = asyncio.get_running_loop()
+            # The API might require file upload first, then call the generate function
+            # For now, we'll use handle_file to upload and pass the path
+            result = await loop.run_in_executor(
+                None,
+                lambda: client.predict(
+                    image_path=handle_file(image_path),
+                    api_name="/Multimodal_OCR3_generate_image",
+                ),
+            )
+            # Extract text from result
+            extracted_text = self._extract_text_from_result(result)
+            logger.info(
+                "image_ocr_complete",
+                text_length=len(extracted_text),
+            )
+            return extracted_text
+        except Exception as e:
+            logger.error("image_ocr_failed", error=str(e), error_type=type(e).__name__)
+            raise ConfigurationError(f"Image OCR failed: {e}") from e
+    async def extract_text_from_image(
+        self,
+        image_data: np.ndarray | Image.Image | str,
+        hf_token: str | None = None,
+    ) -> str:
+        """Extract text from image data (numpy array, PIL Image, or file path).
+        Args:
+            image_data: Image as numpy array, PIL Image, or file path string
+        Returns:
+            Extracted text string
+        """
+        # Handle different input types
+        if isinstance(image_data, str):
+            # Assume it's a file path
+            image_path = image_data
+        elif isinstance(image_data, Image.Image):
+            # Save PIL Image to temp file
+            image_path = self._save_image_temp(image_data)
+        elif isinstance(image_data, np.ndarray):
+            # Convert numpy array to PIL Image, then save
+            pil_image = Image.fromarray(image_data)
+            image_path = self._save_image_temp(pil_image)
+        else:
+            raise ValueError(f"Unsupported image data type: {type(image_data)}")
+        try:
+            # Extract text from the image file
+            extracted_text = await self.extract_text(image_path, hf_token=hf_token)
+            return extracted_text
+        finally:
+            # Clean up temp file if we created it
+            if image_path != image_data or not isinstance(image_data, str):
+                try:
+                    Path(image_path).unlink(missing_ok=True)
+                except Exception as e:
+                    logger.warning("failed_to_cleanup_temp_file", path=image_path, error=str(e))
+    def _extract_text_from_result(self, api_result: Any) -> str:
+        """Extract text from API result.
+        Args:
+            api_result: Result from Gradio API
+        Returns:
+            Extracted text string
+        """
+        # The API yields raw text and Markdown-formatted text
+        # Result might be a string, tuple, or generator
+        if isinstance(api_result, str):
+            return api_result.strip()
+        if isinstance(api_result, tuple):
+            # Try to extract text from tuple
+            for item in api_result:
+                if isinstance(item, str):
+                    return item.strip()
+                # Check if it's a dict with text fields
+                if isinstance(item, dict):
+                    if "text" in item:
+                        return str(item["text"]).strip()
+                    if "content" in item:
+                        return str(item["content"]).strip()
+        # If result is a generator or async generator, we'd need to iterate
+        # For now, convert to string representation
+        if api_result is not None:
+            text = str(api_result).strip()
+            if text and text != "None":
+                return text
+        logger.warning("could_not_extract_text_from_result", result_type=type(api_result).__name__)
+        return ""
+    def _save_image_temp(self, image: Image.Image) -> str:
+        """Save PIL Image to temporary file.
+        Args:
+            image: PIL Image object
+        Returns:
+            Path to temporary image file
+        """
+        # Create temp file
+        temp_file = tempfile.NamedTemporaryFile(
+            suffix=".png",
+            delete=False,
+        )
+        temp_path = temp_file.name
+        temp_file.close()
+        try:
+            # Save image as PNG
+            image.save(temp_path, "PNG")
+            logger.debug("saved_image_temp", path=temp_path, size=image.size)
+            return temp_path
+        except Exception as e:
+            logger.error("failed_to_save_image_temp", error=str(e))
+            raise ConfigurationError(f"Failed to save image to temp file: {e}") from e
+@lru_cache(maxsize=1)
+def get_image_ocr_service() -> ImageOCRService:
+    """Get or create singleton Image OCR service instance.
+    Returns:
+        ImageOCRService instance
+    """
+    return ImageOCRService()

src/services/llamaindex_rag.py CHANGED Viewed

@@ -40,6 +40,7 @@ class LlamaIndexRAGService:
         similarity_top_k: int = 5,
         use_openai_embeddings: bool | None = None,
         use_in_memory: bool = False,
     ) -> None:
         """
         Initialize LlamaIndex RAG service.
@@ -51,6 +52,7 @@ class LlamaIndexRAGService:
             similarity_top_k: Number of top results to retrieve
             use_openai_embeddings: Force OpenAI embeddings (None = auto-detect)
             use_in_memory: Use in-memory ChromaDB client (useful for tests)
         """
         # Import dependencies and store references
         deps = self._import_dependencies()
@@ -71,6 +73,7 @@ class LlamaIndexRAGService:
         self.persist_dir = persist_dir or settings.chroma_db_path
         self.similarity_top_k = similarity_top_k
         self.use_in_memory = use_in_memory
         # Configure embeddings and LLM
         use_openai = use_openai_embeddings if use_openai_embeddings is not None else False
@@ -201,9 +204,15 @@ class LlamaIndexRAGService:
     def _configure_llm(self, huggingface_llm: Any, openai_llm: Any) -> None:
         """Configure LLM for query synthesis."""
-        if huggingface_llm is not None and (settings.hf_token or settings.huggingface_api_key):
             model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
-            token = settings.hf_token or settings.huggingface_api_key
             # Check if it's HuggingFaceInferenceAPI (API-based) or HuggingFaceLLM (local)
             llm_class_name = (
@@ -430,6 +439,7 @@ class LlamaIndexRAGService:
 def get_rag_service(
     collection_name: str = "deepcritical_evidence",
     **kwargs: Any,
 ) -> LlamaIndexRAGService:
     """
@@ -437,6 +447,7 @@ def get_rag_service(
     Args:
         collection_name: Name of the ChromaDB collection
         **kwargs: Additional arguments for LlamaIndexRAGService
             Defaults to use_openai_embeddings=False (local embeddings)
@@ -450,4 +461,6 @@ def get_rag_service(
     # Default to local embeddings if not explicitly set
     if "use_openai_embeddings" not in kwargs:
         kwargs["use_openai_embeddings"] = False
-    return LlamaIndexRAGService(collection_name=collection_name, **kwargs)

         similarity_top_k: int = 5,
         use_openai_embeddings: bool | None = None,
         use_in_memory: bool = False,
+        oauth_token: str | None = None,
     ) -> None:
         """
         Initialize LlamaIndex RAG service.
             similarity_top_k: Number of top results to retrieve
             use_openai_embeddings: Force OpenAI embeddings (None = auto-detect)
             use_in_memory: Use in-memory ChromaDB client (useful for tests)
+            oauth_token: Optional OAuth token from HuggingFace login (takes priority over env vars)
         """
         # Import dependencies and store references
         deps = self._import_dependencies()
         self.persist_dir = persist_dir or settings.chroma_db_path
         self.similarity_top_k = similarity_top_k
         self.use_in_memory = use_in_memory
+        self.oauth_token = oauth_token
         # Configure embeddings and LLM
         use_openai = use_openai_embeddings if use_openai_embeddings is not None else False
     def _configure_llm(self, huggingface_llm: Any, openai_llm: Any) -> None:
         """Configure LLM for query synthesis."""
+        # Priority: oauth_token > env vars
+        effective_token = (
+            self.oauth_token
+            or settings.hf_token
+            or settings.huggingface_api_key
+        )
+        if huggingface_llm is not None and effective_token:
             model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
+            token = effective_token
             # Check if it's HuggingFaceInferenceAPI (API-based) or HuggingFaceLLM (local)
             llm_class_name = (
 def get_rag_service(
     collection_name: str = "deepcritical_evidence",
+    oauth_token: str | None = None,
     **kwargs: Any,
 ) -> LlamaIndexRAGService:
     """
     Args:
         collection_name: Name of the ChromaDB collection
+        oauth_token: Optional OAuth token from HuggingFace login (takes priority over env vars)
         **kwargs: Additional arguments for LlamaIndexRAGService
             Defaults to use_openai_embeddings=False (local embeddings)
     # Default to local embeddings if not explicitly set
     if "use_openai_embeddings" not in kwargs:
         kwargs["use_openai_embeddings"] = False
+    return LlamaIndexRAGService(
+        collection_name=collection_name, oauth_token=oauth_token, **kwargs
+    )

src/services/multimodal_processing.py ADDED Viewed

	@@ -0,0 +1,136 @@

+"""Unified multimodal processing service for text, audio, and image inputs."""
+from functools import lru_cache
+from typing import Any
+import structlog
+from gradio.data_classes import FileData
+from src.services.audio_processing import AudioService, get_audio_service
+from src.services.image_ocr import ImageOCRService, get_image_ocr_service
+from src.utils.config import settings
+logger = structlog.get_logger(__name__)
+class MultimodalService:
+    """Unified multimodal processing service."""
+    def __init__(
+        self,
+        audio_service: AudioService | None = None,
+        ocr_service: ImageOCRService | None = None,
+    ) -> None:
+        """Initialize multimodal service.
+        Args:
+            audio_service: Audio service instance (default: get_audio_service())
+            ocr_service: Image OCR service instance (default: get_image_ocr_service())
+        """
+        self.audio = audio_service or get_audio_service()
+        self.ocr = ocr_service or get_image_ocr_service()
+    async def process_multimodal_input(
+        self,
+        text: str,
+        files: list[FileData] | None = None,
+        audio_input: tuple[int, Any] | None = None,
+        hf_token: str | None = None,
+    ) -> str:
+        """Process multimodal input (text + images + audio) and return combined text.
+        Args:
+            text: Text input string
+            files: List of uploaded files (images, audio, etc.)
+            audio_input: Audio input tuple (sample_rate, audio_array)
+            hf_token: HuggingFace token for authenticated Gradio Spaces
+        Returns:
+            Combined text from all inputs
+        """
+        text_parts: list[str] = []
+        # Add original text if present
+        if text and text.strip():
+            text_parts.append(text.strip())
+        # Process audio input
+        if audio_input is not None and settings.enable_audio_input:
+            try:
+                transcribed = await self.audio.process_audio_input(audio_input, hf_token=hf_token)
+                if transcribed:
+                    text_parts.append(f"[Audio transcription: {transcribed}]")
+            except Exception as e:
+                logger.warning("audio_processing_failed", error=str(e))
+        # Process uploaded files
+        if files:
+            for file_data in files:
+                file_path = file_data.path if isinstance(file_data, FileData) else str(file_data)
+                # Check if it's an image
+                if self._is_image_file(file_path):
+                    try:
+                        extracted_text = await self.ocr.extract_text(file_path, hf_token=hf_token)
+                        if extracted_text:
+                            text_parts.append(f"[Image OCR: {extracted_text}]")
+                    except Exception as e:
+                        logger.warning("image_ocr_failed", file_path=file_path, error=str(e))
+                # Check if it's an audio file
+                elif self._is_audio_file(file_path):
+                    try:
+                        # For audio files, we'd need to load and transcribe
+                        # For now, log a warning
+                        logger.warning("audio_file_upload_not_supported", file_path=file_path)
+                    except Exception as e:
+                        logger.warning("audio_file_processing_failed", file_path=file_path, error=str(e))
+        # Combine all text parts
+        combined_text = "\n\n".join(text_parts) if text_parts else ""
+        logger.info(
+            "multimodal_input_processed",
+            text_length=len(combined_text),
+            num_files=len(files) if files else 0,
+            has_audio=audio_input is not None,
+        )
+        return combined_text
+    def _is_image_file(self, file_path: str) -> bool:
+        """Check if file is an image.
+        Args:
+            file_path: Path to file
+        Returns:
+            True if file is an image
+        """
+        image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".tiff", ".tif"}
+        return any(file_path.lower().endswith(ext) for ext in image_extensions)
+    def _is_audio_file(self, file_path: str) -> bool:
+        """Check if file is an audio file.
+        Args:
+            file_path: Path to file
+        Returns:
+            True if file is an audio file
+        """
+        audio_extensions = {".wav", ".mp3", ".flac", ".ogg", ".m4a", ".aac", ".wma"}
+        return any(file_path.lower().endswith(ext) for ext in audio_extensions)
+@lru_cache(maxsize=1)
+def get_multimodal_service() -> MultimodalService:
+    """Get or create singleton multimodal service instance.
+    Returns:
+        MultimodalService instance
+    """
+    return MultimodalService()

src/services/stt_gradio.py ADDED Viewed

	@@ -0,0 +1,271 @@

+"""Speech-to-Text service using Gradio Client API."""
+import asyncio
+import tempfile
+from functools import lru_cache
+from pathlib import Path
+from typing import Any
+import numpy as np
+import structlog
+from gradio_client import Client, handle_file
+from src.utils.config import settings
+from src.utils.exceptions import ConfigurationError
+logger = structlog.get_logger(__name__)
+class STTService:
+    """STT service using nvidia/canary-1b-v2 Gradio Space."""
+    def __init__(self, api_url: str | None = None, hf_token: str | None = None) -> None:
+        """Initialize STT service.
+        Args:
+            api_url: Gradio Space URL (default: settings.stt_api_url)
+            hf_token: HuggingFace token for authenticated Spaces (default: None)
+        Raises:
+            ConfigurationError: If API URL not configured
+        """
+        self.api_url = api_url or settings.stt_api_url
+        if not self.api_url:
+            raise ConfigurationError("STT API URL not configured")
+        self.hf_token = hf_token
+        self.client: Client | None = None
+    async def _get_client(self, hf_token: str | None = None) -> Client:
+        """Get or create Gradio Client (lazy initialization).
+        Args:
+            hf_token: HuggingFace token for authenticated Spaces (overrides instance token)
+        Returns:
+            Gradio Client instance
+        """
+        # Use provided token or instance token
+        token = hf_token or self.hf_token
+        # If client exists but token changed, recreate it
+        if self.client is not None and token != self.hf_token:
+            self.client = None
+        if self.client is None:
+            loop = asyncio.get_running_loop()
+            # Pass token to Client for authenticated Spaces
+            if token:
+                self.client = await loop.run_in_executor(
+                    None,
+                    lambda: Client(self.api_url, hf_token=token),
+                )
+            else:
+                self.client = await loop.run_in_executor(
+                    None,
+                    lambda: Client(self.api_url),
+                )
+            # Update instance token for future use
+            self.hf_token = token
+        return self.client
+    async def transcribe_file(
+        self,
+        audio_path: str,
+        source_lang: str | None = None,
+        target_lang: str | None = None,
+        hf_token: str | None = None,
+    ) -> str:
+        """Transcribe audio file using Gradio API.
+        Args:
+            audio_path: Path to audio file
+            source_lang: Source language (default: settings.stt_source_lang)
+            target_lang: Target language (default: settings.stt_target_lang)
+        Returns:
+            Transcribed text string
+        Raises:
+            ConfigurationError: If transcription fails
+        """
+        client = await self._get_client(hf_token=hf_token)
+        source_lang = source_lang or settings.stt_source_lang
+        target_lang = target_lang or settings.stt_target_lang
+        logger.info(
+            "transcribing_audio_file",
+            audio_path=audio_path,
+            source_lang=source_lang,
+            target_lang=target_lang,
+        )
+        try:
+            # Call /transcribe_file API endpoint
+            # API returns: (dataframe, csv_path, srt_path)
+            loop = asyncio.get_running_loop()
+            result = await loop.run_in_executor(
+                None,
+                lambda: client.predict(
+                    audio_path=handle_file(audio_path),
+                    source_lang=source_lang,
+                    target_lang=target_lang,
+                    api_name="/transcribe_file",
+                ),
+            )
+            # Extract transcription from result
+            transcribed_text = self._extract_transcription(result)
+            logger.info(
+                "audio_transcription_complete",
+                text_length=len(transcribed_text),
+            )
+            return transcribed_text
+        except Exception as e:
+            logger.error("audio_transcription_failed", error=str(e), error_type=type(e).__name__)
+            raise ConfigurationError(f"Audio transcription failed: {e}") from e
+    async def transcribe_audio(
+        self,
+        audio_data: tuple[int, np.ndarray],
+        hf_token: str | None = None,
+    ) -> str:
+        """Transcribe audio numpy array to text.
+        Args:
+            audio_data: Tuple of (sample_rate, audio_array)
+        Returns:
+            Transcribed text string
+        """
+        sample_rate, audio_array = audio_data
+        logger.info(
+            "transcribing_audio_array",
+            sample_rate=sample_rate,
+            audio_shape=audio_array.shape,
+        )
+        # Save audio to temp file
+        temp_path = self._save_audio_temp(audio_data)
+        try:
+            # Transcribe the temp file
+            transcribed_text = await self.transcribe_file(temp_path, hf_token=hf_token)
+            return transcribed_text
+        finally:
+            # Clean up temp file
+            try:
+                Path(temp_path).unlink(missing_ok=True)
+            except Exception as e:
+                logger.warning("failed_to_cleanup_temp_file", path=temp_path, error=str(e))
+    def _extract_transcription(self, api_result: tuple) -> str:
+        """Extract transcription text from API result.
+        Args:
+            api_result: Tuple from Gradio API (dataframe, csv_path, srt_path)
+        Returns:
+            Extracted transcription text
+        """
+        # API returns: (dataframe, csv_path, srt_path)
+        # Try to extract from dataframe first
+        if isinstance(api_result, tuple) and len(api_result) >= 1:
+            dataframe = api_result[0]
+            if isinstance(dataframe, dict) and "data" in dataframe:
+                # Extract text from dataframe rows
+                rows = dataframe.get("data", [])
+                if rows:
+                    # Combine all text segments
+                    text_segments = []
+                    for row in rows:
+                        if isinstance(row, list) and len(row) > 0:
+                            # First column is usually the text
+                            text_segments.append(str(row[0]))
+                    if text_segments:
+                        return " ".join(text_segments)
+            # Fallback: try to read CSV file if available
+            if len(api_result) >= 2 and api_result[1]:
+                csv_path = api_result[1]
+                try:
+                    import pandas as pd
+                    df = pd.read_csv(csv_path)
+                    if "text" in df.columns:
+                        return " ".join(df["text"].astype(str).tolist())
+                    elif len(df.columns) > 0:
+                        # Use first column
+                        return " ".join(df.iloc[:, 0].astype(str).tolist())
+                except Exception as e:
+                    logger.warning("failed_to_read_csv", csv_path=csv_path, error=str(e))
+        # Last resort: return empty string
+        logger.warning("could_not_extract_transcription", result_type=type(api_result).__name__)
+        return ""
+    def _save_audio_temp(
+        self,
+        audio_data: tuple[int, np.ndarray],
+    ) -> str:
+        """Save audio numpy array to temporary WAV file.
+        Args:
+            audio_data: Tuple of (sample_rate, audio_array)
+        Returns:
+            Path to temporary WAV file
+        """
+        sample_rate, audio_array = audio_data
+        # Create temp file
+        temp_file = tempfile.NamedTemporaryFile(
+            suffix=".wav",
+            delete=False,
+        )
+        temp_path = temp_file.name
+        temp_file.close()
+        # Save audio using soundfile
+        try:
+            import soundfile as sf
+            # Ensure audio is float32 and mono
+            if audio_array.dtype != np.float32:
+                audio_array = audio_array.astype(np.float32)
+            # Handle stereo -> mono conversion
+            if len(audio_array.shape) > 1:
+                audio_array = np.mean(audio_array, axis=1)
+            # Normalize to [-1, 1] range
+            if audio_array.max() > 1.0 or audio_array.min() < -1.0:
+                audio_array = audio_array / np.max(np.abs(audio_array))
+            sf.write(temp_path, audio_array, sample_rate)
+            logger.debug("saved_audio_temp", path=temp_path, sample_rate=sample_rate)
+            return temp_path
+        except ImportError:
+            raise ConfigurationError(
+                "soundfile not installed. Install with: uv add soundfile"
+            ) from None
+        except Exception as e:
+            logger.error("failed_to_save_audio_temp", error=str(e))
+            raise ConfigurationError(f"Failed to save audio to temp file: {e}") from e
+@lru_cache(maxsize=1)
+def get_stt_service() -> STTService:
+    """Get or create singleton STT service instance.
+    Returns:
+        STTService instance
+    """
+    return STTService()

src/services/tts_modal.py ADDED Viewed

	@@ -0,0 +1,260 @@

+"""Text-to-Speech service using Kokoro 82M via Modal GPU."""
+import asyncio
+from functools import lru_cache
+from typing import Any
+import numpy as np
+import structlog
+from src.utils.config import settings
+from src.utils.exceptions import ConfigurationError
+logger = structlog.get_logger(__name__)
+# Kokoro TTS dependencies for Modal image
+KOKORO_DEPENDENCIES = [
+    "torch>=2.0.0",
+    "transformers>=4.30.0",
+    "numpy<2.0",
+    # kokoro-82M can be installed from source:
+    # git+https://github.com/hexgrad/kokoro.git
+]
+# Modal app and function definitions (module-level for Modal)
+_modal_app: Any | None = None
+_tts_function: Any | None = None
+def _get_modal_app() -> Any:
+    """Get or create Modal app instance."""
+    global _modal_app
+    if _modal_app is None:
+        try:
+            import modal
+            _modal_app = modal.App.lookup("deepcritical-tts", create_if_missing=True)
+        except ImportError as e:
+            raise ConfigurationError(
+                "Modal SDK not installed. Run: uv sync or pip install modal>=0.63.0"
+            ) from e
+    return _modal_app
+# Define Modal image with Kokoro dependencies (module-level)
+def _get_tts_image() -> Any:
+    """Get Modal image with Kokoro dependencies."""
+    try:
+        import modal
+        return (
+            modal.Image.debian_slim(python_version="3.11")
+            .pip_install(*KOKORO_DEPENDENCIES)
+            .pip_install("git+https://github.com/hexgrad/kokoro.git")
+        )
+    except ImportError:
+        return None
+def _setup_modal_function() -> None:
+    """Setup Modal GPU function for TTS (called once, lazy initialization).
+    Note: GPU type is set at function definition time. Changes to settings.tts_gpu
+    require app restart to take effect.
+    """
+    global _tts_function, _modal_app
+    if _tts_function is not None:
+        return  # Already set up
+    try:
+        import modal
+        app = _get_modal_app()
+        tts_image = _get_tts_image()
+        if tts_image is None:
+            raise ConfigurationError("Modal image setup failed")
+        # Get GPU and timeout from settings (with defaults)
+        # Note: These are evaluated at function definition time, not at call time
+        # Changes to settings require app restart
+        gpu_type = getattr(settings, "tts_gpu", None) or "T4"
+        timeout_seconds = getattr(settings, "tts_timeout", None) or 60
+        # Define GPU function at module level (required by Modal)
+        # Modal functions are immutable once defined, so GPU changes require restart
+        @app.function(
+            image=tts_image,
+            gpu=gpu_type,
+            timeout=timeout_seconds,
+        )
+        def kokoro_tts_function(text: str, voice: str, speed: float) -> tuple[int, np.ndarray]:
+            """Modal GPU function for Kokoro TTS.
+            This function runs on Modal's GPU infrastructure.
+            Based on: https://huggingface.co/spaces/hexgrad/Kokoro-TTS
+            Reference: https://huggingface.co/spaces/hexgrad/Kokoro-TTS/raw/main/app.py
+            """
+            import numpy as np
+            # Import Kokoro inside function (lazy load)
+            try:
+                from kokoro import KModel, KPipeline
+                import torch
+                # Initialize model (cached on GPU)
+                model = KModel().to("cuda").eval()
+                pipeline = KPipeline(lang_code=voice[0])
+                pack = pipeline.load_voice(voice)
+                # Generate audio
+                for _, ps, _ in pipeline(text, voice, speed):
+                    ref_s = pack[len(ps) - 1]
+                    audio = model(ps, ref_s, speed)
+                    return (24000, audio.numpy())
+                # If no audio generated, return empty
+                return (24000, np.zeros(1, dtype=np.float32))
+            except ImportError as e:
+                raise ConfigurationError(
+                    "Kokoro not installed. Install with: pip install git+https://github.com/hexgrad/kokoro.git"
+                ) from e
+            except Exception as e:
+                raise ConfigurationError(f"TTS synthesis failed: {e}") from e
+        # Store function reference for remote calls
+        _tts_function = kokoro_tts_function
+        # Verify function is properly attached to app
+        if not hasattr(app, kokoro_tts_function.__name__):
+            logger.warning("modal_function_not_attached", function_name=kokoro_tts_function.__name__)
+        logger.info(
+            "modal_tts_function_setup_complete",
+            gpu=gpu_type,
+            timeout=timeout_seconds,
+            function_name=kokoro_tts_function.__name__,
+        )
+    except Exception as e:
+        logger.error("modal_tts_function_setup_failed", error=str(e))
+        raise ConfigurationError(f"Failed to setup Modal TTS function: {e}") from e
+class ModalTTSExecutor:
+    """Execute Kokoro TTS synthesis on Modal GPU.
+    This class provides TTS synthesis using Kokoro 82M model on Modal's GPU infrastructure.
+    Follows the same pattern as ModalCodeExecutor but uses GPU functions for TTS.
+    """
+    def __init__(self) -> None:
+        """Initialize Modal TTS executor.
+        Note:
+            Logs a warning if Modal credentials are not configured.
+            Execution will fail at runtime without valid credentials.
+        """
+        # Check for Modal credentials
+        if not settings.modal_available:
+            logger.warning(
+                "Modal credentials not found. TTS will not be available unless modal setup is run."
+            )
+    def synthesize(
+        self,
+        text: str,
+        voice: str = "af_heart",
+        speed: float = 1.0,
+        timeout: int = 60,
+    ) -> tuple[int, np.ndarray]:
+        """Synthesize text to speech using Kokoro on Modal GPU.
+        Args:
+            text: Text to synthesize (max 5000 chars for free tier)
+            voice: Voice ID from Kokoro (e.g., af_heart, af_bella, am_michael)
+            speed: Speech speed multiplier (0.5-2.0)
+            timeout: Maximum execution time (not used, Modal function has its own timeout)
+        Returns:
+            Tuple of (sample_rate, audio_array)
+        Raises:
+            ConfigurationError: If synthesis fails
+        """
+        # Setup Modal function if not already done
+        _setup_modal_function()
+        if _tts_function is None:
+            raise ConfigurationError("Modal TTS function not initialized")
+        logger.info("synthesizing_tts", text_length=len(text), voice=voice, speed=speed)
+        try:
+            # Call the GPU function remotely
+            result = _tts_function.remote(text, voice, speed)
+            logger.info("tts_synthesis_complete", sample_rate=result[0], audio_shape=result[1].shape)
+            return result
+        except Exception as e:
+            logger.error("tts_synthesis_failed", error=str(e), error_type=type(e).__name__)
+            raise ConfigurationError(f"TTS synthesis failed: {e}") from e
+class TTSService:
+    """TTS service wrapper for async usage."""
+    def __init__(self) -> None:
+        """Initialize TTS service."""
+        if not settings.modal_available:
+            raise ConfigurationError("Modal credentials required for TTS")
+        self.executor = ModalTTSExecutor()
+    async def synthesize_async(
+        self,
+        text: str,
+        voice: str = "af_heart",
+        speed: float = 1.0,
+    ) -> tuple[int, np.ndarray] | None:
+        """Async wrapper for TTS synthesis.
+        Args:
+            text: Text to synthesize
+            voice: Voice ID (default: settings.tts_voice)
+            speed: Speech speed (default: settings.tts_speed)
+        Returns:
+            Tuple of (sample_rate, audio_array) or None if error
+        """
+        voice = voice or settings.tts_voice
+        speed = speed or settings.tts_speed
+        loop = asyncio.get_running_loop()
+        try:
+            result = await loop.run_in_executor(
+                None,
+                lambda: self.executor.synthesize(text, voice, speed),
+            )
+            return result
+        except Exception as e:
+            logger.error("tts_synthesis_async_failed", error=str(e))
+            return None
+@lru_cache(maxsize=1)
+def get_tts_service() -> TTSService:
+    """Get or create singleton TTS service instance.
+    Returns:
+        TTSService instance
+    Raises:
+        ConfigurationError: If Modal credentials not configured
+    """
+    return TTSService()

src/tools/crawl_adapter.py CHANGED Viewed

	@@ -60,3 +60,5 @@ async def crawl_website(starting_url: str) -> str:
60
61
62


60
61
62
63	+
64	+

src/tools/rag_tool.py CHANGED Viewed

@@ -23,14 +23,20 @@ class RAGTool:
     Returns Evidence objects from RAG retrieval results.
     """
-    def __init__(self, rag_service: "LlamaIndexRAGService | None" = None) -> None:
         """
         Initialize RAG tool.
         Args:
             rag_service: Optional RAG service instance. If None, will be lazy-initialized.
         """
         self._rag_service = rag_service
         self.logger = logger
     @property
@@ -54,9 +60,11 @@ class RAGTool:
                 # Use local embeddings by default (no API key required)
                 # Use in-memory ChromaDB to avoid file system issues
                 self._rag_service = get_rag_service(
                     use_openai_embeddings=False,
                     use_in_memory=True,  # Use in-memory for better reliability
                 )
                 self.logger.info("RAG service initialized with local embeddings")
             except (ConfigurationError, ImportError) as e:
@@ -170,12 +178,14 @@ class RAGTool:
 def create_rag_tool(
     rag_service: "LlamaIndexRAGService | None" = None,
 ) -> RAGTool:
     """
     Factory function to create a RAG tool.
     Args:
         rag_service: Optional RAG service instance. If None, will be lazy-initialized.
     Returns:
         Configured RAGTool instance
@@ -184,7 +194,7 @@ def create_rag_tool(
         ConfigurationError: If RAG service cannot be initialized and rag_service is None
     """
     try:
-        return RAGTool(rag_service=rag_service)
     except Exception as e:
         logger.error("Failed to create RAG tool", error=str(e))
         raise ConfigurationError(f"Failed to create RAG tool: {e}") from e

     Returns Evidence objects from RAG retrieval results.
     """
+    def __init__(
+        self,
+        rag_service: "LlamaIndexRAGService | None" = None,
+        oauth_token: str | None = None,
+    ) -> None:
         """
         Initialize RAG tool.
         Args:
             rag_service: Optional RAG service instance. If None, will be lazy-initialized.
+            oauth_token: Optional OAuth token from HuggingFace login (for RAG LLM)
         """
         self._rag_service = rag_service
+        self.oauth_token = oauth_token
         self.logger = logger
     @property
                 # Use local embeddings by default (no API key required)
                 # Use in-memory ChromaDB to avoid file system issues
+                # Pass OAuth token for LLM query synthesis
                 self._rag_service = get_rag_service(
                     use_openai_embeddings=False,
                     use_in_memory=True,  # Use in-memory for better reliability
+                    oauth_token=self.oauth_token,
                 )
                 self.logger.info("RAG service initialized with local embeddings")
             except (ConfigurationError, ImportError) as e:
 def create_rag_tool(
     rag_service: "LlamaIndexRAGService | None" = None,
+    oauth_token: str | None = None,
 ) -> RAGTool:
     """
     Factory function to create a RAG tool.
     Args:
         rag_service: Optional RAG service instance. If None, will be lazy-initialized.
+        oauth_token: Optional OAuth token from HuggingFace login (for RAG LLM)
     Returns:
         Configured RAGTool instance
         ConfigurationError: If RAG service cannot be initialized and rag_service is None
     """
     try:
+        return RAGTool(rag_service=rag_service, oauth_token=oauth_token)
     except Exception as e:
         logger.error("Failed to create RAG tool", error=str(e))
         raise ConfigurationError(f"Failed to create RAG tool: {e}") from e

src/tools/search_handler.py CHANGED Viewed

@@ -27,6 +27,7 @@ class SearchHandler:
         timeout: float = 30.0,
         include_rag: bool = False,
         auto_ingest_to_rag: bool = True,
     ) -> None:
         """
         Initialize the search handler.
@@ -36,10 +37,12 @@ class SearchHandler:
             timeout: Timeout for each search in seconds
             include_rag: Whether to include RAG tool in searches
             auto_ingest_to_rag: Whether to automatically ingest results into RAG
         """
         self.tools = list(tools)  # Make a copy
         self.timeout = timeout
         self.auto_ingest_to_rag = auto_ingest_to_rag
         self._rag_service: LlamaIndexRAGService | None = None
         if include_rag:
@@ -48,7 +51,7 @@ class SearchHandler:
     def add_rag_tool(self) -> None:
         """Add RAG tool to the tools list if available."""
         try:
-            rag_tool = create_rag_tool()
             self.tools.append(rag_tool)
             logger.info("RAG tool added to search handler")
         except ConfigurationError:
@@ -67,9 +70,11 @@ class SearchHandler:
                 # Use local embeddings by default (no API key required)
                 # Use in-memory ChromaDB to avoid file system issues
                 self._rag_service = get_rag_service(
                     use_openai_embeddings=False,
                     use_in_memory=True,  # Use in-memory for better reliability
                 )
                 logger.info("RAG service initialized for ingestion with local embeddings")
             except (ConfigurationError, ImportError):

         timeout: float = 30.0,
         include_rag: bool = False,
         auto_ingest_to_rag: bool = True,
+        oauth_token: str | None = None,
     ) -> None:
         """
         Initialize the search handler.
             timeout: Timeout for each search in seconds
             include_rag: Whether to include RAG tool in searches
             auto_ingest_to_rag: Whether to automatically ingest results into RAG
+            oauth_token: Optional OAuth token from HuggingFace login (for RAG LLM)
         """
         self.tools = list(tools)  # Make a copy
         self.timeout = timeout
         self.auto_ingest_to_rag = auto_ingest_to_rag
+        self.oauth_token = oauth_token
         self._rag_service: LlamaIndexRAGService | None = None
         if include_rag:
     def add_rag_tool(self) -> None:
         """Add RAG tool to the tools list if available."""
         try:
+            rag_tool = create_rag_tool(oauth_token=self.oauth_token)
             self.tools.append(rag_tool)
             logger.info("RAG tool added to search handler")
         except ConfigurationError:
                 # Use local embeddings by default (no API key required)
                 # Use in-memory ChromaDB to avoid file system issues
+                # Pass OAuth token for LLM query synthesis
                 self._rag_service = get_rag_service(
                     use_openai_embeddings=False,
                     use_in_memory=True,  # Use in-memory for better reliability
+                    oauth_token=self.oauth_token,
                 )
                 logger.info("RAG service initialized for ingestion with local embeddings")
             except (ConfigurationError, ImportError):

src/tools/web_search_adapter.py CHANGED Viewed

	@@ -65,3 +65,5 @@ async def web_search(query: str) -> str:
65
66
67


65
66
67
68	+
69	+

src/utils/config.py CHANGED Viewed

@@ -140,6 +140,62 @@ class Settings(BaseSettings):
         description="Automatically ingest evidence into RAG",
     )
     @property
     def modal_available(self) -> bool:
         """Check if Modal credentials are configured."""
@@ -203,6 +259,16 @@ class Settings(BaseSettings):
             return bool(self.tavily_api_key)
         return False
 def get_settings() -> Settings:
     """Factory function to get settings (allows mocking in tests)."""

         description="Automatically ingest evidence into RAG",
     )
+    # Audio Processing Configuration
+    tts_model: str = Field(
+        default="hexgrad/Kokoro-82M",
+        description="Kokoro TTS model ID for text-to-speech",
+    )
+    tts_voice: str = Field(
+        default="af_heart",
+        description="Kokoro voice ID (e.g., af_heart, af_bella, am_michael)",
+    )
+    tts_speed: float = Field(
+        default=1.0,
+        ge=0.5,
+        le=2.0,
+        description="TTS speech speed multiplier",
+    )
+    tts_gpu: str | None = Field(
+        default="T4",
+        description="Modal GPU type for TTS (T4, A10, A100, etc.)",
+    )
+    tts_timeout: int = Field(
+        default=60,
+        ge=10,
+        le=300,
+        description="TTS synthesis timeout in seconds",
+    )
+    stt_api_url: str = Field(
+        default="nvidia/canary-1b-v2",
+        description="Gradio Space URL for STT API (nvidia/canary-1b-v2)",
+    )
+    stt_source_lang: str = Field(
+        default="English",
+        description="Source language for STT transcription",
+    )
+    stt_target_lang: str = Field(
+        default="English",
+        description="Target language for STT transcription",
+    )
+    enable_audio_input: bool = Field(
+        default=True,
+        description="Enable audio input (microphone/file upload)",
+    )
+    enable_audio_output: bool = Field(
+        default=True,
+        description="Enable audio output (TTS response)",
+    )
+    # Image OCR Configuration
+    ocr_api_url: str = Field(
+        default="prithivMLmods/Multimodal-OCR3",
+        description="Gradio Space URL for image OCR API",
+    )
+    enable_image_input: bool = Field(
+        default=True,
+        description="Enable image input (file upload with OCR)",
+    )
     @property
     def modal_available(self) -> bool:
         """Check if Modal credentials are configured."""
             return bool(self.tavily_api_key)
         return False
+    @property
+    def audio_available(self) -> bool:
+        """Check if audio processing is available (Modal + STT API)."""
+        return self.modal_available and bool(self.stt_api_url)
+    @property
+    def image_ocr_available(self) -> bool:
+        """Check if image OCR is available (OCR API URL configured)."""
+        return bool(self.ocr_api_url)
 def get_settings() -> Settings:
     """Factory function to get settings (allows mocking in tests)."""

src/utils/llm_factory.py CHANGED Viewed

@@ -50,13 +50,16 @@ def get_magentic_client() -> "OpenAIChatClient":
     )
-def get_huggingface_chat_client() -> "HuggingFaceChatClient":
     """
     Get HuggingFace chat client for agent-framework.
     HuggingFace InferenceClient natively supports function calling,
     making it compatible with agent-framework's ChatAgent.
     Returns:
         Configured HuggingFaceChatClient
@@ -66,7 +69,8 @@ def get_huggingface_chat_client() -> "HuggingFaceChatClient":
     from src.utils.huggingface_chat_client import HuggingFaceChatClient
     model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
-    api_key = settings.hf_token or settings.huggingface_api_key
     return HuggingFaceChatClient(
         model_name=model_name,
@@ -75,7 +79,7 @@ def get_huggingface_chat_client() -> "HuggingFaceChatClient":
     )
-def get_chat_client_for_agent() -> Any:
     """
     Get appropriate chat client for agent-framework based on configuration.
@@ -83,15 +87,21 @@ def get_chat_client_for_agent() -> Any:
     - HuggingFace InferenceClient (if HF_TOKEN available, preferred for free tier)
     - OpenAI ChatClient (if OPENAI_API_KEY available, fallback)
     Returns:
         ChatClient compatible with agent-framework (HuggingFaceChatClient or OpenAIChatClient)
     Raises:
         ConfigurationError: If no suitable client can be created
     """
     # Prefer HuggingFace if available (free tier)
-    if settings.has_huggingface_key:
-        return get_huggingface_chat_client()
     # Fallback to OpenAI if available
     if settings.has_openai_key:
@@ -99,7 +109,7 @@ def get_chat_client_for_agent() -> Any:
     # If neither available, try HuggingFace without key (public models)
     try:
-        return get_huggingface_chat_client()
     except Exception:
         pass
@@ -108,7 +118,7 @@ def get_chat_client_for_agent() -> Any:
     )
-def get_pydantic_ai_model() -> Any:
     """
     Get the appropriate model for pydantic-ai based on configuration.
@@ -116,6 +126,9 @@ def get_pydantic_ai_model() -> Any:
     Defaults to HuggingFace if provider is not specified or unknown.
     This is used by simple mode components (JudgeHandler, etc.)
     Returns:
         Configured pydantic-ai model
     """
@@ -126,9 +139,12 @@ def get_pydantic_ai_model() -> Any:
     from pydantic_ai.providers.huggingface import HuggingFaceProvider
     from pydantic_ai.providers.openai import OpenAIProvider
     if settings.llm_provider == "huggingface":
         model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
-        hf_provider = HuggingFaceProvider(api_key=settings.hf_token)
         return HuggingFaceModel(model_name, provider=hf_provider)
     if settings.llm_provider == "openai":
@@ -145,7 +161,7 @@ def get_pydantic_ai_model() -> Any:
     # Default to HuggingFace if provider is unknown or not specified
     model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
-    hf_provider = HuggingFaceProvider(api_key=settings.hf_token)
     return HuggingFaceModel(model_name, provider=hf_provider)

     )
+def get_huggingface_chat_client(oauth_token: str | None = None) -> "HuggingFaceChatClient":
     """
     Get HuggingFace chat client for agent-framework.
     HuggingFace InferenceClient natively supports function calling,
     making it compatible with agent-framework's ChatAgent.
+    Args:
+        oauth_token: Optional OAuth token from HuggingFace login (takes priority over env vars)
     Returns:
         Configured HuggingFaceChatClient
     from src.utils.huggingface_chat_client import HuggingFaceChatClient
     model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
+    # Priority: oauth_token > env vars
+    api_key = oauth_token or settings.hf_token or settings.huggingface_api_key
     return HuggingFaceChatClient(
         model_name=model_name,
     )
+def get_chat_client_for_agent(oauth_token: str | None = None) -> Any:
     """
     Get appropriate chat client for agent-framework based on configuration.
     - HuggingFace InferenceClient (if HF_TOKEN available, preferred for free tier)
     - OpenAI ChatClient (if OPENAI_API_KEY available, fallback)
+    Args:
+        oauth_token: Optional OAuth token from HuggingFace login (takes priority over env vars)
     Returns:
         ChatClient compatible with agent-framework (HuggingFaceChatClient or OpenAIChatClient)
     Raises:
         ConfigurationError: If no suitable client can be created
     """
+    # Check if we have OAuth token or env vars
+    has_hf_key = bool(oauth_token or settings.has_huggingface_key)
     # Prefer HuggingFace if available (free tier)
+    if has_hf_key:
+        return get_huggingface_chat_client(oauth_token=oauth_token)
     # Fallback to OpenAI if available
     if settings.has_openai_key:
     # If neither available, try HuggingFace without key (public models)
     try:
+        return get_huggingface_chat_client(oauth_token=oauth_token)
     except Exception:
         pass
     )
+def get_pydantic_ai_model(oauth_token: str | None = None) -> Any:
     """
     Get the appropriate model for pydantic-ai based on configuration.
     Defaults to HuggingFace if provider is not specified or unknown.
     This is used by simple mode components (JudgeHandler, etc.)
+    Args:
+        oauth_token: Optional OAuth token from HuggingFace login (takes priority over env vars)
     Returns:
         Configured pydantic-ai model
     """
     from pydantic_ai.providers.huggingface import HuggingFaceProvider
     from pydantic_ai.providers.openai import OpenAIProvider
+    # Priority: oauth_token > env vars
+    effective_hf_token = oauth_token or settings.hf_token or settings.huggingface_api_key
     if settings.llm_provider == "huggingface":
         model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
+        hf_provider = HuggingFaceProvider(api_key=effective_hf_token)
         return HuggingFaceModel(model_name, provider=hf_provider)
     if settings.llm_provider == "openai":
     # Default to HuggingFace if provider is unknown or not specified
     model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
+    hf_provider = HuggingFaceProvider(api_key=effective_hf_token)
     return HuggingFaceModel(model_name, provider=hf_provider)

tests/unit/middleware/__init__.py CHANGED Viewed

	@@ -17,3 +17,5 @@
17
18
19


17
18
19
20	+
21	+