Joseph Pollack commited on
Commit
0467062
·
unverified ·
1 Parent(s): 1ea3854

adds new features and graphs integration with configuration options

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .env.example +54 -0
  2. WEB_SEARCH_TOOL_ASSESSMENT.md +239 -0
  3. docs/api/agents.md +2 -0
  4. docs/api/models.md +2 -0
  5. docs/api/orchestrators.md +2 -0
  6. docs/api/services.md +2 -0
  7. docs/api/tools.md +2 -0
  8. docs/architecture/agents.md +2 -0
  9. docs/architecture/middleware.md +2 -0
  10. docs/architecture/services.md +2 -0
  11. docs/architecture/tools.md +2 -0
  12. docs/contributing/code-quality.md +2 -0
  13. docs/contributing/code-style.md +2 -0
  14. docs/contributing/error-handling.md +2 -0
  15. docs/contributing/implementation-patterns.md +2 -0
  16. docs/contributing/index.md +2 -0
  17. docs/contributing/prompt-engineering.md +2 -0
  18. docs/contributing/testing.md +2 -0
  19. docs/getting-started/examples.md +2 -0
  20. docs/getting-started/installation.md +2 -0
  21. docs/getting-started/mcp-integration.md +2 -0
  22. docs/getting-started/quick-start.md +2 -0
  23. docs/implementation/IMPLEMENTATION_SUMMARY.md +180 -0
  24. docs/implementation/TOKEN_AUTHENTICATION_REVIEW.md +201 -0
  25. docs/implementation/TTS_MODAL_IMPLEMENTATION.md +134 -0
  26. docs/license.md +2 -0
  27. docs/overview/architecture.md +2 -0
  28. docs/overview/features.md +2 -0
  29. docs/team.md +2 -0
  30. new_env.txt +96 -0
  31. pyproject.toml +10 -6
  32. src/agent_factory/judges.py +16 -7
  33. src/app.py +193 -23
  34. src/mcp_tools.py +78 -0
  35. src/middleware/state_machine.py +2 -0
  36. src/orchestrator/graph_orchestrator.py +102 -3
  37. src/orchestrator_factory.py +2 -0
  38. src/services/audio_processing.py +134 -0
  39. src/services/image_ocr.py +242 -0
  40. src/services/llamaindex_rag.py +16 -3
  41. src/services/multimodal_processing.py +136 -0
  42. src/services/stt_gradio.py +271 -0
  43. src/services/tts_modal.py +260 -0
  44. src/tools/crawl_adapter.py +2 -0
  45. src/tools/rag_tool.py +12 -2
  46. src/tools/search_handler.py +6 -1
  47. src/tools/web_search_adapter.py +2 -0
  48. src/utils/config.py +66 -0
  49. src/utils/llm_factory.py +25 -9
  50. tests/unit/middleware/__init__.py +2 -0
.env.example CHANGED
@@ -11,6 +11,52 @@ ANTHROPIC_API_KEY=sk-ant-your-key-here
11
  # ANTHROPIC_MODEL=claude-sonnet-4-5-20250929
12
  # OPENAI_MODEL=gpt-5.1
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  # ============== EMBEDDINGS ==============
15
 
16
  # OpenAI Embedding Model (used if LLM_PROVIDER is openai and performing RAG/Embeddings)
@@ -39,6 +85,14 @@ MAX_ITERATIONS=10
39
  SEARCH_TIMEOUT=30
40
  LOG_LEVEL=INFO
41
 
 
 
 
 
 
 
 
 
42
  # ============== EXTERNAL SERVICES ==============
43
 
44
  # PubMed (optional - higher rate limits)
 
11
  # ANTHROPIC_MODEL=claude-sonnet-4-5-20250929
12
  # OPENAI_MODEL=gpt-5.1
13
 
14
+
15
+ # ============================================
16
+ # Audio Processing Configuration (TTS)
17
+ # ============================================
18
+ # Kokoro TTS Model Configuration
19
+ TTS_MODEL=hexgrad/Kokoro-82M
20
+ TTS_VOICE=af_heart
21
+ TTS_SPEED=1.0
22
+ TTS_GPU=T4
23
+ TTS_TIMEOUT=60
24
+
25
+ # Available TTS Voices:
26
+ # American English Female: af_heart, af_bella, af_nicole, af_aoede, af_kore, af_sarah, af_nova, af_sky, af_alloy, af_jessica, af_river
27
+ # American English Male: am_michael, am_fenrir, am_puck, am_echo, am_eric, am_liam, am_onyx, am_santa, am_adam
28
+
29
+ # Available GPU Types (Modal):
30
+ # T4 - Cheapest, good for testing (default)
31
+ # A10 - Good balance of cost/performance
32
+ # A100 - Fastest, most expensive
33
+ # L4 - NVIDIA L4 GPU
34
+ # L40S - NVIDIA L40S GPU
35
+ # Note: GPU type is set at function definition time. Changes require app restart.
36
+
37
+ # ============================================
38
+ # Audio Processing Configuration (STT)
39
+ # ============================================
40
+ # Speech-to-Text API Configuration
41
+ STT_API_URL=nvidia/canary-1b-v2
42
+ STT_SOURCE_LANG=English
43
+ STT_TARGET_LANG=English
44
+
45
+ # Available STT Languages:
46
+ # English, Bulgarian, Croatian, Czech, Danish, Dutch, Estonian, Finnish, French, German, Greek, Hungarian, Italian, Latvian, Lithuanian, Maltese, Polish, Portuguese, Romanian, Slovak, Slovenian, Spanish, Swedish, Russian, Ukrainian
47
+
48
+ # ============================================
49
+ # Audio Feature Flags
50
+ # ============================================
51
+ ENABLE_AUDIO_INPUT=true
52
+ ENABLE_AUDIO_OUTPUT=true
53
+
54
+ # ============================================
55
+ # Image OCR Configuration
56
+ # ============================================
57
+ OCR_API_URL=prithivMLmods/Multimodal-OCR3
58
+ ENABLE_IMAGE_INPUT=true
59
+
60
  # ============== EMBEDDINGS ==============
61
 
62
  # OpenAI Embedding Model (used if LLM_PROVIDER is openai and performing RAG/Embeddings)
 
85
  SEARCH_TIMEOUT=30
86
  LOG_LEVEL=INFO
87
 
88
+ # ============================================
89
+ # Modal Configuration (Required for TTS)
90
+ # ============================================
91
+ # Modal credentials are required for TTS (Text-to-Speech) functionality
92
+ # Get your credentials from: https://modal.com/
93
+ MODAL_TOKEN_ID=your_modal_token_id_here
94
+ MODAL_TOKEN_SECRET=your_modal_token_secret_here
95
+
96
  # ============== EXTERNAL SERVICES ==============
97
 
98
  # PubMed (optional - higher rate limits)
WEB_SEARCH_TOOL_ASSESSMENT.md ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Web Search Tool Assessment
2
+
3
+ ## Executive Summary
4
+
5
+ The application has **two separate web search implementations** with different readiness levels:
6
+
7
+ 1. **`WebSearchTool`** (`src/tools/web_search.py`) - **Partially Ready** ⚠️
8
+ - Functional but **NOT compliant** with `SearchTool` protocol
9
+ - **NOT integrated** into main search handler
10
+ - Only used in magentic orchestrator's retrieval agent
11
+
12
+ 2. **`web_search_adapter`** (`src/tools/web_search_adapter.py`) - **Functional** ✅
13
+ - Used by tool executor for WebSearchAgent tasks
14
+ - Relies on legacy `folder/tools/web_search.py` implementation
15
+
16
+ ## Detailed Analysis
17
+
18
+ ### 1. WebSearchTool (`src/tools/web_search.py`)
19
+
20
+ #### Current Implementation
21
+ - **Location**: `src/tools/web_search.py`
22
+ - **Provider**: DuckDuckGo (no API key required)
23
+ - **Status**: ⚠️ **Partially Ready**
24
+
25
+ #### Issues Identified
26
+
27
+ **❌ Protocol Non-Compliance:**
28
+ ```python
29
+ # Missing required 'name' property
30
+ class WebSearchTool:
31
+ # Should have: @property def name(self) -> str: return "web"
32
+
33
+ # Wrong return type - should return list[Evidence], not SearchResult
34
+ async def search(self, query: str, max_results: int = 10) -> SearchResult:
35
+ # Returns SearchResult instead of list[Evidence]
36
+ ```
37
+
38
+ **Comparison with other tools:**
39
+ - `PubMedTool` has `@property def name(self) -> str: return "pubmed"`
40
+ - `PubMedTool.search()` returns `list[Evidence]`
41
+ - `WebSearchTool` returns `SearchResult` (contains `evidence` list inside)
42
+
43
+ **❌ Not Integrated:**
44
+ - **NOT** included in `SearchHandler` initialization in `src/app.py`:
45
+ ```python
46
+ search_handler = SearchHandler(
47
+ tools=[PubMedTool(), ClinicalTrialsTool(), EuropePMCTool()],
48
+ # WebSearchTool() is missing!
49
+ )
50
+ ```
51
+
52
+ **✅ Current Usage:**
53
+ - Used in `src/agents/retrieval_agent.py` (magentic orchestrator):
54
+ ```python
55
+ from src.tools.web_search import WebSearchTool
56
+ _web_search = WebSearchTool()
57
+ ```
58
+
59
+ #### Fix Required
60
+ To make `WebSearchTool` compliant and usable:
61
+
62
+ 1. **Add `name` property:**
63
+ ```python
64
+ @property
65
+ def name(self) -> str:
66
+ return "web"
67
+ ```
68
+
69
+ 2. **Fix return type:**
70
+ ```python
71
+ async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
72
+ # ... existing code ...
73
+ return evidence # Return list[Evidence] directly, not SearchResult
74
+ ```
75
+
76
+ 3. **Register in SearchHandler:**
77
+ ```python
78
+ from src.tools.web_search import WebSearchTool
79
+
80
+ search_handler = SearchHandler(
81
+ tools=[
82
+ PubMedTool(),
83
+ ClinicalTrialsTool(),
84
+ EuropePMCTool(),
85
+ WebSearchTool() # Add this
86
+ ],
87
+ )
88
+ ```
89
+
90
+ ---
91
+
92
+ ### 2. web_search_adapter (`src/tools/web_search_adapter.py`)
93
+
94
+ #### Current Implementation
95
+ - **Location**: `src/tools/web_search_adapter.py`
96
+ - **Status**: ✅ **Functional**
97
+ - **Provider**: Uses legacy `folder/tools/web_search.py` (Serper/SearchXNG)
98
+
99
+ #### Usage
100
+ - Used by `src/tools/tool_executor.py` for `WebSearchAgent` tasks:
101
+ ```python
102
+ if task.agent == "WebSearchAgent":
103
+ result_text = await web_search(task.query)
104
+ ```
105
+
106
+ - Used by `src/orchestrator/planner_agent.py` for background context
107
+
108
+ #### Dependencies
109
+ - Requires `folder/tools/web_search.py` (legacy implementation)
110
+ - Supports Serper API (requires `SERPER_API_KEY`)
111
+ - Supports SearchXNG API (requires `SEARCHXNG_HOST`)
112
+
113
+ #### Limitations
114
+ - Returns formatted string (not `Evidence` objects)
115
+ - Not integrated with `SearchHandler` (different execution path)
116
+ - Depends on legacy folder structure
117
+
118
+ ---
119
+
120
+ ## Integration Status
121
+
122
+ ### SearchHandler Integration
123
+ **Current State**: ❌ **NOT Integrated**
124
+
125
+ The main `SearchHandler` in `src/app.py` only includes:
126
+ - `PubMedTool()`
127
+ - `ClinicalTrialsTool()`
128
+ - `EuropePMCTool()`
129
+
130
+ **WebSearchTool is missing from the main search flow.**
131
+
132
+ ### Tool Executor Integration
133
+ **Current State**: ✅ **Integrated**
134
+
135
+ `web_search_adapter` is used via `tool_executor.py`:
136
+ - Executes when `AgentTask.agent == "WebSearchAgent"`
137
+ - Used in iterative/deep research flows
138
+ - Returns formatted text (not Evidence objects)
139
+
140
+ ### Magentic Orchestrator Integration
141
+ **Current State**: ✅ **Integrated**
142
+
143
+ `WebSearchTool` is used in `retrieval_agent.py`:
144
+ - Direct instantiation: `_web_search = WebSearchTool()`
145
+ - Used via `search_web()` function
146
+ - Updates workflow state with evidence
147
+
148
+ ---
149
+
150
+ ## Can It Be Used?
151
+
152
+ ### WebSearchTool (`src/tools/web_search.py`)
153
+ **Status**: ⚠️ **Can be used, but with limitations**
154
+
155
+ **Can be used:**
156
+ - ✅ In magentic orchestrator (already working)
157
+ - ✅ As standalone tool (functional)
158
+
159
+ **Cannot be used:**
160
+ - ❌ In `SearchHandler` (protocol non-compliance)
161
+ - ❌ In parallel search flows (not registered)
162
+
163
+ **To make fully usable:**
164
+ 1. Fix protocol compliance (add `name`, fix return type)
165
+ 2. Register in `SearchHandler`
166
+ 3. Test integration
167
+
168
+ ### web_search_adapter
169
+ **Status**: ✅ **Can be used**
170
+
171
+ **Can be used:**
172
+ - ✅ Via `tool_executor` for WebSearchAgent tasks
173
+ - ✅ In planner agent for background context
174
+ - ✅ In iterative/deep research flows
175
+
176
+ **Limitations:**
177
+ - Returns string format (not Evidence objects)
178
+ - Requires legacy folder dependencies
179
+ - Different execution path than SearchHandler
180
+
181
+ ---
182
+
183
+ ## Recommendations
184
+
185
+ ### Priority 1: Fix WebSearchTool Protocol Compliance
186
+ Make `WebSearchTool` fully compliant with `SearchTool` protocol:
187
+
188
+ 1. Add `name` property
189
+ 2. Change return type from `SearchResult` to `list[Evidence]`
190
+ 3. Update all callers if needed
191
+
192
+ ### Priority 2: Integrate into SearchHandler
193
+ Add `WebSearchTool` to main search flow:
194
+
195
+ ```python
196
+ from src.tools.web_search import WebSearchTool
197
+
198
+ search_handler = SearchHandler(
199
+ tools=[
200
+ PubMedTool(),
201
+ ClinicalTrialsTool(),
202
+ EuropePMCTool(),
203
+ WebSearchTool() # Add web search
204
+ ],
205
+ )
206
+ ```
207
+
208
+ ### Priority 3: Consolidate Implementations
209
+ Consider consolidating the two implementations:
210
+ - Keep `WebSearchTool` as the main implementation
211
+ - Deprecate or migrate `web_search_adapter` usage
212
+ - Remove dependency on `folder/tools/web_search.py`
213
+
214
+ ### Priority 4: Testing
215
+ Add tests for:
216
+ - Protocol compliance
217
+ - SearchHandler integration
218
+ - Error handling
219
+ - Rate limiting (if needed)
220
+
221
+ ---
222
+
223
+ ## Summary Table
224
+
225
+ | Component | Status | Protocol Compliant | Integrated | Can Be Used |
226
+ |-----------|--------|-------------------|------------|-------------|
227
+ | `WebSearchTool` | ⚠️ Partial | ❌ No | ❌ No | ⚠️ Limited |
228
+ | `web_search_adapter` | ✅ Functional | N/A | ✅ Yes (tool_executor) | ✅ Yes |
229
+
230
+ ---
231
+
232
+ ## Conclusion
233
+
234
+ The web search functionality exists in two forms:
235
+ 1. **`WebSearchTool`** is functional but needs protocol fixes to be fully integrated
236
+ 2. **`web_search_adapter`** is working but uses a different execution path
237
+
238
+ **Recommendation**: Fix `WebSearchTool` protocol compliance and integrate it into `SearchHandler` for unified search capabilities across all orchestrators.
239
+
docs/api/agents.md CHANGED
@@ -268,3 +268,5 @@ def create_input_parser_agent(model: Any | None = None) -> InputParserAgent
268
 
269
 
270
 
 
 
 
268
 
269
 
270
 
271
+
272
+
docs/api/models.md CHANGED
@@ -246,3 +246,5 @@ class BudgetStatus(BaseModel):
246
 
247
 
248
 
 
 
 
246
 
247
 
248
 
249
+
250
+
docs/api/orchestrators.md CHANGED
@@ -193,3 +193,5 @@ Runs Magentic orchestration.
193
 
194
 
195
 
 
 
 
193
 
194
 
195
 
196
+
197
+
docs/api/services.md CHANGED
@@ -199,3 +199,5 @@ Analyzes a hypothesis using statistical methods.
199
 
200
 
201
 
 
 
 
199
 
200
 
201
 
202
+
203
+
docs/api/tools.md CHANGED
@@ -233,3 +233,5 @@ Searches multiple tools in parallel.
233
 
234
 
235
 
 
 
 
233
 
234
 
235
 
236
+
237
+
docs/architecture/agents.md CHANGED
@@ -190,3 +190,5 @@ Factory functions:
190
 
191
 
192
 
 
 
 
190
 
191
 
192
 
193
+
194
+
docs/architecture/middleware.md CHANGED
@@ -140,3 +140,5 @@ All middleware components use `ContextVar` for thread-safe isolation:
140
 
141
 
142
 
 
 
 
140
 
141
 
142
 
143
+
144
+
docs/architecture/services.md CHANGED
@@ -140,3 +140,5 @@ if settings.has_openai_key:
140
 
141
 
142
 
 
 
 
140
 
141
 
142
 
143
+
144
+
docs/architecture/tools.md CHANGED
@@ -173,3 +173,5 @@ search_handler = SearchHandler(
173
 
174
 
175
 
 
 
 
173
 
174
 
175
 
176
+
177
+
docs/contributing/code-quality.md CHANGED
@@ -79,3 +79,5 @@ async def search(self, query: str, max_results: int = 10) -> list[Evidence]:
79
 
80
 
81
 
 
 
 
79
 
80
 
81
 
82
+
83
+
docs/contributing/code-style.md CHANGED
@@ -59,3 +59,5 @@ result = await loop.run_in_executor(None, cpu_bound_function, args)
59
 
60
 
61
 
 
 
 
59
 
60
 
61
 
62
+
63
+
docs/contributing/error-handling.md CHANGED
@@ -67,3 +67,5 @@ except httpx.HTTPError as e:
67
 
68
 
69
 
 
 
 
67
 
68
 
69
 
70
+
71
+
docs/contributing/implementation-patterns.md CHANGED
@@ -82,3 +82,5 @@ def get_embedding_service() -> EmbeddingService:
82
 
83
 
84
 
 
 
 
82
 
83
 
84
 
85
+
86
+
docs/contributing/index.md CHANGED
@@ -161,3 +161,5 @@ Thank you for contributing to DeepCritical!
161
 
162
 
163
 
 
 
 
161
 
162
 
163
 
164
+
165
+
docs/contributing/prompt-engineering.md CHANGED
@@ -67,3 +67,5 @@ This document outlines prompt engineering guidelines and citation validation rul
67
 
68
 
69
 
 
 
 
67
 
68
 
69
 
70
+
71
+
docs/contributing/testing.md CHANGED
@@ -63,3 +63,5 @@ async def test_real_pubmed_search():
63
 
64
 
65
 
 
 
 
63
 
64
 
65
 
66
+
67
+
docs/getting-started/examples.md CHANGED
@@ -207,3 +207,5 @@ USE_GRAPH_EXECUTION=true
207
 
208
 
209
 
 
 
 
207
 
208
 
209
 
210
+
211
+
docs/getting-started/installation.md CHANGED
@@ -146,3 +146,5 @@ uv run pre-commit install
146
 
147
 
148
 
 
 
 
146
 
147
 
148
 
149
+
150
+
docs/getting-started/mcp-integration.md CHANGED
@@ -213,3 +213,5 @@ You can configure multiple DeepCritical instances:
213
 
214
 
215
 
 
 
 
213
 
214
 
215
 
216
+
217
+
docs/getting-started/quick-start.md CHANGED
@@ -117,3 +117,5 @@ What are the active clinical trials investigating Alzheimer's disease treatments
117
 
118
 
119
 
 
 
 
117
 
118
 
119
 
120
+
121
+
docs/implementation/IMPLEMENTATION_SUMMARY.md ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Multimodal Audio & Image Integration - Implementation Summary
2
+
3
+ ## ✅ Completed Implementation
4
+
5
+ ### 1. Configuration System (`src/utils/config.py`)
6
+ - ✅ Added audio configuration fields:
7
+ - `tts_model`, `tts_voice`, `tts_speed`, `tts_gpu`, `tts_timeout`
8
+ - `stt_api_url`, `stt_source_lang`, `stt_target_lang`
9
+ - `enable_audio_input`, `enable_audio_output`
10
+ - ✅ Added image OCR configuration:
11
+ - `ocr_api_url`, `enable_image_input`
12
+ - ✅ Added property methods: `audio_available`, `image_ocr_available`
13
+
14
+ ### 2. STT Service (`src/services/stt_gradio.py`)
15
+ - ✅ Gradio Client integration for nvidia/canary-1b-v2
16
+ - ✅ Supports file and numpy array audio input
17
+ - ✅ Async transcription with error handling
18
+ - ✅ Singleton factory pattern
19
+
20
+ ### 3. TTS Service (`src/services/tts_modal.py`)
21
+ - ✅ **Modal GPU function implementation** following Modal documentation
22
+ - ✅ Kokoro 82M integration via Modal GPU
23
+ - ✅ Module-level function definition with lazy initialization
24
+ - ✅ GPU configuration (T4, A10, A100, L4, L40S)
25
+ - ✅ Async wrapper for TTS synthesis
26
+ - ✅ Error handling and graceful degradation
27
+
28
+ ### 4. Image OCR Service (`src/services/image_ocr.py`)
29
+ - ✅ Gradio Client integration for prithivMLmods/Multimodal-OCR3
30
+ - ✅ Supports image files and PIL/numpy arrays
31
+ - ✅ Text extraction from API results
32
+ - ✅ Singleton factory pattern
33
+
34
+ ### 5. Unified Services
35
+ - ✅ `src/services/audio_processing.py` - Audio service layer
36
+ - ✅ `src/services/multimodal_processing.py` - Multimodal service layer
37
+
38
+ ### 6. ChatInterface Integration (`src/app.py`)
39
+ - ✅ Enabled `multimodal=True` for MultimodalTextbox
40
+ - ✅ Added Audio output component
41
+ - ✅ Integrated STT/TTS/OCR into research flow
42
+ - ✅ Multimodal input processing (text + images + audio)
43
+ - ✅ TTS output generation for final responses
44
+ - ✅ **Configuration UI in Settings Accordion**:
45
+ - Voice dropdown (20+ Kokoro voices)
46
+ - Speed slider (0.5x to 2.0x)
47
+ - GPU dropdown (T4, A10, A100, L4, L40S) - read-only, requires restart
48
+ - Enable audio output checkbox
49
+ - ✅ Configuration values passed from UI to TTS service
50
+
51
+ ### 7. MCP Integration (`src/mcp_tools.py`)
52
+ - ✅ Added `extract_text_from_image` MCP tool
53
+ - ✅ Added `transcribe_audio_file` MCP tool
54
+ - ✅ Enabled MCP server in app launch
55
+
56
+ ### 8. Dependencies (`pyproject.toml`)
57
+ - ✅ Added audio dependencies (gradio-client, soundfile, Pillow)
58
+ - ✅ Added TTS optional dependencies (torch, transformers)
59
+ - ✅ Installed via `uv add --optional`
60
+
61
+ ## 🔧 Modal GPU Implementation Details
62
+
63
+ ### Function Definition Pattern
64
+ The Modal GPU function is defined using Modal's recommended pattern:
65
+
66
+ ```python
67
+ @app.function(
68
+ image=tts_image, # Image with Kokoro dependencies
69
+ gpu="T4", # GPU type from settings.tts_gpu
70
+ timeout=60, # Timeout from settings.tts_timeout
71
+ )
72
+ def kokoro_tts_function(text: str, voice: str, speed: float) -> tuple[int, np.ndarray]:
73
+ """Modal GPU function for Kokoro TTS."""
74
+ from kokoro import KModel, KPipeline
75
+ import torch
76
+
77
+ model = KModel().to("cuda").eval()
78
+ pipeline = KPipeline(lang_code=voice[0])
79
+ pack = pipeline.load_voice(voice)
80
+
81
+ for _, ps, _ in pipeline(text, voice, speed):
82
+ ref_s = pack[len(ps) - 1]
83
+ audio = model(ps, ref_s, speed)
84
+ return (24000, audio.numpy())
85
+ ```
86
+
87
+ ### Key Implementation Points
88
+ 1. **Module-Level Definition**: Function defined inside `_setup_modal_function()` but attached to app instance
89
+ 2. **Lazy Initialization**: Function set up on first use
90
+ 3. **GPU Configuration**: Set at function definition time (requires restart to change)
91
+ 4. **Runtime Parameters**: Voice and speed can be changed at runtime via UI
92
+
93
+ ## 🔗 Configuration Flow
94
+
95
+ ### Settings → Implementation
96
+ 1. `settings.tts_voice` → Default voice (used if UI not configured)
97
+ 2. `settings.tts_speed` → Default speed (used if UI not configured)
98
+ 3. `settings.tts_gpu` → GPU type (set at function definition, requires restart)
99
+ 4. `settings.tts_timeout` → Timeout (set at function definition)
100
+
101
+ ### UI → Implementation
102
+ 1. Voice dropdown → `tts_voice` parameter → `AudioService.generate_audio_output()`
103
+ 2. Speed slider → `tts_speed` parameter → `AudioService.generate_audio_output()`
104
+ 3. GPU dropdown → Informational only (changes require restart)
105
+ 4. Enable checkbox → `settings.enable_audio_output` → Controls TTS generation
106
+
107
+ ### Implementation → Modal
108
+ 1. `TTSService.synthesize_async()` → Calls Modal GPU function
109
+ 2. Modal function executes on GPU → Returns audio tuple
110
+ 3. Audio tuple → Gradio Audio component → User hears response
111
+
112
+ ## 📋 Configuration Points in UI
113
+
114
+ ### Settings Accordion Components
115
+ Located in `src/app.py` lines 667-712:
116
+
117
+ 1. **Voice Dropdown** (`tts_voice_dropdown`)
118
+ - 20+ Kokoro voices
119
+ - Default: `settings.tts_voice`
120
+ - Connected to `research_agent()` function
121
+
122
+ 2. **Speed Slider** (`tts_speed_slider`)
123
+ - Range: 0.5 to 2.0
124
+ - Step: 0.1
125
+ - Default: `settings.tts_speed`
126
+ - Connected to `research_agent()` function
127
+
128
+ 3. **GPU Dropdown** (`tts_gpu_dropdown`)
129
+ - Choices: T4, A10, A100, L4, L40S
130
+ - Default: `settings.tts_gpu or "T4"`
131
+ - Read-only (interactive=False)
132
+ - Note: Changes require app restart
133
+
134
+ 4. **Enable Audio Output** (`enable_audio_output_checkbox`)
135
+ - Default: `settings.enable_audio_output`
136
+ - Controls whether TTS is generated
137
+
138
+ ## 🎯 Usage Flow
139
+
140
+ 1. User opens Settings accordion
141
+ 2. Configures TTS voice and speed (optional)
142
+ 3. Submits query (text, image, or audio)
143
+ 4. Research agent processes query
144
+ 5. Final response generated
145
+ 6. If audio output enabled:
146
+ - `AudioService.generate_audio_output()` called
147
+ - Uses UI-configured voice/speed or settings defaults
148
+ - Modal GPU function synthesizes audio
149
+ - Audio displayed in Audio component
150
+
151
+ ## 📝 Notes
152
+
153
+ - **GPU Changes**: GPU type is set at Modal function definition time. Changes to `settings.tts_gpu` or UI dropdown require app restart.
154
+ - **Voice/Speed Changes**: Can be changed at runtime via UI - no restart required.
155
+ - **Graceful Degradation**: If TTS fails, application continues with text-only response.
156
+ - **Modal Credentials**: Required for TTS. If not configured, TTS service unavailable (graceful fallback).
157
+
158
+ ## ✅ Verification Checklist
159
+
160
+ - [x] Modal GPU function correctly defined with `@app.function` decorator
161
+ - [x] GPU parameter set from `settings.tts_gpu`
162
+ - [x] Timeout parameter set from `settings.tts_timeout`
163
+ - [x] Voice parameter passed from UI dropdown
164
+ - [x] Speed parameter passed from UI slider
165
+ - [x] Configuration UI elements in Settings accordion
166
+ - [x] Configuration values connected to implementation
167
+ - [x] Dependencies installed via uv
168
+ - [x] Error handling and graceful degradation
169
+ - [x] MCP tools added for audio/image processing
170
+
171
+ ## 🚀 Next Steps
172
+
173
+ 1. Test TTS with Modal credentials configured
174
+ 2. Verify GPU function execution on Modal
175
+ 3. Test voice and speed changes at runtime
176
+ 4. Add unit tests for services
177
+ 5. Add integration tests for Modal TTS
178
+
179
+
180
+
docs/implementation/TOKEN_AUTHENTICATION_REVIEW.md ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Token Authentication Review - Gradio & HuggingFace
2
+
3
+ ## Summary
4
+
5
+ This document reviews the implementation of token authentication for Gradio Client API calls and HuggingFace API usage to ensure tokens are always passed correctly.
6
+
7
+ ## ✅ Implementation Status
8
+
9
+ ### 1. Gradio Client Services
10
+
11
+ #### STT Service (`src/services/stt_gradio.py`)
12
+ - ✅ **Token Support**: Service accepts `hf_token` parameter in `__init__` and methods
13
+ - ✅ **Client Initialization**: `Client` is created with `hf_token` parameter when token is available
14
+ - ✅ **Token Priority**: Method-level token > instance-level token
15
+ - ✅ **Token Updates**: Client is recreated if token changes
16
+
17
+ **Implementation Pattern:**
18
+ ```python
19
+ async def _get_client(self, hf_token: str | None = None) -> Client:
20
+ token = hf_token or self.hf_token
21
+ if token:
22
+ self.client = Client(self.api_url, hf_token=token)
23
+ else:
24
+ self.client = Client(self.api_url)
25
+ ```
26
+
27
+ #### Image OCR Service (`src/services/image_ocr.py`)
28
+ - ✅ **Token Support**: Service accepts `hf_token` parameter in `__init__` and methods
29
+ - ✅ **Client Initialization**: `Client` is created with `hf_token` parameter when token is available
30
+ - ✅ **Token Priority**: Method-level token > instance-level token
31
+ - ✅ **Token Updates**: Client is recreated if token changes
32
+
33
+ **Same pattern as STT Service**
34
+
35
+ ### 2. Service Layer Integration
36
+
37
+ #### Audio Service (`src/services/audio_processing.py`)
38
+ - ✅ **Token Passthrough**: `process_audio_input()` accepts `hf_token` and passes to STT service
39
+ - ✅ **Token Flow**: `audio_service.process_audio_input(audio, hf_token=token)`
40
+
41
+ #### Multimodal Service (`src/services/multimodal_processing.py`)
42
+ - ✅ **Token Passthrough**: `process_multimodal_input()` accepts `hf_token` and passes to both audio and OCR services
43
+ - ✅ **Token Flow**: `multimodal_service.process_multimodal_input(..., hf_token=token)`
44
+
45
+ ### 3. Application Layer (`src/app.py`)
46
+
47
+ #### Token Extraction
48
+ - ✅ **OAuth Token**: Extracted from `gr.OAuthToken` via `oauth_token.token`
49
+ - ✅ **Fallback**: Uses `HF_TOKEN` or `HUGGINGFACE_API_KEY` from environment
50
+ - ✅ **Token Priority**: `oauth_token > HF_TOKEN > HUGGINGFACE_API_KEY`
51
+
52
+ **Implementation:**
53
+ ```python
54
+ token_value: str | None = None
55
+ if oauth_token is not None:
56
+ token_value = oauth_token.token if hasattr(oauth_token, "token") else None
57
+
58
+ # Fallback to env vars
59
+ effective_token = token_value or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")
60
+ ```
61
+
62
+ #### Token Usage in Services
63
+ - ✅ **Multimodal Processing**: Token passed to `process_multimodal_input(..., hf_token=token_value)`
64
+ - ✅ **Consistent Usage**: Token is extracted once and passed through all service layers
65
+
66
+ ### 4. HuggingFace API Integration
67
+
68
+ #### LLM Factory (`src/utils/llm_factory.py`)
69
+ - ✅ **Token Priority**: `oauth_token > settings.hf_token > settings.huggingface_api_key`
70
+ - ✅ **Provider Usage**: `HuggingFaceProvider(api_key=effective_hf_token)`
71
+ - ✅ **Model Usage**: `HuggingFaceModel(model_name, provider=provider)`
72
+
73
+ #### Judge Handler (`src/agent_factory/judges.py`)
74
+ - ✅ **Token Priority**: `oauth_token > settings.hf_token > settings.huggingface_api_key`
75
+ - ✅ **InferenceClient**: `InferenceClient(api_key=api_key)` when token provided
76
+ - ✅ **Fallback**: Uses `HF_TOKEN` from environment if no token provided
77
+
78
+ **Implementation:**
79
+ ```python
80
+ effective_hf_token = oauth_token or settings.hf_token or settings.huggingface_api_key
81
+ hf_provider = HuggingFaceProvider(api_key=effective_hf_token)
82
+ ```
83
+
84
+ ### 5. MCP Tools (`src/mcp_tools.py`)
85
+
86
+ #### Image OCR Tool
87
+ - ✅ **Token Support**: `extract_text_from_image()` accepts `hf_token` parameter
88
+ - ✅ **Token Fallback**: Uses `settings.hf_token` or `settings.huggingface_api_key` if not provided
89
+ - ✅ **Service Integration**: Passes token to `ImageOCRService.extract_text()`
90
+
91
+ #### Audio Transcription Tool
92
+ - ✅ **Token Support**: `transcribe_audio_file()` accepts `hf_token` parameter
93
+ - ✅ **Token Fallback**: Uses `settings.hf_token` or `settings.huggingface_api_key` if not provided
94
+ - ✅ **Service Integration**: Passes token to `STTService.transcribe_file()`
95
+
96
+ ## Token Flow Diagram
97
+
98
+ ```
99
+ User Login (OAuth)
100
+
101
+ oauth_token.token
102
+
103
+ app.py: token_value
104
+
105
+ ┌─────────────────────────────────────┐
106
+ │ Service Layer │
107
+ ├─────────────────────────────────────┤
108
+ │ MultimodalService │
109
+ │ ↓ hf_token=token_value │
110
+ │ AudioService │
111
+ │ ↓ hf_token=token_value │
112
+ │ STTService / ImageOCRService │
113
+ │ ↓ hf_token=token_value │
114
+ │ Gradio Client(hf_token=token) │
115
+ └─────────────────────────────────────┘
116
+
117
+ Alternative: Environment Variables
118
+
119
+ HF_TOKEN or HUGGINGFACE_API_KEY
120
+
121
+ settings.hf_token or settings.huggingface_api_key
122
+
123
+ Same service flow as above
124
+ ```
125
+
126
+ ## Verification Checklist
127
+
128
+ - [x] STT Service accepts and uses `hf_token` parameter
129
+ - [x] Image OCR Service accepts and uses `hf_token` parameter
130
+ - [x] Audio Service passes token to STT service
131
+ - [x] Multimodal Service passes token to both audio and OCR services
132
+ - [x] App.py extracts OAuth token correctly
133
+ - [x] App.py passes token to multimodal service
134
+ - [x] HuggingFace API calls use token via `HuggingFaceProvider`
135
+ - [x] HuggingFace API calls use token via `InferenceClient`
136
+ - [x] MCP tools accept and use token parameter
137
+ - [x] Token priority is consistent: OAuth > Env Vars
138
+ - [x] Fallback to environment variables when OAuth not available
139
+
140
+ ## Token Parameter Naming
141
+
142
+ All services consistently use `hf_token` parameter name:
143
+ - `STTService.transcribe_audio(..., hf_token=...)`
144
+ - `STTService.transcribe_file(..., hf_token=...)`
145
+ - `ImageOCRService.extract_text(..., hf_token=...)`
146
+ - `ImageOCRService.extract_text_from_image(..., hf_token=...)`
147
+ - `AudioService.process_audio_input(..., hf_token=...)`
148
+ - `MultimodalService.process_multimodal_input(..., hf_token=...)`
149
+ - `extract_text_from_image(..., hf_token=...)` (MCP tool)
150
+ - `transcribe_audio_file(..., hf_token=...)` (MCP tool)
151
+
152
+ ## Gradio Client API Usage
153
+
154
+ According to Gradio documentation, the `Client` constructor accepts:
155
+ ```python
156
+ Client(space_name, hf_token=None)
157
+ ```
158
+
159
+ Our implementation correctly uses:
160
+ ```python
161
+ Client(self.api_url, hf_token=token) # When token available
162
+ Client(self.api_url) # When no token (public Space)
163
+ ```
164
+
165
+ ## HuggingFace API Usage
166
+
167
+ ### HuggingFaceProvider
168
+ ```python
169
+ HuggingFaceProvider(api_key=effective_hf_token)
170
+ ```
171
+ ✅ Correctly passes token as `api_key` parameter
172
+
173
+ ### InferenceClient
174
+ ```python
175
+ InferenceClient(api_key=api_key) # When token provided
176
+ InferenceClient() # Falls back to HF_TOKEN env var
177
+ ```
178
+ ✅ Correctly passes token as `api_key` parameter
179
+
180
+ ## Edge Cases Handled
181
+
182
+ 1. **No Token Available**: Services work without token (public Gradio Spaces)
183
+ 2. **Token Changes**: Client is recreated when token changes
184
+ 3. **OAuth vs Env**: OAuth token takes priority over environment variables
185
+ 4. **Multiple Token Sources**: Consistent priority across all services
186
+ 5. **MCP Tools**: Support both explicit token and fallback to settings
187
+
188
+ ## Recommendations
189
+
190
+ ✅ **All implementations are correct and consistent**
191
+
192
+ The token authentication is properly implemented throughout:
193
+ - Gradio Client services accept and use tokens
194
+ - Service layer passes tokens through correctly
195
+ - Application layer extracts and passes OAuth tokens
196
+ - HuggingFace API calls use tokens via correct parameters
197
+ - MCP tools support token authentication
198
+ - Token priority is consistent across all layers
199
+
200
+ No changes needed - implementation follows best practices.
201
+
docs/implementation/TTS_MODAL_IMPLEMENTATION.md ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # TTS Modal GPU Implementation
2
+
3
+ ## Overview
4
+
5
+ The TTS (Text-to-Speech) service uses Kokoro 82M model running on Modal's GPU infrastructure. This document describes the implementation details and configuration.
6
+
7
+ ## Implementation Details
8
+
9
+ ### Modal GPU Function Pattern
10
+
11
+ The implementation follows Modal's recommended pattern for GPU functions:
12
+
13
+ 1. **Module-Level Function Definition**: Modal functions must be defined at module level and attached to an app instance
14
+ 2. **Lazy Initialization**: The function is set up on first use via `_setup_modal_function()`
15
+ 3. **GPU Configuration**: GPU type is set at function definition time (requires app restart to change)
16
+
17
+ ### Key Files
18
+
19
+ - `src/services/tts_modal.py` - Modal GPU executor for Kokoro TTS
20
+ - `src/services/audio_processing.py` - Unified audio service wrapper
21
+ - `src/utils/config.py` - Configuration settings
22
+ - `src/app.py` - UI integration with settings accordion
23
+
24
+ ### Configuration Options
25
+
26
+ All TTS configuration is available in `src/utils/config.py`:
27
+
28
+ ```python
29
+ tts_model: str = "hexgrad/Kokoro-82M" # Model ID
30
+ tts_voice: str = "af_heart" # Voice ID
31
+ tts_speed: float = 1.0 # Speed multiplier (0.5-2.0)
32
+ tts_gpu: str = "T4" # GPU type (T4, A10, A100, etc.)
33
+ tts_timeout: int = 60 # Timeout in seconds
34
+ enable_audio_output: bool = True # Enable/disable TTS
35
+ ```
36
+
37
+ ### UI Configuration
38
+
39
+ TTS settings are available in the Settings accordion:
40
+
41
+ - **Voice Dropdown**: Select from 20+ Kokoro voices (af_heart, af_bella, am_michael, etc.)
42
+ - **Speed Slider**: Adjust speech speed (0.5x to 2.0x)
43
+ - **GPU Dropdown**: Select GPU type (T4, A10, A100, L4, L40S) - visible only if Modal credentials configured
44
+ - **Enable Audio Output**: Toggle TTS generation
45
+
46
+ ### Modal Function Implementation
47
+
48
+ The Modal GPU function is defined as:
49
+
50
+ ```python
51
+ @app.function(
52
+ image=tts_image, # Image with Kokoro dependencies
53
+ gpu="T4", # GPU type (from settings.tts_gpu)
54
+ timeout=60, # Timeout (from settings.tts_timeout)
55
+ )
56
+ def kokoro_tts_function(text: str, voice: str, speed: float) -> tuple[int, np.ndarray]:
57
+ """Modal GPU function for Kokoro TTS."""
58
+ from kokoro import KModel, KPipeline
59
+ import torch
60
+
61
+ model = KModel().to("cuda").eval()
62
+ pipeline = KPipeline(lang_code=voice[0])
63
+ pack = pipeline.load_voice(voice)
64
+
65
+ for _, ps, _ in pipeline(text, voice, speed):
66
+ ref_s = pack[len(ps) - 1]
67
+ audio = model(ps, ref_s, speed)
68
+ return (24000, audio.numpy())
69
+ ```
70
+
71
+ ### Usage Flow
72
+
73
+ 1. User submits query with audio output enabled
74
+ 2. Research agent processes query and generates text response
75
+ 3. `AudioService.generate_audio_output()` is called with:
76
+ - Response text
77
+ - Voice (from UI dropdown or settings default)
78
+ - Speed (from UI slider or settings default)
79
+ 4. `TTSService.synthesize_async()` calls Modal GPU function
80
+ 5. Modal executes Kokoro TTS on GPU
81
+ 6. Audio tuple `(sample_rate, audio_array)` is returned
82
+ 7. Audio is displayed in Gradio Audio component
83
+
84
+ ### Dependencies
85
+
86
+ Installed via `uv add --optional`:
87
+ - `gradio-client>=1.0.0` - For STT/OCR API calls
88
+ - `soundfile>=0.12.0` - For audio file I/O
89
+ - `Pillow>=10.0.0` - For image processing
90
+
91
+ Kokoro is installed in Modal image from source:
92
+ - `git+https://github.com/hexgrad/kokoro.git`
93
+
94
+ ### GPU Types
95
+
96
+ Modal supports various GPU types:
97
+ - **T4**: Cheapest, good for testing (default)
98
+ - **A10**: Good balance of cost/performance
99
+ - **A100**: Fastest, most expensive
100
+ - **L4**: NVIDIA L4 GPU
101
+ - **L40S**: NVIDIA L40S GPU
102
+
103
+ **Note**: GPU type is set at function definition time. Changes to `settings.tts_gpu` require app restart.
104
+
105
+ ### Error Handling
106
+
107
+ - If Modal credentials not configured: TTS service unavailable (graceful degradation)
108
+ - If Kokoro import fails: ConfigurationError raised
109
+ - If synthesis fails: Returns None, logs warning, continues without audio
110
+ - If GPU unavailable: Modal will queue or fail with clear error message
111
+
112
+ ### Configuration Connection
113
+
114
+ 1. **Settings → Implementation**: `settings.tts_voice`, `settings.tts_speed` used as defaults
115
+ 2. **UI → Implementation**: UI dropdowns/sliders passed to `research_agent()` function
116
+ 3. **Implementation → Modal**: Voice and speed passed to Modal GPU function
117
+ 4. **GPU Configuration**: Set at function definition time (requires restart to change)
118
+
119
+ ### Testing
120
+
121
+ To test TTS:
122
+ 1. Ensure Modal credentials configured (`MODAL_TOKEN_ID`, `MODAL_TOKEN_SECRET`)
123
+ 2. Enable audio output in settings
124
+ 3. Submit a query
125
+ 4. Check audio output component for generated speech
126
+
127
+ ### References
128
+
129
+ - [Kokoro TTS Space](https://huggingface.co/spaces/hexgrad/Kokoro-TTS) - Reference implementation
130
+ - [Modal GPU Documentation](https://modal.com/docs/guide/gpu) - Modal GPU usage
131
+ - [Kokoro GitHub](https://github.com/hexgrad/kokoro) - Source code
132
+
133
+
134
+
docs/license.md CHANGED
@@ -37,3 +37,5 @@ SOFTWARE.
37
 
38
 
39
 
 
 
 
37
 
38
 
39
 
40
+
41
+
docs/overview/architecture.md CHANGED
@@ -194,3 +194,5 @@ The system supports complex research workflows through:
194
 
195
 
196
 
 
 
 
194
 
195
 
196
 
197
+
198
+
docs/overview/features.md CHANGED
@@ -146,3 +146,5 @@ DeepCritical provides a comprehensive set of features for AI-assisted research:
146
 
147
 
148
 
 
 
 
146
 
147
 
148
 
149
+
150
+
docs/team.md CHANGED
@@ -42,3 +42,5 @@ We welcome contributions! See the [Contributing Guide](contributing/index.md) fo
42
 
43
 
44
 
 
 
 
42
 
43
 
44
 
45
+
46
+
new_env.txt ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================
2
+ # DeepCritical - New Environment Variables
3
+ # ============================================
4
+ # Add these to your .env file for multimodal audio/image support
5
+ # ============================================
6
+
7
+ # ============================================
8
+ # Audio Processing Configuration (TTS)
9
+ # ============================================
10
+ # Kokoro TTS Model Configuration
11
+ TTS_MODEL=hexgrad/Kokoro-82M
12
+ TTS_VOICE=af_heart
13
+ TTS_SPEED=1.0
14
+ TTS_GPU=T4
15
+ TTS_TIMEOUT=60
16
+
17
+ # Available TTS Voices:
18
+ # American English Female: af_heart, af_bella, af_nicole, af_aoede, af_kore, af_sarah, af_nova, af_sky, af_alloy, af_jessica, af_river
19
+ # American English Male: am_michael, am_fenrir, am_puck, am_echo, am_eric, am_liam, am_onyx, am_santa, am_adam
20
+
21
+ # Available GPU Types (Modal):
22
+ # T4 - Cheapest, good for testing (default)
23
+ # A10 - Good balance of cost/performance
24
+ # A100 - Fastest, most expensive
25
+ # L4 - NVIDIA L4 GPU
26
+ # L40S - NVIDIA L40S GPU
27
+ # Note: GPU type is set at function definition time. Changes require app restart.
28
+
29
+ # ============================================
30
+ # Audio Processing Configuration (STT)
31
+ # ============================================
32
+ # Speech-to-Text API Configuration
33
+ STT_API_URL=nvidia/canary-1b-v2
34
+ STT_SOURCE_LANG=English
35
+ STT_TARGET_LANG=English
36
+
37
+ # Available STT Languages:
38
+ # English, Bulgarian, Croatian, Czech, Danish, Dutch, Estonian, Finnish, French, German, Greek, Hungarian, Italian, Latvian, Lithuanian, Maltese, Polish, Portuguese, Romanian, Slovak, Slovenian, Spanish, Swedish, Russian, Ukrainian
39
+
40
+ # ============================================
41
+ # Audio Feature Flags
42
+ # ============================================
43
+ ENABLE_AUDIO_INPUT=true
44
+ ENABLE_AUDIO_OUTPUT=true
45
+
46
+ # ============================================
47
+ # Image OCR Configuration
48
+ # ============================================
49
+ OCR_API_URL=prithivMLmods/Multimodal-OCR3
50
+ ENABLE_IMAGE_INPUT=true
51
+
52
+ # ============================================
53
+ # Modal Configuration (Required for TTS)
54
+ # ============================================
55
+ # Modal credentials are required for TTS (Text-to-Speech) functionality
56
+ # Get your credentials from: https://modal.com/
57
+ MODAL_TOKEN_ID=your_modal_token_id_here
58
+ MODAL_TOKEN_SECRET=your_modal_token_secret_here
59
+
60
+ # ============================================
61
+ # Existing Environment Variables (for reference)
62
+ # ============================================
63
+ # These are already documented elsewhere, but included for completeness:
64
+
65
+ # LLM API Keys (for research agent)
66
+ # OPENAI_API_KEY=your_openai_key
67
+ # ANTHROPIC_API_KEY=your_anthropic_key
68
+ # HF_TOKEN=your_huggingface_token
69
+ # HUGGINGFACE_API_KEY=your_huggingface_key
70
+
71
+ # Embedding Configuration
72
+ # OPENAI_EMBEDDING_MODEL=text-embedding-3-small
73
+ # LOCAL_EMBEDDING_MODEL=all-MiniLM-L6-v2
74
+ # EMBEDDING_PROVIDER=local
75
+
76
+ # Search Configuration
77
+ # WEB_SEARCH_PROVIDER=duckduckgo
78
+ # SERPER_API_KEY=your_serper_key
79
+ # BRAVE_API_KEY=your_brave_key
80
+ # TAVILY_API_KEY=your_tavily_key
81
+
82
+ # PubMed Configuration
83
+ # NCBI_API_KEY=your_ncbi_key
84
+
85
+ # ============================================
86
+ # Usage Instructions
87
+ # ============================================
88
+ # 1. Copy the variables you need to your .env file
89
+ # 2. Replace placeholder values (your_modal_token_id_here, etc.) with actual credentials
90
+ # 3. For TTS to work, you MUST configure MODAL_TOKEN_ID and MODAL_TOKEN_SECRET
91
+ # 4. STT and OCR work without additional API keys (use public Gradio Spaces)
92
+ # 5. GPU type changes require app restart to take effect
93
+ # 6. Voice and speed can be changed at runtime via UI Settings accordion
94
+
95
+
96
+
pyproject.toml CHANGED
@@ -5,21 +5,16 @@ description = "AI-Native Drug Repurposing Research Agent"
5
  readme = "README.md"
6
  requires-python = ">=3.11"
7
  dependencies = [
8
- # Core
9
  "pydantic>=2.7",
10
  "pydantic-settings>=2.2", # For BaseSettings (config)
11
  "pydantic-ai>=0.0.16", # Agent framework
12
- # AI Providers
13
  "openai>=1.0.0",
14
  "anthropic>=0.18.0",
15
- # HTTP & Parsing
16
  "httpx>=0.27", # Async HTTP client (PubMed)
17
  "beautifulsoup4>=4.12", # HTML parsing
18
  "xmltodict>=0.13", # PubMed XML -> dict
19
  "huggingface-hub>=0.20.0", # Hugging Face Inference API
20
- # UI
21
  "gradio[mcp]>=6.0.0", # Chat interface with MCP server support (6.0 required for css in launch())
22
- # Utils
23
  "python-dotenv>=1.0", # .env loading
24
  "tenacity>=8.2", # Retry logic
25
  "structlog>=24.1", # Structured logging
@@ -31,6 +26,15 @@ dependencies = [
31
  "llama-index-llms-huggingface-api>=0.6.1",
32
  "llama-index-vector-stores-chroma>=0.5.3",
33
  "llama-index>=0.14.8",
 
 
 
 
 
 
 
 
 
34
  ]
35
 
36
  [project.optional-dependencies]
@@ -66,7 +70,7 @@ embeddings = [
66
  ]
67
  modal = [
68
  # Mario's Modal code execution + LlamaIndex RAG
69
- "modal>=0.63.0",
70
  "llama-index>=0.11.0",
71
  "llama-index-llms-openai",
72
  "llama-index-embeddings-openai",
 
5
  readme = "README.md"
6
  requires-python = ">=3.11"
7
  dependencies = [
 
8
  "pydantic>=2.7",
9
  "pydantic-settings>=2.2", # For BaseSettings (config)
10
  "pydantic-ai>=0.0.16", # Agent framework
 
11
  "openai>=1.0.0",
12
  "anthropic>=0.18.0",
 
13
  "httpx>=0.27", # Async HTTP client (PubMed)
14
  "beautifulsoup4>=4.12", # HTML parsing
15
  "xmltodict>=0.13", # PubMed XML -> dict
16
  "huggingface-hub>=0.20.0", # Hugging Face Inference API
 
17
  "gradio[mcp]>=6.0.0", # Chat interface with MCP server support (6.0 required for css in launch())
 
18
  "python-dotenv>=1.0", # .env loading
19
  "tenacity>=8.2", # Retry logic
20
  "structlog>=24.1", # Structured logging
 
26
  "llama-index-llms-huggingface-api>=0.6.1",
27
  "llama-index-vector-stores-chroma>=0.5.3",
28
  "llama-index>=0.14.8",
29
+ # Audio/Image processing
30
+ "gradio-client>=1.0.0", # For STT/OCR API calls
31
+ "soundfile>=0.12.0", # For audio file I/O
32
+ "pillow>=10.0.0", # For image processing
33
+ # TTS dependencies (for Modal GPU TTS)
34
+ "torch>=2.0.0", # Required by Kokoro TTS
35
+ "transformers>=4.30.0", # Required by Kokoro TTS
36
+ "modal>=0.63.0", # Required for TTS GPU execution
37
+ # Note: Kokoro is installed in Modal image from: git+https://github.com/hexgrad/kokoro.git
38
  ]
39
 
40
  [project.optional-dependencies]
 
70
  ]
71
  modal = [
72
  # Mario's Modal code execution + LlamaIndex RAG
73
+ # Note: modal>=0.63.0 is now in main dependencies for TTS support
74
  "llama-index>=0.11.0",
75
  "llama-index-llms-openai",
76
  "llama-index-embeddings-openai",
src/agent_factory/judges.py CHANGED
@@ -26,22 +26,28 @@ from src.utils.models import AssessmentDetails, Evidence, JudgeAssessment
26
  logger = structlog.get_logger()
27
 
28
 
29
- def get_model() -> Any:
30
  """Get the LLM model based on configuration.
31
 
32
  Explicitly passes API keys from settings to avoid requiring
33
  users to export environment variables manually.
 
 
 
34
  """
35
  llm_provider = settings.llm_provider
36
 
 
 
 
37
  if llm_provider == "anthropic":
38
  provider = AnthropicProvider(api_key=settings.anthropic_api_key)
39
  return AnthropicModel(settings.anthropic_model, provider=provider)
40
 
41
  if llm_provider == "huggingface":
42
- # Free tier - uses HF_TOKEN from environment if available
43
  model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
44
- hf_provider = HuggingFaceProvider(api_key=settings.hf_token)
45
  return HuggingFaceModel(model_name, provider=hf_provider)
46
 
47
  if llm_provider == "openai":
@@ -53,7 +59,7 @@ def get_model() -> Any:
53
  logger.warning("Unknown LLM provider, defaulting to HuggingFace", provider=llm_provider)
54
 
55
  model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
56
- hf_provider = HuggingFaceProvider(api_key=settings.hf_token)
57
  return HuggingFaceModel(model_name, provider=hf_provider)
58
 
59
 
@@ -176,16 +182,19 @@ class HFInferenceJudgeHandler:
176
  "HuggingFaceH4/zephyr-7b-beta", # Fallback (Ungated)
177
  ]
178
 
179
- def __init__(self, model_id: str | None = None) -> None:
 
 
180
  """
181
  Initialize with HF Inference client.
182
 
183
  Args:
184
  model_id: Optional specific model ID. If None, uses FALLBACK_MODELS chain.
 
185
  """
186
  self.model_id = model_id
187
- # Will automatically use HF_TOKEN from env if available
188
- self.client = InferenceClient()
189
  self.call_count = 0
190
  self.last_question: str | None = None
191
  self.last_evidence: list[Evidence] | None = None
 
26
  logger = structlog.get_logger()
27
 
28
 
29
+ def get_model(oauth_token: str | None = None) -> Any:
30
  """Get the LLM model based on configuration.
31
 
32
  Explicitly passes API keys from settings to avoid requiring
33
  users to export environment variables manually.
34
+
35
+ Args:
36
+ oauth_token: Optional OAuth token from HuggingFace login (takes priority over env vars)
37
  """
38
  llm_provider = settings.llm_provider
39
 
40
+ # Priority: oauth_token > env vars
41
+ effective_hf_token = oauth_token or settings.hf_token or settings.huggingface_api_key
42
+
43
  if llm_provider == "anthropic":
44
  provider = AnthropicProvider(api_key=settings.anthropic_api_key)
45
  return AnthropicModel(settings.anthropic_model, provider=provider)
46
 
47
  if llm_provider == "huggingface":
48
+ # Free tier - uses OAuth token or HF_TOKEN from environment if available
49
  model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
50
+ hf_provider = HuggingFaceProvider(api_key=effective_hf_token)
51
  return HuggingFaceModel(model_name, provider=hf_provider)
52
 
53
  if llm_provider == "openai":
 
59
  logger.warning("Unknown LLM provider, defaulting to HuggingFace", provider=llm_provider)
60
 
61
  model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
62
+ hf_provider = HuggingFaceProvider(api_key=effective_hf_token)
63
  return HuggingFaceModel(model_name, provider=hf_provider)
64
 
65
 
 
182
  "HuggingFaceH4/zephyr-7b-beta", # Fallback (Ungated)
183
  ]
184
 
185
+ def __init__(
186
+ self, model_id: str | None = None, api_key: str | None = None
187
+ ) -> None:
188
  """
189
  Initialize with HF Inference client.
190
 
191
  Args:
192
  model_id: Optional specific model ID. If None, uses FALLBACK_MODELS chain.
193
+ api_key: Optional HuggingFace API key/token. If None, uses HF_TOKEN from env.
194
  """
195
  self.model_id = model_id
196
+ # Pass api_key to InferenceClient if provided, otherwise it will use HF_TOKEN from env
197
+ self.client = InferenceClient(api_key=api_key) if api_key else InferenceClient()
198
  self.call_count = 0
199
  self.last_question: str | None = None
200
  self.last_evidence: list[Evidence] | None = None
src/app.py CHANGED
@@ -5,6 +5,8 @@ from collections.abc import AsyncGenerator
5
  from typing import Any
6
 
7
  import gradio as gr
 
 
8
 
9
  # Try to import HuggingFace support (may not be available in all pydantic-ai versions)
10
  # According to https://ai.pydantic.dev/models/huggingface/, HuggingFace support requires
@@ -26,6 +28,8 @@ except ImportError:
26
 
27
  from src.agent_factory.judges import HFInferenceJudgeHandler, JudgeHandler, MockJudgeHandler
28
  from src.orchestrator_factory import create_orchestrator
 
 
29
  from src.tools.clinicaltrials import ClinicalTrialsTool
30
  from src.tools.europepmc import EuropePMCTool
31
  from src.tools.pubmed import PubMedTool
@@ -40,16 +44,20 @@ def configure_orchestrator(
40
  oauth_token: str | None = None,
41
  hf_model: str | None = None,
42
  hf_provider: str | None = None,
 
 
43
  ) -> tuple[Any, str]:
44
  """
45
  Create an orchestrator instance.
46
 
47
  Args:
48
  use_mock: If True, use MockJudgeHandler (no API key needed)
49
- mode: Orchestrator mode ("simple" or "advanced")
50
  oauth_token: Optional OAuth token from HuggingFace login
51
  hf_model: Selected HuggingFace model ID
52
  hf_provider: Selected inference provider
 
 
53
 
54
  Returns:
55
  Tuple of (Orchestrator instance, backend_name)
@@ -60,10 +68,14 @@ def configure_orchestrator(
60
  max_results_per_tool=10,
61
  )
62
 
63
- # Create search tools
 
64
  search_handler = SearchHandler(
65
  tools=[PubMedTool(), ClinicalTrialsTool(), EuropePMCTool()],
66
  timeout=config.search_timeout,
 
 
 
67
  )
68
 
69
  # Create judge (mock, real, or free tier)
@@ -109,22 +121,30 @@ def configure_orchestrator(
109
  # 3. Free Tier (HuggingFace Inference) - NO API KEY AVAILABLE
110
  else:
111
  # No API key available - use HFInferenceJudgeHandler with public models
112
- # Don't use third-party providers (novita, groq, etc.) as they require their own API keys
113
- # Use HuggingFace's own inference API with public/ungated models
114
- # Pass empty provider to use HuggingFace's default (not third-party providers)
115
  judge_handler = HFInferenceJudgeHandler(
116
- model_id=hf_model,
117
- api_key=None, # No API key - will use public models only
118
- provider=None, # Don't specify provider - use HuggingFace's default
119
  )
120
  model_display = hf_model.split("/")[-1] if hf_model else "Default (Public Models)"
121
  backend_info = f"Free Tier ({model_display} - Public Models Only)"
122
 
 
 
 
 
 
 
 
 
 
123
  orchestrator = create_orchestrator(
124
  search_handler=search_handler,
125
  judge_handler=judge_handler,
126
  config=config,
127
- mode=mode, # type: ignore
128
  )
129
 
130
  return orchestrator, backend_info
@@ -405,19 +425,23 @@ async def handle_orchestrator_events(
405
 
406
 
407
  async def research_agent(
408
- message: str,
409
  history: list[dict[str, Any]],
410
  mode: str = "simple",
411
  hf_model: str | None = None,
412
  hf_provider: str | None = None,
 
 
 
 
413
  oauth_token: gr.OAuthToken | None = None,
414
  oauth_profile: gr.OAuthProfile | None = None,
415
- ) -> AsyncGenerator[dict[str, Any] | list[dict[str, Any]], None]:
416
  """
417
  Gradio chat function that runs the research agent.
418
 
419
  Args:
420
- message: User's research question
421
  history: Chat history (Gradio format)
422
  mode: Orchestrator mode ("simple" or "advanced")
423
  hf_model: Selected HuggingFace model ID (from dropdown)
@@ -426,8 +450,12 @@ async def research_agent(
426
  oauth_profile: Gradio OAuth profile (None if user not logged in)
427
 
428
  Yields:
429
- ChatMessage objects with metadata for accordion display
430
  """
 
 
 
 
431
  # REQUIRE LOGIN BEFORE USE
432
  # Extract OAuth token and username using Gradio's OAuth types
433
  # According to Gradio docs: OAuthToken and OAuthProfile are None if user not logged in
@@ -465,14 +493,37 @@ async def research_agent(
465
  "before using this application.\n\n"
466
  "The login button is required to access the AI models and research tools."
467
  ),
468
- }
469
  return
470
 
471
- if not message.strip():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
472
  yield {
473
  "role": "assistant",
474
- "content": "Please enter a research question.",
475
- }
476
  return
477
 
478
  # Check available keys (use token_value instead of oauth_token)
@@ -501,6 +552,8 @@ async def research_agent(
501
  oauth_token=token_value, # Use extracted token value
502
  hf_model=model_id, # None will use defaults in configure_orchestrator
503
  hf_provider=provider_name, # None will use defaults in configure_orchestrator
 
 
504
  )
505
 
506
  yield {
@@ -508,9 +561,41 @@ async def research_agent(
508
  "content": f"🧠 **Backend**: {backend_name}\n\n",
509
  }
510
 
511
- # Handle orchestrator events
512
- async for msg in handle_orchestrator_events(orchestrator, message):
513
- yield msg
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
514
 
515
  except Exception as e:
516
  # Return error message without metadata to avoid issues during example caching
@@ -521,7 +606,7 @@ async def research_agent(
521
  yield {
522
  "role": "assistant",
523
  "content": f"Error: {error_msg}. Please check your configuration and try again.",
524
- }
525
 
526
 
527
  def create_demo() -> gr.Blocks:
@@ -566,6 +651,72 @@ def create_demo() -> gr.Blocks:
566
  ),
567
  )
568
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
569
  # Hidden text components for model/provider (not dropdowns to avoid value mismatch)
570
  # These will be empty by default and use defaults in configure_orchestrator
571
  with gr.Row(visible=False):
@@ -581,11 +732,18 @@ def create_demo() -> gr.Blocks:
581
  visible=False, # Hidden from UI
582
  )
583
 
584
- # Chat interface with model/provider selection
 
 
 
 
 
 
585
  # Examples are provided but will NOT run at startup (cache_examples=False)
586
  # Users must log in first before using examples or submitting queries
587
  gr.ChatInterface(
588
  fn=research_agent,
 
589
  title="🧬 DeepCritical",
590
  description=(
591
  "*AI-Powered Drug Repurposing Agent — searches PubMed, "
@@ -593,6 +751,7 @@ def create_demo() -> gr.Blocks:
593
  "---\n"
594
  "*Research tool only — not for medical advice.* \n"
595
  "**MCP Server Active**: Connect Claude Desktop to `/gradio_api/mcp/`\n\n"
 
596
  "**⚠️ Authentication Required**: Please **sign in with HuggingFace** above before using this application."
597
  ),
598
  examples=[
@@ -606,18 +765,24 @@ def create_demo() -> gr.Blocks:
606
  "simple",
607
  "Qwen/Qwen3-Next-80B-A3B-Thinking",
608
  "",
 
 
609
  ],
610
  [
611
  "Is metformin effective for treating cancer? Investigate mechanism of action.",
612
  "iterative",
613
  "Qwen/Qwen3-235B-A22B-Instruct-2507",
614
  "",
 
 
615
  ],
616
  [
617
  "Create a comprehensive report on Long COVID treatments including clinical trials, mechanisms, and safety.",
618
  "deep",
619
  "zai-org/GLM-4.5-Air",
620
  "nebius",
 
 
621
  ],
622
  ],
623
  cache_examples=False, # CRITICAL: Disable example caching to prevent examples from running at startup
@@ -627,9 +792,14 @@ def create_demo() -> gr.Blocks:
627
  mode_radio,
628
  hf_model_dropdown,
629
  hf_provider_dropdown,
 
 
 
 
630
  # Note: gr.OAuthToken and gr.OAuthProfile are automatically passed as function parameters
631
  # when user is logged in - they should NOT be added to additional_inputs
632
  ],
 
633
  )
634
 
635
  return demo # type: ignore[no-any-return]
@@ -642,7 +812,7 @@ def main() -> None:
642
  # server_name="0.0.0.0",
643
  # server_port=7860,
644
  # share=False,
645
- mcp_server=False,
646
  ssr_mode=False, # Fix for intermittent loading/hydration issues in HF Spaces
647
  )
648
 
 
5
  from typing import Any
6
 
7
  import gradio as gr
8
+ import numpy as np
9
+ from gradio.components.multimodal_textbox import MultimodalPostprocess
10
 
11
  # Try to import HuggingFace support (may not be available in all pydantic-ai versions)
12
  # According to https://ai.pydantic.dev/models/huggingface/, HuggingFace support requires
 
28
 
29
  from src.agent_factory.judges import HFInferenceJudgeHandler, JudgeHandler, MockJudgeHandler
30
  from src.orchestrator_factory import create_orchestrator
31
+ from src.services.audio_processing import get_audio_service
32
+ from src.services.multimodal_processing import get_multimodal_service
33
  from src.tools.clinicaltrials import ClinicalTrialsTool
34
  from src.tools.europepmc import EuropePMCTool
35
  from src.tools.pubmed import PubMedTool
 
44
  oauth_token: str | None = None,
45
  hf_model: str | None = None,
46
  hf_provider: str | None = None,
47
+ graph_mode: str | None = None,
48
+ use_graph: bool = True,
49
  ) -> tuple[Any, str]:
50
  """
51
  Create an orchestrator instance.
52
 
53
  Args:
54
  use_mock: If True, use MockJudgeHandler (no API key needed)
55
+ mode: Orchestrator mode ("simple", "advanced", "iterative", "deep", or "auto")
56
  oauth_token: Optional OAuth token from HuggingFace login
57
  hf_model: Selected HuggingFace model ID
58
  hf_provider: Selected inference provider
59
+ graph_mode: Graph research mode ("iterative", "deep", or "auto") - used when mode is graph-based
60
+ use_graph: Whether to use graph execution (True) or agent chains (False)
61
 
62
  Returns:
63
  Tuple of (Orchestrator instance, backend_name)
 
68
  max_results_per_tool=10,
69
  )
70
 
71
+ # Create search tools with RAG enabled
72
+ # Pass OAuth token to SearchHandler so it can be used by RAG service
73
  search_handler = SearchHandler(
74
  tools=[PubMedTool(), ClinicalTrialsTool(), EuropePMCTool()],
75
  timeout=config.search_timeout,
76
+ include_rag=True,
77
+ auto_ingest_to_rag=True,
78
+ oauth_token=oauth_token,
79
  )
80
 
81
  # Create judge (mock, real, or free tier)
 
121
  # 3. Free Tier (HuggingFace Inference) - NO API KEY AVAILABLE
122
  else:
123
  # No API key available - use HFInferenceJudgeHandler with public models
124
+ # HFInferenceJudgeHandler will use HF_TOKEN from env if available, otherwise public models
125
+ # Note: OAuth token should have been caught in effective_api_key check above
126
+ # If we reach here, we truly have no API key, so use public models
127
  judge_handler = HFInferenceJudgeHandler(
128
+ model_id=hf_model if hf_model else None,
129
+ api_key=None, # Will use HF_TOKEN from env if available, otherwise public models
 
130
  )
131
  model_display = hf_model.split("/")[-1] if hf_model else "Default (Public Models)"
132
  backend_info = f"Free Tier ({model_display} - Public Models Only)"
133
 
134
+ # Determine effective mode
135
+ # If mode is already iterative/deep/auto, use it directly
136
+ # If mode is "graph" or "simple", use graph_mode if provided
137
+ effective_mode = mode
138
+ if mode in ("graph", "simple") and graph_mode:
139
+ effective_mode = graph_mode
140
+ elif mode == "graph" and not graph_mode:
141
+ effective_mode = "auto" # Default to auto if graph mode but no graph_mode specified
142
+
143
  orchestrator = create_orchestrator(
144
  search_handler=search_handler,
145
  judge_handler=judge_handler,
146
  config=config,
147
+ mode=effective_mode, # type: ignore
148
  )
149
 
150
  return orchestrator, backend_info
 
425
 
426
 
427
  async def research_agent(
428
+ message: str | MultimodalPostprocess,
429
  history: list[dict[str, Any]],
430
  mode: str = "simple",
431
  hf_model: str | None = None,
432
  hf_provider: str | None = None,
433
+ graph_mode: str = "auto",
434
+ use_graph: bool = True,
435
+ tts_voice: str = "af_heart",
436
+ tts_speed: float = 1.0,
437
  oauth_token: gr.OAuthToken | None = None,
438
  oauth_profile: gr.OAuthProfile | None = None,
439
+ ) -> AsyncGenerator[dict[str, Any] | tuple[dict[str, Any], tuple[int, np.ndarray] | None], None]:
440
  """
441
  Gradio chat function that runs the research agent.
442
 
443
  Args:
444
+ message: User's research question (str or MultimodalPostprocess with text/files)
445
  history: Chat history (Gradio format)
446
  mode: Orchestrator mode ("simple" or "advanced")
447
  hf_model: Selected HuggingFace model ID (from dropdown)
 
450
  oauth_profile: Gradio OAuth profile (None if user not logged in)
451
 
452
  Yields:
453
+ ChatMessage objects with metadata for accordion display, optionally with audio output
454
  """
455
+ import structlog
456
+
457
+ logger = structlog.get_logger()
458
+
459
  # REQUIRE LOGIN BEFORE USE
460
  # Extract OAuth token and username using Gradio's OAuth types
461
  # According to Gradio docs: OAuthToken and OAuthProfile are None if user not logged in
 
493
  "before using this application.\n\n"
494
  "The login button is required to access the AI models and research tools."
495
  ),
496
+ }, None
497
  return
498
 
499
+ # Process multimodal input (text + images + audio)
500
+ processed_text = ""
501
+ audio_input_data: tuple[int, np.ndarray] | None = None
502
+
503
+ if isinstance(message, dict):
504
+ # MultimodalPostprocess format: {"text": str, "files": list[FileData]}
505
+ processed_text = message.get("text", "") or ""
506
+ files = message.get("files", [])
507
+
508
+ # Process multimodal input (images, audio files)
509
+ if files and settings.enable_image_input:
510
+ try:
511
+ multimodal_service = get_multimodal_service()
512
+ processed_text = await multimodal_service.process_multimodal_input(
513
+ processed_text, files=files, hf_token=token_value
514
+ )
515
+ except Exception as e:
516
+ logger.warning("multimodal_processing_failed", error=str(e))
517
+ # Continue with text-only input
518
+ else:
519
+ # Plain string message
520
+ processed_text = str(message) if message else ""
521
+
522
+ if not processed_text.strip():
523
  yield {
524
  "role": "assistant",
525
+ "content": "Please enter a research question or provide an image/audio input.",
526
+ }, None
527
  return
528
 
529
  # Check available keys (use token_value instead of oauth_token)
 
552
  oauth_token=token_value, # Use extracted token value
553
  hf_model=model_id, # None will use defaults in configure_orchestrator
554
  hf_provider=provider_name, # None will use defaults in configure_orchestrator
555
+ graph_mode=graph_mode if graph_mode else None,
556
+ use_graph=use_graph,
557
  )
558
 
559
  yield {
 
561
  "content": f"🧠 **Backend**: {backend_name}\n\n",
562
  }
563
 
564
+ # Handle orchestrator events and generate audio output
565
+ audio_output_data: tuple[int, np.ndarray] | None = None
566
+ final_message = ""
567
+
568
+ async for msg in handle_orchestrator_events(orchestrator, processed_text):
569
+ # Track final message for TTS
570
+ if isinstance(msg, dict) and msg.get("role") == "assistant":
571
+ content = msg.get("content", "")
572
+ metadata = msg.get("metadata", {})
573
+ # This is the main response (not an accordion) if no title in metadata
574
+ if content and not metadata.get("title"):
575
+ final_message = content
576
+
577
+ # Yield without audio for intermediate messages
578
+ yield msg, None
579
+
580
+ # Generate audio output for final response
581
+ if final_message and settings.enable_audio_output:
582
+ try:
583
+ audio_service = get_audio_service()
584
+ # Use UI-configured voice and speed, fallback to settings defaults
585
+ audio_output_data = await audio_service.generate_audio_output(
586
+ final_message,
587
+ voice=tts_voice or settings.tts_voice,
588
+ speed=tts_speed if tts_speed else settings.tts_speed,
589
+ )
590
+ except Exception as e:
591
+ logger.warning("audio_synthesis_failed", error=str(e))
592
+ # Continue without audio output
593
+
594
+ # If we have audio output, we need to yield it with the final message
595
+ # Note: The final message was already yielded above, so we yield None, audio_output_data
596
+ # This will update the audio output component
597
+ if audio_output_data is not None:
598
+ yield None, audio_output_data
599
 
600
  except Exception as e:
601
  # Return error message without metadata to avoid issues during example caching
 
606
  yield {
607
  "role": "assistant",
608
  "content": f"Error: {error_msg}. Please check your configuration and try again.",
609
+ }, None
610
 
611
 
612
  def create_demo() -> gr.Blocks:
 
651
  ),
652
  )
653
 
654
+ # Graph mode selection
655
+ graph_mode_radio = gr.Radio(
656
+ choices=["iterative", "deep", "auto"],
657
+ value="auto",
658
+ label="Graph Research Mode",
659
+ info="Iterative: Single loop | Deep: Parallel sections | Auto: Detect from query",
660
+ )
661
+
662
+ # Graph execution toggle
663
+ use_graph_checkbox = gr.Checkbox(
664
+ value=True,
665
+ label="Use Graph Execution",
666
+ info="Enable graph-based workflow execution",
667
+ )
668
+
669
+ # TTS Configuration (in Settings accordion)
670
+ with gr.Accordion("🎤 Audio Settings", open=False, visible=settings.enable_audio_output):
671
+ tts_voice_dropdown = gr.Dropdown(
672
+ choices=[
673
+ "af_heart",
674
+ "af_bella",
675
+ "af_nicole",
676
+ "af_aoede",
677
+ "af_kore",
678
+ "af_sarah",
679
+ "af_nova",
680
+ "af_sky",
681
+ "af_alloy",
682
+ "af_jessica",
683
+ "af_river",
684
+ "am_michael",
685
+ "am_fenrir",
686
+ "am_puck",
687
+ "am_echo",
688
+ "am_eric",
689
+ "am_liam",
690
+ "am_onyx",
691
+ "am_santa",
692
+ "am_adam",
693
+ ],
694
+ value=settings.tts_voice,
695
+ label="Voice",
696
+ info="Select TTS voice (American English voices: af_*, am_*)",
697
+ )
698
+ tts_speed_slider = gr.Slider(
699
+ minimum=0.5,
700
+ maximum=2.0,
701
+ value=settings.tts_speed,
702
+ step=0.1,
703
+ label="Speech Speed",
704
+ info="Adjust TTS speech speed (0.5x to 2.0x)",
705
+ )
706
+ tts_gpu_dropdown = gr.Dropdown(
707
+ choices=["T4", "A10", "A100", "L4", "L40S"],
708
+ value=settings.tts_gpu or "T4",
709
+ label="GPU Type",
710
+ info="Modal GPU type for TTS (T4 is cheapest, A100 is fastest). Note: GPU changes require app restart.",
711
+ visible=settings.modal_available,
712
+ interactive=False, # GPU type set at function definition time, requires restart
713
+ )
714
+ enable_audio_output_checkbox = gr.Checkbox(
715
+ value=settings.enable_audio_output,
716
+ label="Enable Audio Output",
717
+ info="Generate audio responses using TTS",
718
+ )
719
+
720
  # Hidden text components for model/provider (not dropdowns to avoid value mismatch)
721
  # These will be empty by default and use defaults in configure_orchestrator
722
  with gr.Row(visible=False):
 
732
  visible=False, # Hidden from UI
733
  )
734
 
735
+ # Audio output component (for TTS response)
736
+ audio_output = gr.Audio(
737
+ label="🔊 Audio Response",
738
+ visible=settings.enable_audio_output,
739
+ )
740
+
741
+ # Chat interface with multimodal support
742
  # Examples are provided but will NOT run at startup (cache_examples=False)
743
  # Users must log in first before using examples or submitting queries
744
  gr.ChatInterface(
745
  fn=research_agent,
746
+ multimodal=True, # Enable multimodal input (text + images + audio)
747
  title="🧬 DeepCritical",
748
  description=(
749
  "*AI-Powered Drug Repurposing Agent — searches PubMed, "
 
751
  "---\n"
752
  "*Research tool only — not for medical advice.* \n"
753
  "**MCP Server Active**: Connect Claude Desktop to `/gradio_api/mcp/`\n\n"
754
+ "**🎤 Multimodal Support**: Upload images (OCR), record audio (STT), or type text.\n\n"
755
  "**⚠️ Authentication Required**: Please **sign in with HuggingFace** above before using this application."
756
  ),
757
  examples=[
 
765
  "simple",
766
  "Qwen/Qwen3-Next-80B-A3B-Thinking",
767
  "",
768
+ "auto",
769
+ True,
770
  ],
771
  [
772
  "Is metformin effective for treating cancer? Investigate mechanism of action.",
773
  "iterative",
774
  "Qwen/Qwen3-235B-A22B-Instruct-2507",
775
  "",
776
+ "iterative",
777
+ True,
778
  ],
779
  [
780
  "Create a comprehensive report on Long COVID treatments including clinical trials, mechanisms, and safety.",
781
  "deep",
782
  "zai-org/GLM-4.5-Air",
783
  "nebius",
784
+ "deep",
785
+ True,
786
  ],
787
  ],
788
  cache_examples=False, # CRITICAL: Disable example caching to prevent examples from running at startup
 
792
  mode_radio,
793
  hf_model_dropdown,
794
  hf_provider_dropdown,
795
+ graph_mode_radio,
796
+ use_graph_checkbox,
797
+ tts_voice_dropdown,
798
+ tts_speed_slider,
799
  # Note: gr.OAuthToken and gr.OAuthProfile are automatically passed as function parameters
800
  # when user is logged in - they should NOT be added to additional_inputs
801
  ],
802
+ additional_outputs=[audio_output], # Add audio output for TTS
803
  )
804
 
805
  return demo # type: ignore[no-any-return]
 
812
  # server_name="0.0.0.0",
813
  # server_port=7860,
814
  # share=False,
815
+ mcp_server=True, # Enable MCP server for Claude Desktop integration
816
  ssr_mode=False, # Fix for intermittent loading/hydration issues in HF Spaces
817
  )
818
 
src/mcp_tools.py CHANGED
@@ -223,3 +223,81 @@ async def analyze_hypothesis(
223
 
224
  **Executed in Modal Sandbox** - Isolated, secure, reproducible.
225
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
 
224
  **Executed in Modal Sandbox** - Isolated, secure, reproducible.
225
  """
226
+
227
+
228
+ async def extract_text_from_image(
229
+ image_path: str, model: str | None = None, hf_token: str | None = None
230
+ ) -> str:
231
+ """Extract text from an image using OCR.
232
+
233
+ Uses the Multimodal-OCR3 Gradio Space to extract text from images.
234
+ Supports various image formats (PNG, JPG, etc.) and can extract text
235
+ from scanned documents, screenshots, and other image types.
236
+
237
+ Args:
238
+ image_path: Path to image file
239
+ model: Optional model selection (default: None, uses API default)
240
+
241
+ Returns:
242
+ Extracted text from the image
243
+ """
244
+ from src.services.image_ocr import get_image_ocr_service
245
+
246
+ from src.utils.config import settings
247
+
248
+ try:
249
+ ocr_service = get_image_ocr_service()
250
+ # Use provided token or fallback to env vars
251
+ token = hf_token or settings.hf_token or settings.huggingface_api_key
252
+ extracted_text = await ocr_service.extract_text(image_path, model=model, hf_token=token)
253
+
254
+ if not extracted_text:
255
+ return f"No text found in image: {image_path}"
256
+
257
+ return f"## Extracted Text from Image\n\n{extracted_text}"
258
+
259
+ except Exception as e:
260
+ return f"Error extracting text from image: {e}"
261
+
262
+
263
+ async def transcribe_audio_file(
264
+ audio_path: str,
265
+ source_lang: str | None = None,
266
+ target_lang: str | None = None,
267
+ hf_token: str | None = None,
268
+ ) -> str:
269
+ """Transcribe audio file to text using speech-to-text.
270
+
271
+ Uses the NVIDIA Canary Gradio Space to transcribe audio files.
272
+ Supports various audio formats (WAV, MP3, etc.) and multiple languages.
273
+
274
+ Args:
275
+ audio_path: Path to audio file
276
+ source_lang: Source language (default: "English")
277
+ target_lang: Target language (default: "English")
278
+
279
+ Returns:
280
+ Transcribed text from the audio file
281
+ """
282
+ from src.services.stt_gradio import get_stt_service
283
+
284
+ from src.utils.config import settings
285
+
286
+ try:
287
+ stt_service = get_stt_service()
288
+ # Use provided token or fallback to env vars
289
+ token = hf_token or settings.hf_token or settings.huggingface_api_key
290
+ transcribed_text = await stt_service.transcribe_file(
291
+ audio_path,
292
+ source_lang=source_lang,
293
+ target_lang=target_lang,
294
+ hf_token=token,
295
+ )
296
+
297
+ if not transcribed_text:
298
+ return f"No transcription found in audio: {audio_path}"
299
+
300
+ return f"## Audio Transcription\n\n{transcribed_text}"
301
+
302
+ except Exception as e:
303
+ return f"Error transcribing audio: {e}"
src/middleware/state_machine.py CHANGED
@@ -131,3 +131,5 @@ def get_workflow_state() -> WorkflowState:
131
 
132
 
133
 
 
 
 
131
 
132
 
133
 
134
+
135
+
src/orchestrator/graph_orchestrator.py CHANGED
@@ -28,6 +28,7 @@ from src.agent_factory.graph_builder import (
28
  create_deep_graph,
29
  create_iterative_graph,
30
  )
 
31
  from src.middleware.budget_tracker import BudgetTracker
32
  from src.middleware.state_machine import WorkflowState, init_workflow_state
33
  from src.orchestrator.research_flow import DeepResearchFlow, IterativeResearchFlow
@@ -121,6 +122,8 @@ class GraphOrchestrator:
121
  max_iterations: int = 5,
122
  max_time_minutes: int = 10,
123
  use_graph: bool = True,
 
 
124
  ) -> None:
125
  """
126
  Initialize graph orchestrator.
@@ -130,11 +133,15 @@ class GraphOrchestrator:
130
  max_iterations: Maximum iterations per loop
131
  max_time_minutes: Maximum time per loop
132
  use_graph: Whether to use graph execution (True) or agent chains (False)
 
 
133
  """
134
  self.mode = mode
135
  self.max_iterations = max_iterations
136
  self.max_time_minutes = max_time_minutes
137
  self.use_graph = use_graph
 
 
138
  self.logger = logger
139
 
140
  # Initialize flows (for backward compatibility)
@@ -248,6 +255,7 @@ class GraphOrchestrator:
248
  self._iterative_flow = IterativeResearchFlow(
249
  max_iterations=self.max_iterations,
250
  max_time_minutes=self.max_time_minutes,
 
251
  )
252
 
253
  try:
@@ -278,6 +286,8 @@ class GraphOrchestrator:
278
  )
279
 
280
  if self._deep_flow is None:
 
 
281
  self._deep_flow = DeepResearchFlow(
282
  max_iterations=self.max_iterations,
283
  max_time_minutes=self.max_time_minutes,
@@ -640,6 +650,34 @@ class GraphOrchestrator:
640
  tokens = result.usage.total_tokens if hasattr(result.usage, "total_tokens") else 0
641
  context.budget_tracker.add_tokens("graph_execution", tokens)
642
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
643
  return output
644
 
645
  async def _execute_state_node(
@@ -650,6 +688,7 @@ class GraphOrchestrator:
650
  Special handling for deep research state nodes:
651
  - "store_plan": Stores ReportPlan in context for parallel loops
652
  - "collect_drafts": Stores section drafts in context for synthesizer
 
653
 
654
  Args:
655
  node: The state node
@@ -659,6 +698,58 @@ class GraphOrchestrator:
659
  Returns:
660
  State update result
661
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
662
  # Get previous result for state update
663
  # For "store_plan", get from planner node
664
  # For "collect_drafts", get from parallel_loops node
@@ -797,8 +888,10 @@ class GraphOrchestrator:
797
  sections=len(report_plan.report_outline),
798
  )
799
 
800
- # Create judge handler for iterative flows
801
- judge_handler = create_judge_handler()
 
 
802
 
803
  # Create and execute iterative research flows for each section
804
  async def run_section_research(section_index: int) -> str:
@@ -812,7 +905,7 @@ class GraphOrchestrator:
812
  max_time_minutes=self.max_time_minutes,
813
  verbose=False, # Less verbose in parallel execution
814
  use_graph=False, # Use agent chains for section research
815
- judge_handler=judge_handler,
816
  )
817
 
818
  # Run research for this section
@@ -953,6 +1046,8 @@ def create_graph_orchestrator(
953
  max_iterations: int = 5,
954
  max_time_minutes: int = 10,
955
  use_graph: bool = True,
 
 
956
  ) -> GraphOrchestrator:
957
  """
958
  Factory function to create a graph orchestrator.
@@ -962,6 +1057,8 @@ def create_graph_orchestrator(
962
  max_iterations: Maximum iterations per loop
963
  max_time_minutes: Maximum time per loop
964
  use_graph: Whether to use graph execution (True) or agent chains (False)
 
 
965
 
966
  Returns:
967
  Configured GraphOrchestrator instance
@@ -971,4 +1068,6 @@ def create_graph_orchestrator(
971
  max_iterations=max_iterations,
972
  max_time_minutes=max_time_minutes,
973
  use_graph=use_graph,
 
 
974
  )
 
28
  create_deep_graph,
29
  create_iterative_graph,
30
  )
31
+ from src.legacy_orchestrator import JudgeHandlerProtocol, SearchHandlerProtocol
32
  from src.middleware.budget_tracker import BudgetTracker
33
  from src.middleware.state_machine import WorkflowState, init_workflow_state
34
  from src.orchestrator.research_flow import DeepResearchFlow, IterativeResearchFlow
 
122
  max_iterations: int = 5,
123
  max_time_minutes: int = 10,
124
  use_graph: bool = True,
125
+ search_handler: SearchHandlerProtocol | None = None,
126
+ judge_handler: JudgeHandlerProtocol | None = None,
127
  ) -> None:
128
  """
129
  Initialize graph orchestrator.
 
133
  max_iterations: Maximum iterations per loop
134
  max_time_minutes: Maximum time per loop
135
  use_graph: Whether to use graph execution (True) or agent chains (False)
136
+ search_handler: Optional search handler for tool execution
137
+ judge_handler: Optional judge handler for evidence assessment
138
  """
139
  self.mode = mode
140
  self.max_iterations = max_iterations
141
  self.max_time_minutes = max_time_minutes
142
  self.use_graph = use_graph
143
+ self.search_handler = search_handler
144
+ self.judge_handler = judge_handler
145
  self.logger = logger
146
 
147
  # Initialize flows (for backward compatibility)
 
255
  self._iterative_flow = IterativeResearchFlow(
256
  max_iterations=self.max_iterations,
257
  max_time_minutes=self.max_time_minutes,
258
+ judge_handler=self.judge_handler,
259
  )
260
 
261
  try:
 
286
  )
287
 
288
  if self._deep_flow is None:
289
+ # DeepResearchFlow creates its own judge_handler internally
290
+ # The judge_handler is passed to IterativeResearchFlow in parallel loops
291
  self._deep_flow = DeepResearchFlow(
292
  max_iterations=self.max_iterations,
293
  max_time_minutes=self.max_time_minutes,
 
650
  tokens = result.usage.total_tokens if hasattr(result.usage, "total_tokens") else 0
651
  context.budget_tracker.add_tokens("graph_execution", tokens)
652
 
653
+ # Special handling for knowledge_gap node: optionally call judge_handler
654
+ if node.node_id == "knowledge_gap" and self.judge_handler:
655
+ # Get evidence from workflow state
656
+ evidence = context.state.evidence
657
+ if evidence:
658
+ try:
659
+ from src.utils.models import JudgeAssessment
660
+
661
+ # Call judge handler to assess evidence
662
+ judge_assessment: JudgeAssessment = await self.judge_handler.assess(
663
+ question=query, evidence=evidence
664
+ )
665
+ # Store assessment in context for decision node to use
666
+ context.set_node_result("judge_assessment", judge_assessment)
667
+ self.logger.info(
668
+ "Judge assessment completed",
669
+ sufficient=judge_assessment.sufficient,
670
+ confidence=judge_assessment.confidence,
671
+ recommendation=judge_assessment.recommendation,
672
+ )
673
+ except Exception as e:
674
+ self.logger.warning(
675
+ "Judge handler assessment failed",
676
+ error=str(e),
677
+ node_id=node.node_id,
678
+ )
679
+ # Continue without judge assessment
680
+
681
  return output
682
 
683
  async def _execute_state_node(
 
688
  Special handling for deep research state nodes:
689
  - "store_plan": Stores ReportPlan in context for parallel loops
690
  - "collect_drafts": Stores section drafts in context for synthesizer
691
+ - "execute_tools": Executes search using search_handler
692
 
693
  Args:
694
  node: The state node
 
698
  Returns:
699
  State update result
700
  """
701
+ # Special handling for execute_tools node
702
+ if node.node_id == "execute_tools":
703
+ # Get AgentSelectionPlan from tool_selector node result
704
+ tool_selector_result = context.get_node_result("tool_selector")
705
+ from src.utils.models import AgentSelectionPlan, SearchResult
706
+
707
+ # Extract query from context or use original query
708
+ search_query = query
709
+ if tool_selector_result and isinstance(tool_selector_result, AgentSelectionPlan):
710
+ # Use the gap or query from the selection plan
711
+ if tool_selector_result.tasks:
712
+ # Use the first task's query if available
713
+ first_task = tool_selector_result.tasks[0]
714
+ if hasattr(first_task, "query") and first_task.query:
715
+ search_query = first_task.query
716
+ elif hasattr(first_task, "tool_input") and isinstance(
717
+ first_task.tool_input, str
718
+ ):
719
+ search_query = first_task.tool_input
720
+
721
+ # Execute search using search_handler
722
+ if self.search_handler:
723
+ try:
724
+ search_result: SearchResult = await self.search_handler.execute(
725
+ query=search_query, max_results_per_tool=10
726
+ )
727
+ # Add evidence to workflow state (add_evidence expects a list)
728
+ context.state.add_evidence(search_result.evidence)
729
+ # Store evidence list in context for next nodes
730
+ context.set_node_result(node.node_id, search_result.evidence)
731
+ self.logger.info(
732
+ "Tools executed via search_handler",
733
+ query=search_query[:100],
734
+ evidence_count=len(search_result.evidence),
735
+ )
736
+ return search_result.evidence
737
+ except Exception as e:
738
+ self.logger.error(
739
+ "Search handler execution failed",
740
+ error=str(e),
741
+ query=search_query[:100],
742
+ )
743
+ # Return empty list on error to allow graph to continue
744
+ return []
745
+ else:
746
+ # Fallback: log warning and return empty list
747
+ self.logger.warning(
748
+ "Search handler not available for execute_tools node",
749
+ node_id=node.node_id,
750
+ )
751
+ return []
752
+
753
  # Get previous result for state update
754
  # For "store_plan", get from planner node
755
  # For "collect_drafts", get from parallel_loops node
 
888
  sections=len(report_plan.report_outline),
889
  )
890
 
891
+ # Use judge handler from GraphOrchestrator if available, otherwise create new one
892
+ judge_handler = self.judge_handler
893
+ if judge_handler is None:
894
+ judge_handler = create_judge_handler()
895
 
896
  # Create and execute iterative research flows for each section
897
  async def run_section_research(section_index: int) -> str:
 
905
  max_time_minutes=self.max_time_minutes,
906
  verbose=False, # Less verbose in parallel execution
907
  use_graph=False, # Use agent chains for section research
908
+ judge_handler=self.judge_handler or judge_handler,
909
  )
910
 
911
  # Run research for this section
 
1046
  max_iterations: int = 5,
1047
  max_time_minutes: int = 10,
1048
  use_graph: bool = True,
1049
+ search_handler: SearchHandlerProtocol | None = None,
1050
+ judge_handler: JudgeHandlerProtocol | None = None,
1051
  ) -> GraphOrchestrator:
1052
  """
1053
  Factory function to create a graph orchestrator.
 
1057
  max_iterations: Maximum iterations per loop
1058
  max_time_minutes: Maximum time per loop
1059
  use_graph: Whether to use graph execution (True) or agent chains (False)
1060
+ search_handler: Optional search handler for tool execution
1061
+ judge_handler: Optional judge handler for evidence assessment
1062
 
1063
  Returns:
1064
  Configured GraphOrchestrator instance
 
1068
  max_iterations=max_iterations,
1069
  max_time_minutes=max_time_minutes,
1070
  use_graph=use_graph,
1071
+ search_handler=search_handler,
1072
+ judge_handler=judge_handler,
1073
  )
src/orchestrator_factory.py CHANGED
@@ -81,6 +81,8 @@ def create_orchestrator(
81
  max_iterations=config.max_iterations if config else 5,
82
  max_time_minutes=10,
83
  use_graph=True,
 
 
84
  )
85
 
86
  # Simple mode requires handlers
 
81
  max_iterations=config.max_iterations if config else 5,
82
  max_time_minutes=10,
83
  use_graph=True,
84
+ search_handler=search_handler,
85
+ judge_handler=judge_handler,
86
  )
87
 
88
  # Simple mode requires handlers
src/services/audio_processing.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unified audio processing service for STT and TTS integration."""
2
+
3
+ from functools import lru_cache
4
+ from typing import Any
5
+
6
+ import numpy as np
7
+ import structlog
8
+
9
+ from src.services.stt_gradio import STTService, get_stt_service
10
+ from src.utils.config import settings
11
+ from src.utils.exceptions import ConfigurationError
12
+
13
+ logger = structlog.get_logger(__name__)
14
+
15
+ # Type stub for TTS service (will be imported when available)
16
+ try:
17
+ from src.services.tts_modal import TTSService, get_tts_service
18
+
19
+ _TTS_AVAILABLE = True
20
+ except ImportError:
21
+ _TTS_AVAILABLE = False
22
+ TTSService = None # type: ignore[assignment, misc]
23
+ get_tts_service = None # type: ignore[assignment, misc]
24
+
25
+
26
+ class AudioService:
27
+ """Unified audio processing service."""
28
+
29
+ def __init__(
30
+ self,
31
+ stt_service: STTService | None = None,
32
+ tts_service: Any | None = None,
33
+ ) -> None:
34
+ """Initialize audio service with STT and TTS.
35
+
36
+ Args:
37
+ stt_service: STT service instance (default: get_stt_service())
38
+ tts_service: TTS service instance (default: get_tts_service() if available)
39
+ """
40
+ self.stt = stt_service or get_stt_service()
41
+
42
+ # TTS is optional (requires Modal)
43
+ if tts_service is not None:
44
+ self.tts = tts_service
45
+ elif _TTS_AVAILABLE and settings.modal_available:
46
+ try:
47
+ self.tts = get_tts_service() # type: ignore[misc]
48
+ except Exception as e:
49
+ logger.warning("tts_service_unavailable", error=str(e))
50
+ self.tts = None
51
+ else:
52
+ self.tts = None
53
+
54
+ async def process_audio_input(
55
+ self,
56
+ audio_input: tuple[int, np.ndarray] | None,
57
+ hf_token: str | None = None,
58
+ ) -> str | None:
59
+ """Process audio input and return transcribed text.
60
+
61
+ Args:
62
+ audio_input: Tuple of (sample_rate, audio_array) or None
63
+ hf_token: HuggingFace token for authenticated Gradio Spaces
64
+
65
+ Returns:
66
+ Transcribed text string or None if no audio input
67
+ """
68
+ if audio_input is None:
69
+ return None
70
+
71
+ try:
72
+ transcribed_text = await self.stt.transcribe_audio(audio_input, hf_token=hf_token)
73
+ logger.info("audio_input_processed", text_length=len(transcribed_text))
74
+ return transcribed_text
75
+ except Exception as e:
76
+ logger.error("audio_input_processing_failed", error=str(e))
77
+ # Return None on failure (graceful degradation)
78
+ return None
79
+
80
+ async def generate_audio_output(
81
+ self,
82
+ text: str,
83
+ voice: str | None = None,
84
+ speed: float | None = None,
85
+ ) -> tuple[int, np.ndarray] | None:
86
+ """Generate audio output from text.
87
+
88
+ Args:
89
+ text: Text to synthesize
90
+ voice: Voice ID (default: settings.tts_voice)
91
+ speed: Speech speed (default: settings.tts_speed)
92
+
93
+ Returns:
94
+ Tuple of (sample_rate, audio_array) or None if TTS unavailable
95
+ """
96
+ if self.tts is None:
97
+ logger.warning("tts_unavailable", message="TTS service not available")
98
+ return None
99
+
100
+ if not text or not text.strip():
101
+ logger.warning("empty_text_for_tts")
102
+ return None
103
+
104
+ try:
105
+ # Use provided voice/speed or fallback to settings defaults
106
+ voice = voice if voice else settings.tts_voice
107
+ speed = speed if speed is not None else settings.tts_speed
108
+
109
+ audio_output = await self.tts.synthesize_async(text, voice, speed) # type: ignore[misc]
110
+
111
+ if audio_output:
112
+ logger.info(
113
+ "audio_output_generated",
114
+ text_length=len(text),
115
+ sample_rate=audio_output[0],
116
+ )
117
+
118
+ return audio_output
119
+
120
+ except Exception as e:
121
+ logger.error("audio_output_generation_failed", error=str(e))
122
+ # Return None on failure (graceful degradation)
123
+ return None
124
+
125
+
126
+ @lru_cache(maxsize=1)
127
+ def get_audio_service() -> AudioService:
128
+ """Get or create singleton audio service instance.
129
+
130
+ Returns:
131
+ AudioService instance
132
+ """
133
+ return AudioService()
134
+
src/services/image_ocr.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Image-to-text service using Gradio Client API (Multimodal-OCR3)."""
2
+
3
+ import asyncio
4
+ import tempfile
5
+ from functools import lru_cache
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ import numpy as np
10
+ import structlog
11
+ from gradio_client import Client, handle_file
12
+ from PIL import Image
13
+
14
+ from src.utils.config import settings
15
+ from src.utils.exceptions import ConfigurationError
16
+
17
+ logger = structlog.get_logger(__name__)
18
+
19
+
20
+ class ImageOCRService:
21
+ """Image OCR service using prithivMLmods/Multimodal-OCR3 Gradio Space."""
22
+
23
+ def __init__(self, api_url: str | None = None, hf_token: str | None = None) -> None:
24
+ """Initialize Image OCR service.
25
+
26
+ Args:
27
+ api_url: Gradio Space URL (default: settings.ocr_api_url)
28
+ hf_token: HuggingFace token for authenticated Spaces (default: None)
29
+
30
+ Raises:
31
+ ConfigurationError: If API URL not configured
32
+ """
33
+ self.api_url = api_url or settings.ocr_api_url
34
+ if not self.api_url:
35
+ raise ConfigurationError("OCR API URL not configured")
36
+ self.hf_token = hf_token
37
+ self.client: Client | None = None
38
+
39
+ async def _get_client(self, hf_token: str | None = None) -> Client:
40
+ """Get or create Gradio Client (lazy initialization).
41
+
42
+ Args:
43
+ hf_token: HuggingFace token for authenticated Spaces (overrides instance token)
44
+
45
+ Returns:
46
+ Gradio Client instance
47
+ """
48
+ # Use provided token or instance token
49
+ token = hf_token or self.hf_token
50
+
51
+ # If client exists but token changed, recreate it
52
+ if self.client is not None and token != self.hf_token:
53
+ self.client = None
54
+
55
+ if self.client is None:
56
+ loop = asyncio.get_running_loop()
57
+ # Pass token to Client for authenticated Spaces
58
+ if token:
59
+ self.client = await loop.run_in_executor(
60
+ None,
61
+ lambda: Client(self.api_url, hf_token=token),
62
+ )
63
+ else:
64
+ self.client = await loop.run_in_executor(
65
+ None,
66
+ lambda: Client(self.api_url),
67
+ )
68
+ # Update instance token for future use
69
+ self.hf_token = token
70
+ return self.client
71
+
72
+ async def extract_text(
73
+ self,
74
+ image_path: str,
75
+ model: str | None = None,
76
+ hf_token: str | None = None,
77
+ ) -> str:
78
+ """Extract text from image using Gradio API.
79
+
80
+ Args:
81
+ image_path: Path to image file
82
+ model: Optional model selection (default: None, uses API default)
83
+
84
+ Returns:
85
+ Extracted text string
86
+
87
+ Raises:
88
+ ConfigurationError: If OCR extraction fails
89
+ """
90
+ client = await self._get_client(hf_token=hf_token)
91
+
92
+ logger.info(
93
+ "extracting_text_from_image",
94
+ image_path=image_path,
95
+ model=model,
96
+ )
97
+
98
+ try:
99
+ # Call /Multimodal_OCR3_generate_image API endpoint
100
+ # According to the MCP tool description, this yields raw text and Markdown-formatted text
101
+ loop = asyncio.get_running_loop()
102
+
103
+ # The API might require file upload first, then call the generate function
104
+ # For now, we'll use handle_file to upload and pass the path
105
+ result = await loop.run_in_executor(
106
+ None,
107
+ lambda: client.predict(
108
+ image_path=handle_file(image_path),
109
+ api_name="/Multimodal_OCR3_generate_image",
110
+ ),
111
+ )
112
+
113
+ # Extract text from result
114
+ extracted_text = self._extract_text_from_result(result)
115
+
116
+ logger.info(
117
+ "image_ocr_complete",
118
+ text_length=len(extracted_text),
119
+ )
120
+
121
+ return extracted_text
122
+
123
+ except Exception as e:
124
+ logger.error("image_ocr_failed", error=str(e), error_type=type(e).__name__)
125
+ raise ConfigurationError(f"Image OCR failed: {e}") from e
126
+
127
+ async def extract_text_from_image(
128
+ self,
129
+ image_data: np.ndarray | Image.Image | str,
130
+ hf_token: str | None = None,
131
+ ) -> str:
132
+ """Extract text from image data (numpy array, PIL Image, or file path).
133
+
134
+ Args:
135
+ image_data: Image as numpy array, PIL Image, or file path string
136
+
137
+ Returns:
138
+ Extracted text string
139
+ """
140
+ # Handle different input types
141
+ if isinstance(image_data, str):
142
+ # Assume it's a file path
143
+ image_path = image_data
144
+ elif isinstance(image_data, Image.Image):
145
+ # Save PIL Image to temp file
146
+ image_path = self._save_image_temp(image_data)
147
+ elif isinstance(image_data, np.ndarray):
148
+ # Convert numpy array to PIL Image, then save
149
+ pil_image = Image.fromarray(image_data)
150
+ image_path = self._save_image_temp(pil_image)
151
+ else:
152
+ raise ValueError(f"Unsupported image data type: {type(image_data)}")
153
+
154
+ try:
155
+ # Extract text from the image file
156
+ extracted_text = await self.extract_text(image_path, hf_token=hf_token)
157
+ return extracted_text
158
+ finally:
159
+ # Clean up temp file if we created it
160
+ if image_path != image_data or not isinstance(image_data, str):
161
+ try:
162
+ Path(image_path).unlink(missing_ok=True)
163
+ except Exception as e:
164
+ logger.warning("failed_to_cleanup_temp_file", path=image_path, error=str(e))
165
+
166
+ def _extract_text_from_result(self, api_result: Any) -> str:
167
+ """Extract text from API result.
168
+
169
+ Args:
170
+ api_result: Result from Gradio API
171
+
172
+ Returns:
173
+ Extracted text string
174
+ """
175
+ # The API yields raw text and Markdown-formatted text
176
+ # Result might be a string, tuple, or generator
177
+ if isinstance(api_result, str):
178
+ return api_result.strip()
179
+
180
+ if isinstance(api_result, tuple):
181
+ # Try to extract text from tuple
182
+ for item in api_result:
183
+ if isinstance(item, str):
184
+ return item.strip()
185
+ # Check if it's a dict with text fields
186
+ if isinstance(item, dict):
187
+ if "text" in item:
188
+ return str(item["text"]).strip()
189
+ if "content" in item:
190
+ return str(item["content"]).strip()
191
+
192
+ # If result is a generator or async generator, we'd need to iterate
193
+ # For now, convert to string representation
194
+ if api_result is not None:
195
+ text = str(api_result).strip()
196
+ if text and text != "None":
197
+ return text
198
+
199
+ logger.warning("could_not_extract_text_from_result", result_type=type(api_result).__name__)
200
+ return ""
201
+
202
+ def _save_image_temp(self, image: Image.Image) -> str:
203
+ """Save PIL Image to temporary file.
204
+
205
+ Args:
206
+ image: PIL Image object
207
+
208
+ Returns:
209
+ Path to temporary image file
210
+ """
211
+ # Create temp file
212
+ temp_file = tempfile.NamedTemporaryFile(
213
+ suffix=".png",
214
+ delete=False,
215
+ )
216
+ temp_path = temp_file.name
217
+ temp_file.close()
218
+
219
+ try:
220
+ # Save image as PNG
221
+ image.save(temp_path, "PNG")
222
+
223
+ logger.debug("saved_image_temp", path=temp_path, size=image.size)
224
+
225
+ return temp_path
226
+
227
+ except Exception as e:
228
+ logger.error("failed_to_save_image_temp", error=str(e))
229
+ raise ConfigurationError(f"Failed to save image to temp file: {e}") from e
230
+
231
+
232
+ @lru_cache(maxsize=1)
233
+ def get_image_ocr_service() -> ImageOCRService:
234
+ """Get or create singleton Image OCR service instance.
235
+
236
+ Returns:
237
+ ImageOCRService instance
238
+ """
239
+ return ImageOCRService()
240
+
241
+
242
+
src/services/llamaindex_rag.py CHANGED
@@ -40,6 +40,7 @@ class LlamaIndexRAGService:
40
  similarity_top_k: int = 5,
41
  use_openai_embeddings: bool | None = None,
42
  use_in_memory: bool = False,
 
43
  ) -> None:
44
  """
45
  Initialize LlamaIndex RAG service.
@@ -51,6 +52,7 @@ class LlamaIndexRAGService:
51
  similarity_top_k: Number of top results to retrieve
52
  use_openai_embeddings: Force OpenAI embeddings (None = auto-detect)
53
  use_in_memory: Use in-memory ChromaDB client (useful for tests)
 
54
  """
55
  # Import dependencies and store references
56
  deps = self._import_dependencies()
@@ -71,6 +73,7 @@ class LlamaIndexRAGService:
71
  self.persist_dir = persist_dir or settings.chroma_db_path
72
  self.similarity_top_k = similarity_top_k
73
  self.use_in_memory = use_in_memory
 
74
 
75
  # Configure embeddings and LLM
76
  use_openai = use_openai_embeddings if use_openai_embeddings is not None else False
@@ -201,9 +204,15 @@ class LlamaIndexRAGService:
201
 
202
  def _configure_llm(self, huggingface_llm: Any, openai_llm: Any) -> None:
203
  """Configure LLM for query synthesis."""
204
- if huggingface_llm is not None and (settings.hf_token or settings.huggingface_api_key):
 
 
 
 
 
 
205
  model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
206
- token = settings.hf_token or settings.huggingface_api_key
207
 
208
  # Check if it's HuggingFaceInferenceAPI (API-based) or HuggingFaceLLM (local)
209
  llm_class_name = (
@@ -430,6 +439,7 @@ class LlamaIndexRAGService:
430
 
431
  def get_rag_service(
432
  collection_name: str = "deepcritical_evidence",
 
433
  **kwargs: Any,
434
  ) -> LlamaIndexRAGService:
435
  """
@@ -437,6 +447,7 @@ def get_rag_service(
437
 
438
  Args:
439
  collection_name: Name of the ChromaDB collection
 
440
  **kwargs: Additional arguments for LlamaIndexRAGService
441
  Defaults to use_openai_embeddings=False (local embeddings)
442
 
@@ -450,4 +461,6 @@ def get_rag_service(
450
  # Default to local embeddings if not explicitly set
451
  if "use_openai_embeddings" not in kwargs:
452
  kwargs["use_openai_embeddings"] = False
453
- return LlamaIndexRAGService(collection_name=collection_name, **kwargs)
 
 
 
40
  similarity_top_k: int = 5,
41
  use_openai_embeddings: bool | None = None,
42
  use_in_memory: bool = False,
43
+ oauth_token: str | None = None,
44
  ) -> None:
45
  """
46
  Initialize LlamaIndex RAG service.
 
52
  similarity_top_k: Number of top results to retrieve
53
  use_openai_embeddings: Force OpenAI embeddings (None = auto-detect)
54
  use_in_memory: Use in-memory ChromaDB client (useful for tests)
55
+ oauth_token: Optional OAuth token from HuggingFace login (takes priority over env vars)
56
  """
57
  # Import dependencies and store references
58
  deps = self._import_dependencies()
 
73
  self.persist_dir = persist_dir or settings.chroma_db_path
74
  self.similarity_top_k = similarity_top_k
75
  self.use_in_memory = use_in_memory
76
+ self.oauth_token = oauth_token
77
 
78
  # Configure embeddings and LLM
79
  use_openai = use_openai_embeddings if use_openai_embeddings is not None else False
 
204
 
205
  def _configure_llm(self, huggingface_llm: Any, openai_llm: Any) -> None:
206
  """Configure LLM for query synthesis."""
207
+ # Priority: oauth_token > env vars
208
+ effective_token = (
209
+ self.oauth_token
210
+ or settings.hf_token
211
+ or settings.huggingface_api_key
212
+ )
213
+ if huggingface_llm is not None and effective_token:
214
  model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
215
+ token = effective_token
216
 
217
  # Check if it's HuggingFaceInferenceAPI (API-based) or HuggingFaceLLM (local)
218
  llm_class_name = (
 
439
 
440
  def get_rag_service(
441
  collection_name: str = "deepcritical_evidence",
442
+ oauth_token: str | None = None,
443
  **kwargs: Any,
444
  ) -> LlamaIndexRAGService:
445
  """
 
447
 
448
  Args:
449
  collection_name: Name of the ChromaDB collection
450
+ oauth_token: Optional OAuth token from HuggingFace login (takes priority over env vars)
451
  **kwargs: Additional arguments for LlamaIndexRAGService
452
  Defaults to use_openai_embeddings=False (local embeddings)
453
 
 
461
  # Default to local embeddings if not explicitly set
462
  if "use_openai_embeddings" not in kwargs:
463
  kwargs["use_openai_embeddings"] = False
464
+ return LlamaIndexRAGService(
465
+ collection_name=collection_name, oauth_token=oauth_token, **kwargs
466
+ )
src/services/multimodal_processing.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unified multimodal processing service for text, audio, and image inputs."""
2
+
3
+ from functools import lru_cache
4
+ from typing import Any
5
+
6
+ import structlog
7
+ from gradio.data_classes import FileData
8
+
9
+ from src.services.audio_processing import AudioService, get_audio_service
10
+ from src.services.image_ocr import ImageOCRService, get_image_ocr_service
11
+ from src.utils.config import settings
12
+
13
+ logger = structlog.get_logger(__name__)
14
+
15
+
16
+ class MultimodalService:
17
+ """Unified multimodal processing service."""
18
+
19
+ def __init__(
20
+ self,
21
+ audio_service: AudioService | None = None,
22
+ ocr_service: ImageOCRService | None = None,
23
+ ) -> None:
24
+ """Initialize multimodal service.
25
+
26
+ Args:
27
+ audio_service: Audio service instance (default: get_audio_service())
28
+ ocr_service: Image OCR service instance (default: get_image_ocr_service())
29
+ """
30
+ self.audio = audio_service or get_audio_service()
31
+ self.ocr = ocr_service or get_image_ocr_service()
32
+
33
+ async def process_multimodal_input(
34
+ self,
35
+ text: str,
36
+ files: list[FileData] | None = None,
37
+ audio_input: tuple[int, Any] | None = None,
38
+ hf_token: str | None = None,
39
+ ) -> str:
40
+ """Process multimodal input (text + images + audio) and return combined text.
41
+
42
+ Args:
43
+ text: Text input string
44
+ files: List of uploaded files (images, audio, etc.)
45
+ audio_input: Audio input tuple (sample_rate, audio_array)
46
+ hf_token: HuggingFace token for authenticated Gradio Spaces
47
+
48
+ Returns:
49
+ Combined text from all inputs
50
+ """
51
+ text_parts: list[str] = []
52
+
53
+ # Add original text if present
54
+ if text and text.strip():
55
+ text_parts.append(text.strip())
56
+
57
+ # Process audio input
58
+ if audio_input is not None and settings.enable_audio_input:
59
+ try:
60
+ transcribed = await self.audio.process_audio_input(audio_input, hf_token=hf_token)
61
+ if transcribed:
62
+ text_parts.append(f"[Audio transcription: {transcribed}]")
63
+ except Exception as e:
64
+ logger.warning("audio_processing_failed", error=str(e))
65
+
66
+ # Process uploaded files
67
+ if files:
68
+ for file_data in files:
69
+ file_path = file_data.path if isinstance(file_data, FileData) else str(file_data)
70
+
71
+ # Check if it's an image
72
+ if self._is_image_file(file_path):
73
+ try:
74
+ extracted_text = await self.ocr.extract_text(file_path, hf_token=hf_token)
75
+ if extracted_text:
76
+ text_parts.append(f"[Image OCR: {extracted_text}]")
77
+ except Exception as e:
78
+ logger.warning("image_ocr_failed", file_path=file_path, error=str(e))
79
+
80
+ # Check if it's an audio file
81
+ elif self._is_audio_file(file_path):
82
+ try:
83
+ # For audio files, we'd need to load and transcribe
84
+ # For now, log a warning
85
+ logger.warning("audio_file_upload_not_supported", file_path=file_path)
86
+ except Exception as e:
87
+ logger.warning("audio_file_processing_failed", file_path=file_path, error=str(e))
88
+
89
+ # Combine all text parts
90
+ combined_text = "\n\n".join(text_parts) if text_parts else ""
91
+
92
+ logger.info(
93
+ "multimodal_input_processed",
94
+ text_length=len(combined_text),
95
+ num_files=len(files) if files else 0,
96
+ has_audio=audio_input is not None,
97
+ )
98
+
99
+ return combined_text
100
+
101
+ def _is_image_file(self, file_path: str) -> bool:
102
+ """Check if file is an image.
103
+
104
+ Args:
105
+ file_path: Path to file
106
+
107
+ Returns:
108
+ True if file is an image
109
+ """
110
+ image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".tiff", ".tif"}
111
+ return any(file_path.lower().endswith(ext) for ext in image_extensions)
112
+
113
+ def _is_audio_file(self, file_path: str) -> bool:
114
+ """Check if file is an audio file.
115
+
116
+ Args:
117
+ file_path: Path to file
118
+
119
+ Returns:
120
+ True if file is an audio file
121
+ """
122
+ audio_extensions = {".wav", ".mp3", ".flac", ".ogg", ".m4a", ".aac", ".wma"}
123
+ return any(file_path.lower().endswith(ext) for ext in audio_extensions)
124
+
125
+
126
+ @lru_cache(maxsize=1)
127
+ def get_multimodal_service() -> MultimodalService:
128
+ """Get or create singleton multimodal service instance.
129
+
130
+ Returns:
131
+ MultimodalService instance
132
+ """
133
+ return MultimodalService()
134
+
135
+
136
+
src/services/stt_gradio.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Speech-to-Text service using Gradio Client API."""
2
+
3
+ import asyncio
4
+ import tempfile
5
+ from functools import lru_cache
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ import numpy as np
10
+ import structlog
11
+ from gradio_client import Client, handle_file
12
+
13
+ from src.utils.config import settings
14
+ from src.utils.exceptions import ConfigurationError
15
+
16
+ logger = structlog.get_logger(__name__)
17
+
18
+
19
+ class STTService:
20
+ """STT service using nvidia/canary-1b-v2 Gradio Space."""
21
+
22
+ def __init__(self, api_url: str | None = None, hf_token: str | None = None) -> None:
23
+ """Initialize STT service.
24
+
25
+ Args:
26
+ api_url: Gradio Space URL (default: settings.stt_api_url)
27
+ hf_token: HuggingFace token for authenticated Spaces (default: None)
28
+
29
+ Raises:
30
+ ConfigurationError: If API URL not configured
31
+ """
32
+ self.api_url = api_url or settings.stt_api_url
33
+ if not self.api_url:
34
+ raise ConfigurationError("STT API URL not configured")
35
+ self.hf_token = hf_token
36
+ self.client: Client | None = None
37
+
38
+ async def _get_client(self, hf_token: str | None = None) -> Client:
39
+ """Get or create Gradio Client (lazy initialization).
40
+
41
+ Args:
42
+ hf_token: HuggingFace token for authenticated Spaces (overrides instance token)
43
+
44
+ Returns:
45
+ Gradio Client instance
46
+ """
47
+ # Use provided token or instance token
48
+ token = hf_token or self.hf_token
49
+
50
+ # If client exists but token changed, recreate it
51
+ if self.client is not None and token != self.hf_token:
52
+ self.client = None
53
+
54
+ if self.client is None:
55
+ loop = asyncio.get_running_loop()
56
+ # Pass token to Client for authenticated Spaces
57
+ if token:
58
+ self.client = await loop.run_in_executor(
59
+ None,
60
+ lambda: Client(self.api_url, hf_token=token),
61
+ )
62
+ else:
63
+ self.client = await loop.run_in_executor(
64
+ None,
65
+ lambda: Client(self.api_url),
66
+ )
67
+ # Update instance token for future use
68
+ self.hf_token = token
69
+ return self.client
70
+
71
+ async def transcribe_file(
72
+ self,
73
+ audio_path: str,
74
+ source_lang: str | None = None,
75
+ target_lang: str | None = None,
76
+ hf_token: str | None = None,
77
+ ) -> str:
78
+ """Transcribe audio file using Gradio API.
79
+
80
+ Args:
81
+ audio_path: Path to audio file
82
+ source_lang: Source language (default: settings.stt_source_lang)
83
+ target_lang: Target language (default: settings.stt_target_lang)
84
+
85
+ Returns:
86
+ Transcribed text string
87
+
88
+ Raises:
89
+ ConfigurationError: If transcription fails
90
+ """
91
+ client = await self._get_client(hf_token=hf_token)
92
+ source_lang = source_lang or settings.stt_source_lang
93
+ target_lang = target_lang or settings.stt_target_lang
94
+
95
+ logger.info(
96
+ "transcribing_audio_file",
97
+ audio_path=audio_path,
98
+ source_lang=source_lang,
99
+ target_lang=target_lang,
100
+ )
101
+
102
+ try:
103
+ # Call /transcribe_file API endpoint
104
+ # API returns: (dataframe, csv_path, srt_path)
105
+ loop = asyncio.get_running_loop()
106
+ result = await loop.run_in_executor(
107
+ None,
108
+ lambda: client.predict(
109
+ audio_path=handle_file(audio_path),
110
+ source_lang=source_lang,
111
+ target_lang=target_lang,
112
+ api_name="/transcribe_file",
113
+ ),
114
+ )
115
+
116
+ # Extract transcription from result
117
+ transcribed_text = self._extract_transcription(result)
118
+
119
+ logger.info(
120
+ "audio_transcription_complete",
121
+ text_length=len(transcribed_text),
122
+ )
123
+
124
+ return transcribed_text
125
+
126
+ except Exception as e:
127
+ logger.error("audio_transcription_failed", error=str(e), error_type=type(e).__name__)
128
+ raise ConfigurationError(f"Audio transcription failed: {e}") from e
129
+
130
+ async def transcribe_audio(
131
+ self,
132
+ audio_data: tuple[int, np.ndarray],
133
+ hf_token: str | None = None,
134
+ ) -> str:
135
+ """Transcribe audio numpy array to text.
136
+
137
+ Args:
138
+ audio_data: Tuple of (sample_rate, audio_array)
139
+
140
+ Returns:
141
+ Transcribed text string
142
+ """
143
+ sample_rate, audio_array = audio_data
144
+
145
+ logger.info(
146
+ "transcribing_audio_array",
147
+ sample_rate=sample_rate,
148
+ audio_shape=audio_array.shape,
149
+ )
150
+
151
+ # Save audio to temp file
152
+ temp_path = self._save_audio_temp(audio_data)
153
+
154
+ try:
155
+ # Transcribe the temp file
156
+ transcribed_text = await self.transcribe_file(temp_path, hf_token=hf_token)
157
+ return transcribed_text
158
+ finally:
159
+ # Clean up temp file
160
+ try:
161
+ Path(temp_path).unlink(missing_ok=True)
162
+ except Exception as e:
163
+ logger.warning("failed_to_cleanup_temp_file", path=temp_path, error=str(e))
164
+
165
+ def _extract_transcription(self, api_result: tuple) -> str:
166
+ """Extract transcription text from API result.
167
+
168
+ Args:
169
+ api_result: Tuple from Gradio API (dataframe, csv_path, srt_path)
170
+
171
+ Returns:
172
+ Extracted transcription text
173
+ """
174
+ # API returns: (dataframe, csv_path, srt_path)
175
+ # Try to extract from dataframe first
176
+ if isinstance(api_result, tuple) and len(api_result) >= 1:
177
+ dataframe = api_result[0]
178
+ if isinstance(dataframe, dict) and "data" in dataframe:
179
+ # Extract text from dataframe rows
180
+ rows = dataframe.get("data", [])
181
+ if rows:
182
+ # Combine all text segments
183
+ text_segments = []
184
+ for row in rows:
185
+ if isinstance(row, list) and len(row) > 0:
186
+ # First column is usually the text
187
+ text_segments.append(str(row[0]))
188
+ if text_segments:
189
+ return " ".join(text_segments)
190
+
191
+ # Fallback: try to read CSV file if available
192
+ if len(api_result) >= 2 and api_result[1]:
193
+ csv_path = api_result[1]
194
+ try:
195
+ import pandas as pd
196
+
197
+ df = pd.read_csv(csv_path)
198
+ if "text" in df.columns:
199
+ return " ".join(df["text"].astype(str).tolist())
200
+ elif len(df.columns) > 0:
201
+ # Use first column
202
+ return " ".join(df.iloc[:, 0].astype(str).tolist())
203
+ except Exception as e:
204
+ logger.warning("failed_to_read_csv", csv_path=csv_path, error=str(e))
205
+
206
+ # Last resort: return empty string
207
+ logger.warning("could_not_extract_transcription", result_type=type(api_result).__name__)
208
+ return ""
209
+
210
+ def _save_audio_temp(
211
+ self,
212
+ audio_data: tuple[int, np.ndarray],
213
+ ) -> str:
214
+ """Save audio numpy array to temporary WAV file.
215
+
216
+ Args:
217
+ audio_data: Tuple of (sample_rate, audio_array)
218
+
219
+ Returns:
220
+ Path to temporary WAV file
221
+ """
222
+ sample_rate, audio_array = audio_data
223
+
224
+ # Create temp file
225
+ temp_file = tempfile.NamedTemporaryFile(
226
+ suffix=".wav",
227
+ delete=False,
228
+ )
229
+ temp_path = temp_file.name
230
+ temp_file.close()
231
+
232
+ # Save audio using soundfile
233
+ try:
234
+ import soundfile as sf
235
+
236
+ # Ensure audio is float32 and mono
237
+ if audio_array.dtype != np.float32:
238
+ audio_array = audio_array.astype(np.float32)
239
+
240
+ # Handle stereo -> mono conversion
241
+ if len(audio_array.shape) > 1:
242
+ audio_array = np.mean(audio_array, axis=1)
243
+
244
+ # Normalize to [-1, 1] range
245
+ if audio_array.max() > 1.0 or audio_array.min() < -1.0:
246
+ audio_array = audio_array / np.max(np.abs(audio_array))
247
+
248
+ sf.write(temp_path, audio_array, sample_rate)
249
+
250
+ logger.debug("saved_audio_temp", path=temp_path, sample_rate=sample_rate)
251
+
252
+ return temp_path
253
+
254
+ except ImportError:
255
+ raise ConfigurationError(
256
+ "soundfile not installed. Install with: uv add soundfile"
257
+ ) from None
258
+ except Exception as e:
259
+ logger.error("failed_to_save_audio_temp", error=str(e))
260
+ raise ConfigurationError(f"Failed to save audio to temp file: {e}") from e
261
+
262
+
263
+ @lru_cache(maxsize=1)
264
+ def get_stt_service() -> STTService:
265
+ """Get or create singleton STT service instance.
266
+
267
+ Returns:
268
+ STTService instance
269
+ """
270
+ return STTService()
271
+
src/services/tts_modal.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Text-to-Speech service using Kokoro 82M via Modal GPU."""
2
+
3
+ import asyncio
4
+ from functools import lru_cache
5
+ from typing import Any
6
+
7
+ import numpy as np
8
+ import structlog
9
+
10
+ from src.utils.config import settings
11
+ from src.utils.exceptions import ConfigurationError
12
+
13
+ logger = structlog.get_logger(__name__)
14
+
15
+ # Kokoro TTS dependencies for Modal image
16
+ KOKORO_DEPENDENCIES = [
17
+ "torch>=2.0.0",
18
+ "transformers>=4.30.0",
19
+ "numpy<2.0",
20
+ # kokoro-82M can be installed from source:
21
+ # git+https://github.com/hexgrad/kokoro.git
22
+ ]
23
+
24
+ # Modal app and function definitions (module-level for Modal)
25
+ _modal_app: Any | None = None
26
+ _tts_function: Any | None = None
27
+
28
+
29
+ def _get_modal_app() -> Any:
30
+ """Get or create Modal app instance."""
31
+ global _modal_app
32
+ if _modal_app is None:
33
+ try:
34
+ import modal
35
+
36
+ _modal_app = modal.App.lookup("deepcritical-tts", create_if_missing=True)
37
+ except ImportError as e:
38
+ raise ConfigurationError(
39
+ "Modal SDK not installed. Run: uv sync or pip install modal>=0.63.0"
40
+ ) from e
41
+ return _modal_app
42
+
43
+
44
+ # Define Modal image with Kokoro dependencies (module-level)
45
+ def _get_tts_image() -> Any:
46
+ """Get Modal image with Kokoro dependencies."""
47
+ try:
48
+ import modal
49
+
50
+ return (
51
+ modal.Image.debian_slim(python_version="3.11")
52
+ .pip_install(*KOKORO_DEPENDENCIES)
53
+ .pip_install("git+https://github.com/hexgrad/kokoro.git")
54
+ )
55
+ except ImportError:
56
+ return None
57
+
58
+
59
+ def _setup_modal_function() -> None:
60
+ """Setup Modal GPU function for TTS (called once, lazy initialization).
61
+
62
+ Note: GPU type is set at function definition time. Changes to settings.tts_gpu
63
+ require app restart to take effect.
64
+ """
65
+ global _tts_function, _modal_app
66
+
67
+ if _tts_function is not None:
68
+ return # Already set up
69
+
70
+ try:
71
+ import modal
72
+
73
+ app = _get_modal_app()
74
+ tts_image = _get_tts_image()
75
+
76
+ if tts_image is None:
77
+ raise ConfigurationError("Modal image setup failed")
78
+
79
+ # Get GPU and timeout from settings (with defaults)
80
+ # Note: These are evaluated at function definition time, not at call time
81
+ # Changes to settings require app restart
82
+ gpu_type = getattr(settings, "tts_gpu", None) or "T4"
83
+ timeout_seconds = getattr(settings, "tts_timeout", None) or 60
84
+
85
+ # Define GPU function at module level (required by Modal)
86
+ # Modal functions are immutable once defined, so GPU changes require restart
87
+ @app.function(
88
+ image=tts_image,
89
+ gpu=gpu_type,
90
+ timeout=timeout_seconds,
91
+ )
92
+ def kokoro_tts_function(text: str, voice: str, speed: float) -> tuple[int, np.ndarray]:
93
+ """Modal GPU function for Kokoro TTS.
94
+
95
+ This function runs on Modal's GPU infrastructure.
96
+ Based on: https://huggingface.co/spaces/hexgrad/Kokoro-TTS
97
+ Reference: https://huggingface.co/spaces/hexgrad/Kokoro-TTS/raw/main/app.py
98
+ """
99
+ import numpy as np
100
+
101
+ # Import Kokoro inside function (lazy load)
102
+ try:
103
+ from kokoro import KModel, KPipeline
104
+ import torch
105
+
106
+ # Initialize model (cached on GPU)
107
+ model = KModel().to("cuda").eval()
108
+ pipeline = KPipeline(lang_code=voice[0])
109
+ pack = pipeline.load_voice(voice)
110
+
111
+ # Generate audio
112
+ for _, ps, _ in pipeline(text, voice, speed):
113
+ ref_s = pack[len(ps) - 1]
114
+ audio = model(ps, ref_s, speed)
115
+ return (24000, audio.numpy())
116
+
117
+ # If no audio generated, return empty
118
+ return (24000, np.zeros(1, dtype=np.float32))
119
+
120
+ except ImportError as e:
121
+ raise ConfigurationError(
122
+ "Kokoro not installed. Install with: pip install git+https://github.com/hexgrad/kokoro.git"
123
+ ) from e
124
+ except Exception as e:
125
+ raise ConfigurationError(f"TTS synthesis failed: {e}") from e
126
+
127
+ # Store function reference for remote calls
128
+ _tts_function = kokoro_tts_function
129
+
130
+ # Verify function is properly attached to app
131
+ if not hasattr(app, kokoro_tts_function.__name__):
132
+ logger.warning("modal_function_not_attached", function_name=kokoro_tts_function.__name__)
133
+
134
+ logger.info(
135
+ "modal_tts_function_setup_complete",
136
+ gpu=gpu_type,
137
+ timeout=timeout_seconds,
138
+ function_name=kokoro_tts_function.__name__,
139
+ )
140
+
141
+ except Exception as e:
142
+ logger.error("modal_tts_function_setup_failed", error=str(e))
143
+ raise ConfigurationError(f"Failed to setup Modal TTS function: {e}") from e
144
+
145
+
146
+ class ModalTTSExecutor:
147
+ """Execute Kokoro TTS synthesis on Modal GPU.
148
+
149
+ This class provides TTS synthesis using Kokoro 82M model on Modal's GPU infrastructure.
150
+ Follows the same pattern as ModalCodeExecutor but uses GPU functions for TTS.
151
+ """
152
+
153
+ def __init__(self) -> None:
154
+ """Initialize Modal TTS executor.
155
+
156
+ Note:
157
+ Logs a warning if Modal credentials are not configured.
158
+ Execution will fail at runtime without valid credentials.
159
+ """
160
+ # Check for Modal credentials
161
+ if not settings.modal_available:
162
+ logger.warning(
163
+ "Modal credentials not found. TTS will not be available unless modal setup is run."
164
+ )
165
+
166
+ def synthesize(
167
+ self,
168
+ text: str,
169
+ voice: str = "af_heart",
170
+ speed: float = 1.0,
171
+ timeout: int = 60,
172
+ ) -> tuple[int, np.ndarray]:
173
+ """Synthesize text to speech using Kokoro on Modal GPU.
174
+
175
+ Args:
176
+ text: Text to synthesize (max 5000 chars for free tier)
177
+ voice: Voice ID from Kokoro (e.g., af_heart, af_bella, am_michael)
178
+ speed: Speech speed multiplier (0.5-2.0)
179
+ timeout: Maximum execution time (not used, Modal function has its own timeout)
180
+
181
+ Returns:
182
+ Tuple of (sample_rate, audio_array)
183
+
184
+ Raises:
185
+ ConfigurationError: If synthesis fails
186
+ """
187
+ # Setup Modal function if not already done
188
+ _setup_modal_function()
189
+
190
+ if _tts_function is None:
191
+ raise ConfigurationError("Modal TTS function not initialized")
192
+
193
+ logger.info("synthesizing_tts", text_length=len(text), voice=voice, speed=speed)
194
+
195
+ try:
196
+ # Call the GPU function remotely
197
+ result = _tts_function.remote(text, voice, speed)
198
+
199
+ logger.info("tts_synthesis_complete", sample_rate=result[0], audio_shape=result[1].shape)
200
+
201
+ return result
202
+
203
+ except Exception as e:
204
+ logger.error("tts_synthesis_failed", error=str(e), error_type=type(e).__name__)
205
+ raise ConfigurationError(f"TTS synthesis failed: {e}") from e
206
+
207
+
208
+ class TTSService:
209
+ """TTS service wrapper for async usage."""
210
+
211
+ def __init__(self) -> None:
212
+ """Initialize TTS service."""
213
+ if not settings.modal_available:
214
+ raise ConfigurationError("Modal credentials required for TTS")
215
+ self.executor = ModalTTSExecutor()
216
+
217
+ async def synthesize_async(
218
+ self,
219
+ text: str,
220
+ voice: str = "af_heart",
221
+ speed: float = 1.0,
222
+ ) -> tuple[int, np.ndarray] | None:
223
+ """Async wrapper for TTS synthesis.
224
+
225
+ Args:
226
+ text: Text to synthesize
227
+ voice: Voice ID (default: settings.tts_voice)
228
+ speed: Speech speed (default: settings.tts_speed)
229
+
230
+ Returns:
231
+ Tuple of (sample_rate, audio_array) or None if error
232
+ """
233
+ voice = voice or settings.tts_voice
234
+ speed = speed or settings.tts_speed
235
+
236
+ loop = asyncio.get_running_loop()
237
+
238
+ try:
239
+ result = await loop.run_in_executor(
240
+ None,
241
+ lambda: self.executor.synthesize(text, voice, speed),
242
+ )
243
+ return result
244
+ except Exception as e:
245
+ logger.error("tts_synthesis_async_failed", error=str(e))
246
+ return None
247
+
248
+
249
+ @lru_cache(maxsize=1)
250
+ def get_tts_service() -> TTSService:
251
+ """Get or create singleton TTS service instance.
252
+
253
+ Returns:
254
+ TTSService instance
255
+
256
+ Raises:
257
+ ConfigurationError: If Modal credentials not configured
258
+ """
259
+ return TTSService()
260
+
src/tools/crawl_adapter.py CHANGED
@@ -60,3 +60,5 @@ async def crawl_website(starting_url: str) -> str:
60
 
61
 
62
 
 
 
 
60
 
61
 
62
 
63
+
64
+
src/tools/rag_tool.py CHANGED
@@ -23,14 +23,20 @@ class RAGTool:
23
  Returns Evidence objects from RAG retrieval results.
24
  """
25
 
26
- def __init__(self, rag_service: "LlamaIndexRAGService | None" = None) -> None:
 
 
 
 
27
  """
28
  Initialize RAG tool.
29
 
30
  Args:
31
  rag_service: Optional RAG service instance. If None, will be lazy-initialized.
 
32
  """
33
  self._rag_service = rag_service
 
34
  self.logger = logger
35
 
36
  @property
@@ -54,9 +60,11 @@ class RAGTool:
54
 
55
  # Use local embeddings by default (no API key required)
56
  # Use in-memory ChromaDB to avoid file system issues
 
57
  self._rag_service = get_rag_service(
58
  use_openai_embeddings=False,
59
  use_in_memory=True, # Use in-memory for better reliability
 
60
  )
61
  self.logger.info("RAG service initialized with local embeddings")
62
  except (ConfigurationError, ImportError) as e:
@@ -170,12 +178,14 @@ class RAGTool:
170
 
171
  def create_rag_tool(
172
  rag_service: "LlamaIndexRAGService | None" = None,
 
173
  ) -> RAGTool:
174
  """
175
  Factory function to create a RAG tool.
176
 
177
  Args:
178
  rag_service: Optional RAG service instance. If None, will be lazy-initialized.
 
179
 
180
  Returns:
181
  Configured RAGTool instance
@@ -184,7 +194,7 @@ def create_rag_tool(
184
  ConfigurationError: If RAG service cannot be initialized and rag_service is None
185
  """
186
  try:
187
- return RAGTool(rag_service=rag_service)
188
  except Exception as e:
189
  logger.error("Failed to create RAG tool", error=str(e))
190
  raise ConfigurationError(f"Failed to create RAG tool: {e}") from e
 
23
  Returns Evidence objects from RAG retrieval results.
24
  """
25
 
26
+ def __init__(
27
+ self,
28
+ rag_service: "LlamaIndexRAGService | None" = None,
29
+ oauth_token: str | None = None,
30
+ ) -> None:
31
  """
32
  Initialize RAG tool.
33
 
34
  Args:
35
  rag_service: Optional RAG service instance. If None, will be lazy-initialized.
36
+ oauth_token: Optional OAuth token from HuggingFace login (for RAG LLM)
37
  """
38
  self._rag_service = rag_service
39
+ self.oauth_token = oauth_token
40
  self.logger = logger
41
 
42
  @property
 
60
 
61
  # Use local embeddings by default (no API key required)
62
  # Use in-memory ChromaDB to avoid file system issues
63
+ # Pass OAuth token for LLM query synthesis
64
  self._rag_service = get_rag_service(
65
  use_openai_embeddings=False,
66
  use_in_memory=True, # Use in-memory for better reliability
67
+ oauth_token=self.oauth_token,
68
  )
69
  self.logger.info("RAG service initialized with local embeddings")
70
  except (ConfigurationError, ImportError) as e:
 
178
 
179
  def create_rag_tool(
180
  rag_service: "LlamaIndexRAGService | None" = None,
181
+ oauth_token: str | None = None,
182
  ) -> RAGTool:
183
  """
184
  Factory function to create a RAG tool.
185
 
186
  Args:
187
  rag_service: Optional RAG service instance. If None, will be lazy-initialized.
188
+ oauth_token: Optional OAuth token from HuggingFace login (for RAG LLM)
189
 
190
  Returns:
191
  Configured RAGTool instance
 
194
  ConfigurationError: If RAG service cannot be initialized and rag_service is None
195
  """
196
  try:
197
+ return RAGTool(rag_service=rag_service, oauth_token=oauth_token)
198
  except Exception as e:
199
  logger.error("Failed to create RAG tool", error=str(e))
200
  raise ConfigurationError(f"Failed to create RAG tool: {e}") from e
src/tools/search_handler.py CHANGED
@@ -27,6 +27,7 @@ class SearchHandler:
27
  timeout: float = 30.0,
28
  include_rag: bool = False,
29
  auto_ingest_to_rag: bool = True,
 
30
  ) -> None:
31
  """
32
  Initialize the search handler.
@@ -36,10 +37,12 @@ class SearchHandler:
36
  timeout: Timeout for each search in seconds
37
  include_rag: Whether to include RAG tool in searches
38
  auto_ingest_to_rag: Whether to automatically ingest results into RAG
 
39
  """
40
  self.tools = list(tools) # Make a copy
41
  self.timeout = timeout
42
  self.auto_ingest_to_rag = auto_ingest_to_rag
 
43
  self._rag_service: LlamaIndexRAGService | None = None
44
 
45
  if include_rag:
@@ -48,7 +51,7 @@ class SearchHandler:
48
  def add_rag_tool(self) -> None:
49
  """Add RAG tool to the tools list if available."""
50
  try:
51
- rag_tool = create_rag_tool()
52
  self.tools.append(rag_tool)
53
  logger.info("RAG tool added to search handler")
54
  except ConfigurationError:
@@ -67,9 +70,11 @@ class SearchHandler:
67
 
68
  # Use local embeddings by default (no API key required)
69
  # Use in-memory ChromaDB to avoid file system issues
 
70
  self._rag_service = get_rag_service(
71
  use_openai_embeddings=False,
72
  use_in_memory=True, # Use in-memory for better reliability
 
73
  )
74
  logger.info("RAG service initialized for ingestion with local embeddings")
75
  except (ConfigurationError, ImportError):
 
27
  timeout: float = 30.0,
28
  include_rag: bool = False,
29
  auto_ingest_to_rag: bool = True,
30
+ oauth_token: str | None = None,
31
  ) -> None:
32
  """
33
  Initialize the search handler.
 
37
  timeout: Timeout for each search in seconds
38
  include_rag: Whether to include RAG tool in searches
39
  auto_ingest_to_rag: Whether to automatically ingest results into RAG
40
+ oauth_token: Optional OAuth token from HuggingFace login (for RAG LLM)
41
  """
42
  self.tools = list(tools) # Make a copy
43
  self.timeout = timeout
44
  self.auto_ingest_to_rag = auto_ingest_to_rag
45
+ self.oauth_token = oauth_token
46
  self._rag_service: LlamaIndexRAGService | None = None
47
 
48
  if include_rag:
 
51
  def add_rag_tool(self) -> None:
52
  """Add RAG tool to the tools list if available."""
53
  try:
54
+ rag_tool = create_rag_tool(oauth_token=self.oauth_token)
55
  self.tools.append(rag_tool)
56
  logger.info("RAG tool added to search handler")
57
  except ConfigurationError:
 
70
 
71
  # Use local embeddings by default (no API key required)
72
  # Use in-memory ChromaDB to avoid file system issues
73
+ # Pass OAuth token for LLM query synthesis
74
  self._rag_service = get_rag_service(
75
  use_openai_embeddings=False,
76
  use_in_memory=True, # Use in-memory for better reliability
77
+ oauth_token=self.oauth_token,
78
  )
79
  logger.info("RAG service initialized for ingestion with local embeddings")
80
  except (ConfigurationError, ImportError):
src/tools/web_search_adapter.py CHANGED
@@ -65,3 +65,5 @@ async def web_search(query: str) -> str:
65
 
66
 
67
 
 
 
 
65
 
66
 
67
 
68
+
69
+
src/utils/config.py CHANGED
@@ -140,6 +140,62 @@ class Settings(BaseSettings):
140
  description="Automatically ingest evidence into RAG",
141
  )
142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  @property
144
  def modal_available(self) -> bool:
145
  """Check if Modal credentials are configured."""
@@ -203,6 +259,16 @@ class Settings(BaseSettings):
203
  return bool(self.tavily_api_key)
204
  return False
205
 
 
 
 
 
 
 
 
 
 
 
206
 
207
  def get_settings() -> Settings:
208
  """Factory function to get settings (allows mocking in tests)."""
 
140
  description="Automatically ingest evidence into RAG",
141
  )
142
 
143
+ # Audio Processing Configuration
144
+ tts_model: str = Field(
145
+ default="hexgrad/Kokoro-82M",
146
+ description="Kokoro TTS model ID for text-to-speech",
147
+ )
148
+ tts_voice: str = Field(
149
+ default="af_heart",
150
+ description="Kokoro voice ID (e.g., af_heart, af_bella, am_michael)",
151
+ )
152
+ tts_speed: float = Field(
153
+ default=1.0,
154
+ ge=0.5,
155
+ le=2.0,
156
+ description="TTS speech speed multiplier",
157
+ )
158
+ tts_gpu: str | None = Field(
159
+ default="T4",
160
+ description="Modal GPU type for TTS (T4, A10, A100, etc.)",
161
+ )
162
+ tts_timeout: int = Field(
163
+ default=60,
164
+ ge=10,
165
+ le=300,
166
+ description="TTS synthesis timeout in seconds",
167
+ )
168
+ stt_api_url: str = Field(
169
+ default="nvidia/canary-1b-v2",
170
+ description="Gradio Space URL for STT API (nvidia/canary-1b-v2)",
171
+ )
172
+ stt_source_lang: str = Field(
173
+ default="English",
174
+ description="Source language for STT transcription",
175
+ )
176
+ stt_target_lang: str = Field(
177
+ default="English",
178
+ description="Target language for STT transcription",
179
+ )
180
+ enable_audio_input: bool = Field(
181
+ default=True,
182
+ description="Enable audio input (microphone/file upload)",
183
+ )
184
+ enable_audio_output: bool = Field(
185
+ default=True,
186
+ description="Enable audio output (TTS response)",
187
+ )
188
+
189
+ # Image OCR Configuration
190
+ ocr_api_url: str = Field(
191
+ default="prithivMLmods/Multimodal-OCR3",
192
+ description="Gradio Space URL for image OCR API",
193
+ )
194
+ enable_image_input: bool = Field(
195
+ default=True,
196
+ description="Enable image input (file upload with OCR)",
197
+ )
198
+
199
  @property
200
  def modal_available(self) -> bool:
201
  """Check if Modal credentials are configured."""
 
259
  return bool(self.tavily_api_key)
260
  return False
261
 
262
+ @property
263
+ def audio_available(self) -> bool:
264
+ """Check if audio processing is available (Modal + STT API)."""
265
+ return self.modal_available and bool(self.stt_api_url)
266
+
267
+ @property
268
+ def image_ocr_available(self) -> bool:
269
+ """Check if image OCR is available (OCR API URL configured)."""
270
+ return bool(self.ocr_api_url)
271
+
272
 
273
  def get_settings() -> Settings:
274
  """Factory function to get settings (allows mocking in tests)."""
src/utils/llm_factory.py CHANGED
@@ -50,13 +50,16 @@ def get_magentic_client() -> "OpenAIChatClient":
50
  )
51
 
52
 
53
- def get_huggingface_chat_client() -> "HuggingFaceChatClient":
54
  """
55
  Get HuggingFace chat client for agent-framework.
56
 
57
  HuggingFace InferenceClient natively supports function calling,
58
  making it compatible with agent-framework's ChatAgent.
59
 
 
 
 
60
  Returns:
61
  Configured HuggingFaceChatClient
62
 
@@ -66,7 +69,8 @@ def get_huggingface_chat_client() -> "HuggingFaceChatClient":
66
  from src.utils.huggingface_chat_client import HuggingFaceChatClient
67
 
68
  model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
69
- api_key = settings.hf_token or settings.huggingface_api_key
 
70
 
71
  return HuggingFaceChatClient(
72
  model_name=model_name,
@@ -75,7 +79,7 @@ def get_huggingface_chat_client() -> "HuggingFaceChatClient":
75
  )
76
 
77
 
78
- def get_chat_client_for_agent() -> Any:
79
  """
80
  Get appropriate chat client for agent-framework based on configuration.
81
 
@@ -83,15 +87,21 @@ def get_chat_client_for_agent() -> Any:
83
  - HuggingFace InferenceClient (if HF_TOKEN available, preferred for free tier)
84
  - OpenAI ChatClient (if OPENAI_API_KEY available, fallback)
85
 
 
 
 
86
  Returns:
87
  ChatClient compatible with agent-framework (HuggingFaceChatClient or OpenAIChatClient)
88
 
89
  Raises:
90
  ConfigurationError: If no suitable client can be created
91
  """
 
 
 
92
  # Prefer HuggingFace if available (free tier)
93
- if settings.has_huggingface_key:
94
- return get_huggingface_chat_client()
95
 
96
  # Fallback to OpenAI if available
97
  if settings.has_openai_key:
@@ -99,7 +109,7 @@ def get_chat_client_for_agent() -> Any:
99
 
100
  # If neither available, try HuggingFace without key (public models)
101
  try:
102
- return get_huggingface_chat_client()
103
  except Exception:
104
  pass
105
 
@@ -108,7 +118,7 @@ def get_chat_client_for_agent() -> Any:
108
  )
109
 
110
 
111
- def get_pydantic_ai_model() -> Any:
112
  """
113
  Get the appropriate model for pydantic-ai based on configuration.
114
 
@@ -116,6 +126,9 @@ def get_pydantic_ai_model() -> Any:
116
  Defaults to HuggingFace if provider is not specified or unknown.
117
  This is used by simple mode components (JudgeHandler, etc.)
118
 
 
 
 
119
  Returns:
120
  Configured pydantic-ai model
121
  """
@@ -126,9 +139,12 @@ def get_pydantic_ai_model() -> Any:
126
  from pydantic_ai.providers.huggingface import HuggingFaceProvider
127
  from pydantic_ai.providers.openai import OpenAIProvider
128
 
 
 
 
129
  if settings.llm_provider == "huggingface":
130
  model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
131
- hf_provider = HuggingFaceProvider(api_key=settings.hf_token)
132
  return HuggingFaceModel(model_name, provider=hf_provider)
133
 
134
  if settings.llm_provider == "openai":
@@ -145,7 +161,7 @@ def get_pydantic_ai_model() -> Any:
145
 
146
  # Default to HuggingFace if provider is unknown or not specified
147
  model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
148
- hf_provider = HuggingFaceProvider(api_key=settings.hf_token)
149
  return HuggingFaceModel(model_name, provider=hf_provider)
150
 
151
 
 
50
  )
51
 
52
 
53
+ def get_huggingface_chat_client(oauth_token: str | None = None) -> "HuggingFaceChatClient":
54
  """
55
  Get HuggingFace chat client for agent-framework.
56
 
57
  HuggingFace InferenceClient natively supports function calling,
58
  making it compatible with agent-framework's ChatAgent.
59
 
60
+ Args:
61
+ oauth_token: Optional OAuth token from HuggingFace login (takes priority over env vars)
62
+
63
  Returns:
64
  Configured HuggingFaceChatClient
65
 
 
69
  from src.utils.huggingface_chat_client import HuggingFaceChatClient
70
 
71
  model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
72
+ # Priority: oauth_token > env vars
73
+ api_key = oauth_token or settings.hf_token or settings.huggingface_api_key
74
 
75
  return HuggingFaceChatClient(
76
  model_name=model_name,
 
79
  )
80
 
81
 
82
+ def get_chat_client_for_agent(oauth_token: str | None = None) -> Any:
83
  """
84
  Get appropriate chat client for agent-framework based on configuration.
85
 
 
87
  - HuggingFace InferenceClient (if HF_TOKEN available, preferred for free tier)
88
  - OpenAI ChatClient (if OPENAI_API_KEY available, fallback)
89
 
90
+ Args:
91
+ oauth_token: Optional OAuth token from HuggingFace login (takes priority over env vars)
92
+
93
  Returns:
94
  ChatClient compatible with agent-framework (HuggingFaceChatClient or OpenAIChatClient)
95
 
96
  Raises:
97
  ConfigurationError: If no suitable client can be created
98
  """
99
+ # Check if we have OAuth token or env vars
100
+ has_hf_key = bool(oauth_token or settings.has_huggingface_key)
101
+
102
  # Prefer HuggingFace if available (free tier)
103
+ if has_hf_key:
104
+ return get_huggingface_chat_client(oauth_token=oauth_token)
105
 
106
  # Fallback to OpenAI if available
107
  if settings.has_openai_key:
 
109
 
110
  # If neither available, try HuggingFace without key (public models)
111
  try:
112
+ return get_huggingface_chat_client(oauth_token=oauth_token)
113
  except Exception:
114
  pass
115
 
 
118
  )
119
 
120
 
121
+ def get_pydantic_ai_model(oauth_token: str | None = None) -> Any:
122
  """
123
  Get the appropriate model for pydantic-ai based on configuration.
124
 
 
126
  Defaults to HuggingFace if provider is not specified or unknown.
127
  This is used by simple mode components (JudgeHandler, etc.)
128
 
129
+ Args:
130
+ oauth_token: Optional OAuth token from HuggingFace login (takes priority over env vars)
131
+
132
  Returns:
133
  Configured pydantic-ai model
134
  """
 
139
  from pydantic_ai.providers.huggingface import HuggingFaceProvider
140
  from pydantic_ai.providers.openai import OpenAIProvider
141
 
142
+ # Priority: oauth_token > env vars
143
+ effective_hf_token = oauth_token or settings.hf_token or settings.huggingface_api_key
144
+
145
  if settings.llm_provider == "huggingface":
146
  model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
147
+ hf_provider = HuggingFaceProvider(api_key=effective_hf_token)
148
  return HuggingFaceModel(model_name, provider=hf_provider)
149
 
150
  if settings.llm_provider == "openai":
 
161
 
162
  # Default to HuggingFace if provider is unknown or not specified
163
  model_name = settings.huggingface_model or "meta-llama/Llama-3.1-8B-Instruct"
164
+ hf_provider = HuggingFaceProvider(api_key=effective_hf_token)
165
  return HuggingFaceModel(model_name, provider=hf_provider)
166
 
167
 
tests/unit/middleware/__init__.py CHANGED
@@ -17,3 +17,5 @@
17
 
18
 
19
 
 
 
 
17
 
18
 
19
 
20
+
21
+