Update app/providers.py

#2
by BadTin - opened
Files changed (1) hide show
  1. app/providers.py +160 -34
app/providers.py CHANGED
@@ -1,6 +1,6 @@
1
  # =============================================================================
2
  # app/providers.py
3
- # 09.03.2026
4
  # LLM + Search Provider Registry + Fallback Chain
5
  # Universal MCP Hub (Sandboxed) - based on PyFundaments Architecture
6
  # Copyright 2026 - Volkan Kücükbudak
@@ -27,6 +27,21 @@
27
  # All errors are sanitized before propagation — only HTTP status codes
28
  # and safe_url (query params stripped) are ever exposed in logs.
29
  #
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  # HOW TO ADD A NEW LLM PROVIDER — 3 steps, nothing else to touch:
31
  # 1. Add class below (copy a dummy, implement complete())
32
  # 2. Register name → class in _PROVIDER_CLASSES dict
@@ -60,12 +75,13 @@ class BaseProvider:
60
  Subclasses only implement complete() — HTTP logic lives here.
61
  """
62
  def __init__(self, name: str, cfg: dict):
63
- self.name = name
64
- self.key = os.getenv(cfg.get("env_key", ""))
65
- self.base_url = cfg.get("base_url", "")
66
- self.fallback = cfg.get("fallback_to", "")
67
- self.timeout = int(config.get_limits().get("REQUEST_TIMEOUT_SEC", "60"))
68
- self.model = cfg.get("default_model", "")
 
69
  # Safe key hint for debug logs — never log the full key
70
  self._key_hint = (
71
  f"{self.key[:4]}...{self.key[-4:]}"
@@ -106,6 +122,7 @@ class BaseProvider:
106
  # SECTION 2 — LLM Provider Implementations
107
  # Only the API-specific parsing logic differs per provider.
108
  # =============================================================================
 
109
  # --- SmolLM2 (Custom Assistant Space) ----------------------------------------
110
  class SmolLMProvider(BaseProvider):
111
  """
@@ -130,7 +147,7 @@ class SmolLMProvider(BaseProvider):
130
  f"{self.base_url}/chat/completions",
131
  headers={
132
  "Authorization": f"Bearer {self.key}",
133
- "X-IP-Token": self.key,
134
  "content-type": "application/json",
135
  },
136
  payload={
@@ -142,38 +159,129 @@ class SmolLMProvider(BaseProvider):
142
  return data["choices"][0]["message"]["content"]
143
 
144
 
 
 
 
 
145
 
 
 
 
 
 
 
146
 
 
 
 
 
 
 
 
 
 
 
 
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
 
150
- class AnthropicProvider(BaseProvider):
151
- """Anthropic Claude API — Messages endpoint."""
 
 
152
 
153
- async def complete(self, prompt: str, model: str = None, max_tokens: int = 1024) -> str:
154
- cfg = config.get_active_llm_providers().get("anthropic", {})
155
- data = await self._post(
156
- f"{self.base_url}/messages",
157
- headers={
158
- "x-api-key": self.key,
159
- "anthropic-version": cfg.get("api_version_header", "2023-06-01"),
160
- "content-type": "application/json",
161
- },
162
- payload={
163
- "model": model or self.model,
164
- "max_tokens": max_tokens,
165
- "messages": [{"role": "user", "content": prompt}],
166
- },
167
- )
168
- return data["content"][0]["text"]
169
 
 
 
 
 
 
 
170
 
171
- class GeminiProvider(BaseProvider):
172
- """Google Gemini API — generateContent endpoint."""
 
 
 
 
 
 
 
 
173
 
174
  async def complete(self, prompt: str, model: str = None, max_tokens: int = 1024) -> str:
175
  m = model or self.model
176
  safe_url = f"{self.base_url}/models/{m}:generateContent"
 
 
 
 
177
  async with httpx.AsyncClient() as client:
178
  r = await client.post(
179
  safe_url,
@@ -193,9 +301,10 @@ class GeminiProvider(BaseProvider):
193
  return r.json()["candidates"][0]["content"]["parts"][0]["text"]
194
 
195
 
 
196
  class OpenRouterProvider(BaseProvider):
197
  """OpenRouter API — OpenAI-compatible chat completions endpoint.
198
-
199
  Required headers: HTTP-Referer + X-Title (required by OpenRouter for
200
  free models and rate limit attribution).
201
  """
@@ -206,7 +315,7 @@ class OpenRouterProvider(BaseProvider):
206
  headers={
207
  "Authorization": f"Bearer {self.key}",
208
  "HTTP-Referer": os.getenv("APP_URL", "https://huggingface.co"),
209
- "X-Title": os.getenv("HUB_NAME", "Universal MCP Hub"), # required!
210
  "content-type": "application/json",
211
  },
212
  payload={
@@ -218,9 +327,10 @@ class OpenRouterProvider(BaseProvider):
218
  return data["choices"][0]["message"]["content"]
219
 
220
 
 
221
  class HuggingFaceProvider(BaseProvider):
222
  """HuggingFace Inference API — OpenAI-compatible serverless endpoint.
223
-
224
  base_url in .pyfun: https://api-inference.huggingface.co/v1
225
  Model goes in payload, not in URL.
226
  Free tier: max ~8B models. PRO required for 70B+.
@@ -381,7 +491,8 @@ def initialize() -> None:
381
  logger.info(f"Provider '{name}' has no handler yet — skipped.")
382
  continue
383
  _registry[name] = cls(name, cfg)
384
- logger.info(f"Provider registered: {name}")
 
385
 
386
 
387
  # =============================================================================
@@ -393,6 +504,7 @@ async def llm_complete(
393
  provider_name: str = None,
394
  model: str = None,
395
  max_tokens: int = 1024,
 
396
  ) -> str:
397
  """
398
  Send prompt to LLM provider with automatic fallback chain.
@@ -405,6 +517,9 @@ async def llm_complete(
405
  from .pyfun [TOOL.llm_complete].
406
  model: Model name override. Defaults to provider's default_model.
407
  max_tokens: Max tokens in response. Default: 1024.
 
 
 
408
 
409
  Returns:
410
  Model response as plain text string.
@@ -424,7 +539,18 @@ async def llm_complete(
424
  logger.warning(f"Provider '{current}' not in registry — trying fallback.")
425
  else:
426
  try:
427
- result = await provider.complete(prompt, model, max_tokens)
 
 
 
 
 
 
 
 
 
 
 
428
  logger.info(f"Response from provider: '{current}'")
429
  return f"[{current}] {result}"
430
  except Exception as e:
@@ -502,4 +628,4 @@ def get(name: str) -> BaseProvider:
502
  # =============================================================================
503
 
504
  if __name__ == "__main__":
505
- print("WARNING: Run via main.py → app.py, not directly.")
 
1
  # =============================================================================
2
  # app/providers.py
3
+ # 09.03.2026 | updated 23.03.2026
4
  # LLM + Search Provider Registry + Fallback Chain
5
  # Universal MCP Hub (Sandboxed) - based on PyFundaments Architecture
6
  # Copyright 2026 - Volkan Kücükbudak
 
27
  # All errors are sanitized before propagation — only HTTP status codes
28
  # and safe_url (query params stripped) are ever exposed in logs.
29
  #
30
+ # CACHING NOTE:
31
+ # Anthropic → prompt_caching (cache_control: ephemeral)
32
+ # Requires anthropic-beta: prompt-caching-2024-07-31 header.
33
+ # Caches system prompt + long user prompts (>1024 tokens estimated).
34
+ # Saves up to 90% input token costs on repeated context.
35
+ # Enable per provider in .pyfun: supports_cache = "true"
36
+ #
37
+ # Gemini → Implicit caching (automatic, no extra API call needed)
38
+ # Google automatically caches repeated prompt prefixes server-side.
39
+ # No code change needed — Gemini handles it transparently.
40
+ # Explicit Context Caching API exists but requires separate cache management
41
+ # and is only worth it for very large static contexts (32k+ tokens).
42
+ # Enable per provider in .pyfun: supports_cache = "true"
43
+ # (currently used as log hint only for Gemini — implicit cache is always on)
44
+ #
45
  # HOW TO ADD A NEW LLM PROVIDER — 3 steps, nothing else to touch:
46
  # 1. Add class below (copy a dummy, implement complete())
47
  # 2. Register name → class in _PROVIDER_CLASSES dict
 
75
  Subclasses only implement complete() — HTTP logic lives here.
76
  """
77
  def __init__(self, name: str, cfg: dict):
78
+ self.name = name
79
+ self.key = os.getenv(cfg.get("env_key", ""))
80
+ self.base_url = cfg.get("base_url", "")
81
+ self.fallback = cfg.get("fallback_to", "")
82
+ self.timeout = int(config.get_limits().get("REQUEST_TIMEOUT_SEC", "60"))
83
+ self.model = cfg.get("default_model", "")
84
+ self.supports_cache = cfg.get("supports_cache", "false").lower() == "true"
85
  # Safe key hint for debug logs — never log the full key
86
  self._key_hint = (
87
  f"{self.key[:4]}...{self.key[-4:]}"
 
122
  # SECTION 2 — LLM Provider Implementations
123
  # Only the API-specific parsing logic differs per provider.
124
  # =============================================================================
125
+
126
  # --- SmolLM2 (Custom Assistant Space) ----------------------------------------
127
  class SmolLMProvider(BaseProvider):
128
  """
 
147
  f"{self.base_url}/chat/completions",
148
  headers={
149
  "Authorization": f"Bearer {self.key}",
150
+ "X-IP-Token": self.key,
151
  "content-type": "application/json",
152
  },
153
  payload={
 
159
  return data["choices"][0]["message"]["content"]
160
 
161
 
162
# --- Anthropic ----------------------------------------------------------------
class AnthropicProvider(BaseProvider):
    """
    Anthropic Claude API — Messages endpoint.

    Prompt Caching (supports_cache = "true" in .pyfun):
    Uses cache_control: ephemeral on system prompt and long user prompts.
    Requires anthropic-beta: prompt-caching-2024-07-31 header.
    Cache TTL: 5 minutes, extended on each cache hit.
    Min tokens to cache: ~1024 (Anthropic requirement).
    Cost: cache write ~25% more, cache read ~90% less than normal input.

    .pyfun block:
    [LLM_PROVIDER.anthropic]
    active = "true"
    base_url = "https://api.anthropic.com/v1"
    env_key = "ANTHROPIC_API_KEY"
    api_version_header = "2023-06-01"
    default_model = "claude-haiku-4-5"
    supports_cache = "true"
    fallback_to = "gemini"
    [LLM_PROVIDER.anthropic_END]
    """

    # Rough chars-per-token estimate — avoids importing tiktoken in sandbox
    _CHARS_PER_TOKEN = 4
    _CACHE_MIN_TOKENS = 1024
    # Beta header value that activates prompt caching on the API side.
    # Single point of change if Anthropic graduates the feature out of beta.
    _CACHE_BETA = "prompt-caching-2024-07-31"

    def _is_cacheable(self, text: str) -> bool:
        """Estimate if text is long enough to benefit from caching."""
        return len(text) >= self._CACHE_MIN_TOKENS * self._CHARS_PER_TOKEN

    @staticmethod
    def _cache_block(text: str) -> list:
        """Wrap *text* in a content-block list carrying ephemeral cache_control.

        Shared by the user-message and system-prompt paths so the block
        shape is defined exactly once.
        """
        return [
            {
                "type": "text",
                "text": text,
                "cache_control": {"type": "ephemeral"},
            }
        ]

    async def complete(
        self,
        prompt: str,
        model: str = None,
        max_tokens: int = 1024,
        system: str = None,
    ) -> str:
        """Send *prompt* to the Messages endpoint and return the text reply.

        Args:
            prompt: User prompt. Cached (cache_control: ephemeral) when
                supports_cache is on and the prompt is long enough.
            model: Model override; defaults to the provider's default_model.
            max_tokens: Max tokens in the response. Default: 1024.
            system: Optional system prompt; also cached when long enough.

        Returns:
            The first text block of the API response.
        """
        cfg = config.get_active_llm_providers().get("anthropic", {})

        headers = {
            "x-api-key": self.key,
            "anthropic-version": cfg.get("api_version_header", "2023-06-01"),
            "content-type": "application/json",
        }

        # --- Build user content ---
        # Add cache_control if caching enabled + prompt long enough
        if self.supports_cache and self._is_cacheable(prompt):
            user_content = self._cache_block(prompt)
            headers["anthropic-beta"] = self._CACHE_BETA
            logger.debug("Anthropic: prompt cache_control applied to user message.")
        else:
            user_content = prompt  # short prompt — plain string, no overhead

        payload = {
            "model": model or self.model,
            "max_tokens": max_tokens,
            "messages": [{"role": "user", "content": user_content}],
        }

        # --- Optional system prompt with cache_control ---
        # NOTE: falsy (empty-string) system prompts are deliberately skipped.
        if system:
            if self.supports_cache and self._is_cacheable(system):
                payload["system"] = self._cache_block(system)
                headers["anthropic-beta"] = self._CACHE_BETA
                logger.debug("Anthropic: prompt cache_control applied to system prompt.")
            else:
                payload["system"] = system

        data = await self._post(f"{self.base_url}/messages", headers, payload)
        return data["content"][0]["text"]
247
 
248
 
249
+ # --- Gemini ------------------------------------------------------------------
250
+ class GeminiProvider(BaseProvider):
251
+ """
252
+ Google Gemini API — generateContent endpoint.
253
 
254
+ Implicit Caching (always active on Gemini side, no code needed):
255
+ Google automatically caches repeated prompt prefixes server-side.
256
+ No extra API call, no cache key, no TTL management needed.
257
+ Just send the same prompt structure and Gemini handles the rest.
258
+ supports_cache = "true" in .pyfun logs cache hint only.
 
 
 
 
 
 
 
 
 
 
 
259
 
260
+ Explicit Context Caching (NOT implemented here — when to use it):
261
+ Only worth the extra API complexity for very large static contexts
262
+ (32k+ tokens, e.g. large documents sent on every request).
263
+ Requires separate POST to /cachedContents, returns a cache_name,
264
+ which is then referenced in generateContent as cachedContent.name.
265
+ Implement as a separate tool (cache_create / cache_use) when needed.
266
 
267
+ .pyfun block:
268
+ [LLM_PROVIDER.gemini]
269
+ active = "true"
270
+ base_url = "https://generativelanguage.googleapis.com/v1beta"
271
+ env_key = "GEMINI_API_KEY"
272
+ default_model = "gemini-2.0-flash"
273
+ supports_cache = "true"
274
+ fallback_to = "openrouter"
275
+ [LLM_PROVIDER.gemini_END]
276
+ """
277
 
278
  async def complete(self, prompt: str, model: str = None, max_tokens: int = 1024) -> str:
279
  m = model or self.model
280
  safe_url = f"{self.base_url}/models/{m}:generateContent"
281
+
282
+ if self.supports_cache:
283
+ logger.debug(f"Gemini: implicit caching active for model {m} (server-side, automatic).")
284
+
285
  async with httpx.AsyncClient() as client:
286
  r = await client.post(
287
  safe_url,
 
301
  return r.json()["candidates"][0]["content"]["parts"][0]["text"]
302
 
303
 
304
+ # --- OpenRouter ---------------------------------------------------------------
305
  class OpenRouterProvider(BaseProvider):
306
  """OpenRouter API — OpenAI-compatible chat completions endpoint.
307
+
308
  Required headers: HTTP-Referer + X-Title (required by OpenRouter for
309
  free models and rate limit attribution).
310
  """
 
315
  headers={
316
  "Authorization": f"Bearer {self.key}",
317
  "HTTP-Referer": os.getenv("APP_URL", "https://huggingface.co"),
318
+ "X-Title": os.getenv("HUB_NAME", "Universal AI Hub"), # required!
319
  "content-type": "application/json",
320
  },
321
  payload={
 
327
  return data["choices"][0]["message"]["content"]
328
 
329
 
330
+ # --- HuggingFace --------------------------------------------------------------
331
  class HuggingFaceProvider(BaseProvider):
332
  """HuggingFace Inference API — OpenAI-compatible serverless endpoint.
333
+
334
  base_url in .pyfun: https://api-inference.huggingface.co/v1
335
  Model goes in payload, not in URL.
336
  Free tier: max ~8B models. PRO required for 70B+.
 
491
  logger.info(f"Provider '{name}' has no handler yet — skipped.")
492
  continue
493
  _registry[name] = cls(name, cfg)
494
+ cache_hint = " [cache: ON]" if cfg.get("supports_cache", "false") == "true" else ""
495
+ logger.info(f"Provider registered: {name}{cache_hint}")
496
 
497
 
498
  # =============================================================================
 
504
  provider_name: str = None,
505
  model: str = None,
506
  max_tokens: int = 1024,
507
+ system: str = None,
508
  ) -> str:
509
  """
510
  Send prompt to LLM provider with automatic fallback chain.
 
517
  from .pyfun [TOOL.llm_complete].
518
  model: Model name override. Defaults to provider's default_model.
519
  max_tokens: Max tokens in response. Default: 1024.
520
+ system: Optional system prompt. Passed to providers that support it.
521
+ AnthropicProvider caches it automatically if supports_cache = true
522
+ and the system prompt is long enough (>= ~1024 tokens).
523
 
524
  Returns:
525
  Model response as plain text string.
 
539
  logger.warning(f"Provider '{current}' not in registry — trying fallback.")
540
  else:
541
  try:
542
+ # Pass system prompt if provider supports it (Anthropic)
543
+ # Other providers accept **kwargs and ignore unknown params safely
544
+ if system is not None and hasattr(provider, 'complete'):
545
+ import inspect
546
+ sig = inspect.signature(provider.complete)
547
+ if 'system' in sig.parameters:
548
+ result = await provider.complete(prompt, model, max_tokens, system=system)
549
+ else:
550
+ result = await provider.complete(prompt, model, max_tokens)
551
+ else:
552
+ result = await provider.complete(prompt, model, max_tokens)
553
+
554
  logger.info(f"Response from provider: '{current}'")
555
  return f"[{current}] {result}"
556
  except Exception as e:
 
628
  # =============================================================================
629
 
630
  if __name__ == "__main__":
631
+ print("WARNING: Run via main.py → app.py, not directly.")