| |
| import asyncio |
| import inspect |
| import logging |
| from dataclasses import dataclass |
| from typing import Any, Dict, List, Optional, AsyncGenerator,Tuple |
| import time |
| import json |
| import uuid |
| import aiohttp |
| from fastapi import FastAPI, HTTPException, Request |
| from fastapi.responses import StreamingResponse, JSONResponse |
| from curl_cffi.requests import Session |
|
|
|
|
| |
| try: |
| import orjson as _jsonlib |
| def _loads(b: bytes): |
| return _jsonlib.loads(b) |
| def _dumps(obj) -> str: |
| |
| return _jsonlib.dumps(obj).decode("utf-8") |
| except Exception: |
| import json as _jsonlib |
| def _loads(b: bytes): |
| return _jsonlib.loads(b) |
| def _dumps(obj) -> str: |
| return _jsonlib.dumps(obj) |
|
|
# Root logging at INFO; this module logs through the named logger below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("chat-server-sse")

# One shared curl_cffi session (Chrome 110 impersonation). It is handed to
# every provider function as its first positional argument.
Requests = Session(impersonate="chrome110")

# ASGI application that the route and lifecycle decorators attach to.
app = FastAPI()
|
|
|
|
# Model catalogue for the llmchat.in provider (FREEGPT).
# Each entry carries the upstream routing tag, the public model name and the
# per-model max_tokens limit (None = no table limit for that model).
# Exact duplicate entries were removed: lookups always take the first match,
# so the repeats were dead and only showed up twice in the /model listing.
M2 = [
    {"tag": "@cf", "model": "meta/llama-3.1-70b-instruct", "max_tokens": 8192},
    {"tag": "@cf", "model": "qwen/qwen2.5-coder-32b-instruct", "max_tokens": 8192},
    {"tag": "@cf", "model": "deepseek-ai/deepseek-r1-distill-qwen-32b", "max_tokens": 40960},
    {"tag": "@cf", "model": "meta/llama-4-scout-17b-16e-instruct", "max_tokens": 40960},
    {"tag": "@cf", "model": "google/gemma-3-12b-it", "max_tokens": 40960},
    {"tag": "@cf", "model": "mistralai/mistral-small-3.1-24b-instruct", "max_tokens": 40960},
    {"tag": "@cf", "model": "meta/llama-3.3-70b-instruct-fp8-fast", "max_tokens": 8192},
    {"tag": "@cf", "model": "meta/llama-3.2-3b-instruct", "max_tokens": 40960},
    {"tag": "@cf", "model": "meta/llama-3.2-1b-instruct", "max_tokens": 40960},
    {"tag": "@hf", "model": "meta-llama/meta-llama-3-8b-instruct", "max_tokens": 4391},
    {"tag": "@cf", "model": "meta/llama-3-8b-instruct", "max_tokens": 4391},
    {"tag": "@cf", "model": "meta/llama-2-7b-chat-int8", "max_tokens": 4391},
    {"tag": "@cf", "model": "meta/llama-2-7b-chat-fp16", "max_tokens": None},
    {"tag": "@cf", "model": "meta/llama-3-8b-instruct-awq", "max_tokens": 4391},
    {"tag": "@hf", "model": "google/gemma-7b-it", "max_tokens": None},
    {"tag": "@cf", "model": "google/gemma-2b-it-lora", "max_tokens": 4391},
    {"tag": "@hf", "model": "mistral/mistral-7b-instruct-v0.2", "max_tokens": 8192},
    {"tag": "@cf", "model": "mistral/mistral-7b-instruct-v0.2-lora", "max_tokens": 8192},
]
|
|
def FREEGPT(
    RQ: Any,
    messages: List[Dict],
    model: str = "deepseek-ai/deepseek-r1-distill-qwen-32b",
    max_token: int = 40960,
    stream: bool = True,
    timeout: Optional[float] = None,
):
    """Stream completion text from the llmchat.in inference endpoint.

    Args:
        RQ: HTTP client exposing ``post(url=..., json=..., headers=...,
            timeout=..., stream=...)``. The response must provide
            ``status_code`` and ``iter_lines()``.
        messages: Chat messages, forwarded verbatim as the ``messages`` field.
        model: Model name looked up in ``M2``. Unknown names fall back to
            ``@cf/meta/llama-3.2-1b-instruct``.
        max_token: Token budget, used only when ``M2`` has no limit for the
            model (the table value takes precedence).
        stream: Forwarded both in the payload and to ``RQ.post``.
        timeout: Forwarded to ``RQ.post``.

    Yields:
        The ``response`` value of every JSON ``data:`` event in the reply.
        Nothing is yielded when the request fails or the status is not 200.
    """
    log = logging.getLogger("chat-server-sse")

    entry = next((item for item in M2 if item["model"] == model), None)
    if entry is not None:
        md = entry["tag"] + "/" + entry["model"]
    else:
        md = "@cf/meta/llama-3.2-1b-instruct"

    URL = f"https://llmchat.in/inference/stream?model={md}"

    headers = {
        "Accept": "text/event-stream,*/*",
        "Content-Type": "application/json",
        "Origin": "https://llmchat.in",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36",
        "Cache-Control": "no-cache",
        "Accept-Encoding": "identity",
        "cf-ray": "9cba9edd9f909aaf-SIN",
    }

    # The per-model limit from M2 wins over the caller's max_token; the caller
    # value is only used when the table has no limit for this model.
    # NOTE(review): this preserves the existing precedence. If the table value
    # is meant to be a cap, min(caller, table) would be the right rule.
    table_limit = entry["max_tokens"] if entry is not None else None
    effective_limit = table_limit if table_limit is not None else max_token

    payload = {"messages": messages, "stream": stream}
    if effective_limit is not None:
        payload["max_tokens"] = effective_limit

    try:
        RESP = RQ.post(url=URL, json=payload, headers=headers, timeout=timeout, stream=stream)
    except Exception:
        # Best effort as before (no output on transport failure), but leave a trace.
        log.exception("FREEGPT: request to %s failed", URL)
        return

    try:
        if RESP.status_code != 200:
            log.warning("FREEGPT: upstream returned HTTP %s", RESP.status_code)
            return

        for raw in RESP.iter_lines():
            if not raw:
                continue

            if isinstance(raw, (bytes, bytearray)):
                line = raw.decode("utf-8", errors="replace").strip()
            else:
                line = str(raw).strip()

            if not line.startswith("data:"):
                continue

            # Slice off the field name instead of splitting on "data: ":
            # the space is optional and the JSON itself may contain "data: ".
            try:
                data = json.loads(line[len("data:"):].strip())
            except ValueError:
                continue

            # No try/except around the yield: a bare except there would
            # swallow GeneratorExit when the consumer closes the generator.
            if isinstance(data, dict) and "response" in data:
                yield data["response"]
    finally:
        # Release the upstream connection even when the consumer stops early.
        close = getattr(RESP, "close", None)
        if callable(close):
            try:
                close()
            except Exception:
                log.debug("FREEGPT: closing upstream response failed", exc_info=True)
|
|
|
|
|
|
class CONV:
    """Convert between flat chat messages and parts-based messages.

    Flat messages look like ``{"role": ..., "content": ...}``. Parts-based
    messages look like ``{"id": ..., "role": ..., "parts": [...], "metadata": ...}``.
    Both converters also return the elapsed conversion time in seconds.
    """

    def __init__(self, default_system: str = ""):
        # System prompt used by alpaca_to_msg when no override is passed.
        self.default_system = default_system

    @staticmethod
    def _make_id() -> str:
        """Return a random 20-character hexadecimal message id."""
        return uuid.uuid4().hex[:20]

    def alpaca_to_msg(
        self,
        alpaca_obj: Any,
        insert_system: bool = True,
        system_override: Optional[str] = None,
        skip_empty: bool = True,
    ) -> Tuple[List[Dict[str, str]], float]:
        """Flatten parts-based messages into ``{"role", "content"}`` dicts.

        Args:
            alpaca_obj: A list of parts-based messages, or a dict holding that
                list under ``"messages"``. ``None`` is treated as empty.
            insert_system: Prepend a system message when system text exists.
            system_override: System text to use instead of ``default_system``.
                An explicit override is inserted even when it is empty.
            skip_empty: Drop messages that contain no text parts.

        Returns:
            ``(messages, elapsed_seconds)``.
        """
        t0 = time.perf_counter()
        out: List[Dict[str, str]] = []

        # An empty default_system means "no system prompt", so it must not
        # produce an empty system message; an explicit override still does.
        if system_override is not None:
            sys_text: Optional[str] = system_override
        else:
            sys_text = self.default_system or None
        if insert_system and sys_text is not None:
            out.append({"role": "system", "content": sys_text})

        if isinstance(alpaca_obj, dict):
            # Presumably the wrapped form {"messages": [...]} - verify against callers.
            msgs = alpaca_obj.get("messages") or []
        else:
            msgs = alpaca_obj or []

        for m in msgs:
            if not isinstance(m, dict):
                continue  # nothing to extract from a non-mapping entry

            role = str(m.get("role") or "").strip().lower()
            if role not in ("user", "assistant", "system"):
                role = "user"

            texts: List[str] = []
            for p in m.get("parts") or []:
                if isinstance(p, dict) and p.get("type") == "text":
                    txt = p.get("text", "")
                    if isinstance(txt, str) and txt:
                        texts.append(txt.rstrip())

            if not texts and skip_empty:
                continue

            # Joining an empty list yields "", which is the kept-empty case.
            out.append({"role": role, "content": "\n\n".join(texts)})

        elapsed = time.perf_counter() - t0
        return out, elapsed

    def msg_to_alpaca(
        self,
        msg_list: List[Dict[str, Any]],
        include_step_start: bool = True,
        assistant_state_done: bool = True,
        preserve_ids: bool = False,
        skip_empty_text_parts: bool = False,
    ) -> Tuple[List[Dict[str, Any]], float]:
        """Expand flat messages into parts-based messages.

        Args:
            msg_list: Flat messages. Non-dict entries are stringified and
                treated as user text. ``None`` is treated as empty.
            include_step_start: Put a ``step-start`` part first in every
                assistant message.
            assistant_state_done: Mark assistant text parts ``state: done``.
            preserve_ids: Reuse a non-empty string ``id`` from the input
                instead of generating a new one.
            skip_empty_text_parts: Omit text parts whose content is blank.

        Returns:
            ``(messages, elapsed_seconds)``.
        """
        t0 = time.perf_counter()
        out_messages: List[Dict[str, Any]] = []

        for entry in msg_list or []:
            if isinstance(entry, dict):
                role = str(entry.get("role") or "user").strip().lower()
                content = entry.get("content", "")
                entry_id = entry.get("id") if preserve_ids else None
            else:
                role, content, entry_id = "user", str(entry), None

            # Every role other than assistant (including system) is sent as user.
            if role not in ("user", "assistant"):
                role = "user"

            parts: List[Dict[str, Any]] = []
            if role == "assistant" and include_step_start:
                parts.append({"type": "step-start"})

            # Only string content becomes a text part; other content types
            # produce no part, as before.
            if isinstance(content, str):
                if not skip_empty_text_parts or content.strip() != "":
                    text_part: Dict[str, Any] = {"type": "text", "text": content}
                    if role == "assistant" and assistant_state_done:
                        text_part["state"] = "done"
                    parts.append(text_part)

            has_usable_id = isinstance(entry_id, str) and entry_id != ""
            out_messages.append(
                {
                    "id": entry_id if has_usable_id else self._make_id(),
                    "role": role,
                    "parts": parts,
                    "metadata": {"custom": {}},
                }
            )

        elapsed = time.perf_counter() - t0
        return out_messages, elapsed
|
|
|
|
# Model ids accepted by the hadadxyz-ai provider (Adarsh_Personal).
# The commas after "moonshotai/kimi-k2-thinking" and
# "mistralai/devstral-2-123b-instruct-2512" were missing, which silently
# concatenated three adjacent names into one invalid model id.
M1 = [
    "zai-org/glm-4.6",
    "openai/gpt-5-nano-2025-08-07",
    "deepseek-ai/deepseek-v3.2-thinking",
    "nvidia/nvidia-nemotron-3-nano-30b-a3b",
    "nvidia/nvidia-nemotron-3-nano-30b-a3b-thinking",
    "openai/gpt-5-mini-2025-08-07",
    "qwen/qwen3-vl-235b-a22b-thinking",
    "qwen/qwen3-vl-235b-a22b-instruct",
    "perplexity/sonar",
    "moonshotai/kimi-k2.5",
    "anthropic/claude-haiku-4-5-20251001",
    "google/gemini-2.5-flash-lite",
    "moonshotai/kimi-k2-thinking",
    "mistralai/devstral-2-123b-instruct-2512",
    "mistralai/mistral-large-3-675b-instruct-2512",
    "openai/gpt-oss-safeguard-20b",
    "openai/gpt-oss-120b",
]
|
|
|
|
def Adarsh_Personal(
    RQ: Any,
    messages: List[Dict],
    model: str = "deepseek-ai/deepseek-r1-distill-qwen-32b",
    max_token: int = 40960,
    stream: bool = True,
    timeout: Optional[float] = None,
):
    """Stream completion text from the hadadxyz-ai Hugging Face space.

    Reasoning deltas are wrapped in ``<think>`` / ``</think>`` markers so the
    caller receives one flat text stream.

    Args:
        RQ: HTTP client exposing ``post(url, json=..., headers=..., stream=...,
            timeout=...)``. The response must provide ``status_code`` and
            ``iter_lines()``.
        messages: Flat chat messages, converted with ``CONV.msg_to_alpaca``
            before sending.
        model: Sent as ``modelId``.
        max_token: Accepted for signature parity with the other providers;
            not sent upstream.
        stream: Forwarded to ``RQ.post``.
        timeout: Forwarded to ``RQ.post``.

    Yields:
        Text fragments: ``"<think>\\n"``, reasoning deltas, ``"\\n</think>\\n"``
        and answer deltas. Nothing is yielded when the status is not 200.
    """
    log = logging.getLogger("chat-server-sse")

    URL = "https://hadadxyz-ai.hf.space/api/mz1a85y5n80zy5127hgsba5f3a9c2d1Np0x300vcgduqxb7ep084fygd016c9a2d16fa8b3c41gut432pvjctr75hhspjae25d6f7a8b9c0d1e2pjf43v16f3a4b5c6dd7e8fba2bdx9a0b6dv1c2d7e2b4c9f83d6a4f1bb6c152f9pe3c7a88qv5d91f3c2b765g134bp9a41ne4yx4b3vda8w074"

    NEW_MSGS, _ = CONV().msg_to_alpaca(messages, include_step_start=True, assistant_state_done=True)

    # NOTE(review): session/client/request ids, IP addresses and the turnstile
    # token below are fixed literals reused for every call - confirm this is intended.
    payload = {
        "tools": {},
        "modelId": model,
        "sessionId": "sess_7ef524b9_mlfe4ped",
        "clientId": "7ef524b98a963b507ec9f4000fdea38c-mlfe4pea",
        "requestId": "req_7ef524b9_mlfg1cpq_jjxb7p",
        "clientIp": "122.161.52.54",
        "realIp": "122.161.52.54",
        "forwardedFor": "122.161.52.54",
        "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36",
        "id": "DEFAULT_THREAD_ID",
        "messages": NEW_MSGS,
        "trigger": "submit-message",
        "metadata": {},
    }

    headers = {
        "Accept": "text/event-stream, */*",
        "Content-Type": "application/json",
        "Origin": "https://hadadxyz-ai.hf.space",
        "User-Agent": payload["userAgent"],
        "Cache-Control": "no-cache",
        "Accept-Encoding": "identity",
        "x-turnstile-token": "mlfe5357-zq9depfzhpb-e18cbvzrpid",
        "x-turnstile-verified": "true",
    }

    # Transport errors propagate to the caller, as before.
    RESP = RQ.post(URL, json=payload, headers=headers, stream=stream, timeout=timeout)

    in_reasoning = False
    try:
        if RESP.status_code != 200:
            log.warning("Adarsh_Personal: upstream returned HTTP %s", RESP.status_code)
            return

        for raw in RESP.iter_lines():
            if not raw:
                continue

            if isinstance(raw, (bytes, bytearray)):
                line = raw.decode("utf-8", errors="replace").strip()
            else:
                line = str(raw).strip()

            if not line.startswith("data:"):
                continue

            # Slice off the field name instead of splitting on "data: ":
            # the space is optional and the JSON itself may contain "data: ".
            try:
                data = json.loads(line[len("data:"):].strip())
            except ValueError:
                continue
            if not isinstance(data, dict):
                continue

            # No try/except around the yields: a bare except there would
            # swallow GeneratorExit when the consumer closes the generator.
            kind = data.get("type")
            if kind == "reasoning-delta":
                if not in_reasoning:
                    in_reasoning = True
                    yield "<think>\n"
                if "delta" in data:
                    yield data["delta"]
            elif kind == "text-delta":
                if in_reasoning:
                    in_reasoning = False
                    yield "\n</think>\n"
                if "delta" in data:
                    yield data["delta"]

        # Stream ended while still reasoning: close the marker so the output
        # never carries an unterminated <think> block.
        if in_reasoning:
            yield "\n</think>\n"
    finally:
        # Release the upstream connection even when the consumer stops early.
        close = getattr(RESP, "close", None)
        if callable(close):
            try:
                close()
            except Exception:
                log.debug("Adarsh_Personal: closing upstream response failed", exc_info=True)
| |
# Model ids exposed for the Qwen provider (QWEN).
M3 = ["qwen3-4b-thinking-2507"]


def QWEN(
    RQ: Any,
    messages: List[Dict],
    model: str = "NONE",
    max_token: int = 40960,
    stream: bool = True,
    timeout: Optional[float] = None,
):
    """Stream completion text from the teichai Qwen3 Hugging Face space.

    Reasoning events are wrapped in ``<think>`` / ``</think>`` markers so the
    caller receives one flat text stream.

    Args:
        RQ: HTTP client exposing ``post(url, headers=..., json=..., stream=...,
            timeout=...)``. The response must provide ``status_code`` and
            ``iter_lines()``.
        messages: Chat messages, forwarded verbatim.
        model: Accepted for signature parity with the other providers; unused.
        max_token: Accepted for signature parity; unused.
        stream: Forwarded to ``RQ.post``.
        timeout: Forwarded to ``RQ.post``.

    Yields:
        Text fragments: ``"<think>\\n"``, reasoning content,
        ``"\\n</think>\\n\\n"`` and answer content. Nothing is yielded when
        the status is not 200.
    """
    log = logging.getLogger("chat-server-sse")

    API_URL = "https://teichai-qwen3-4b-thinking-2507-claude-4-5-opus.hf.space/api/chat"

    payload = {
        "messages": messages,
        "searchEnabled": False,
    }

    headers = {
        "Accept": "*/*",
        "Content-Type": "application/json",
        "Origin": "https://teichai-qwen3-4b-thinking-2507-claude-4-5-opus.hf.space",
        "Referer": "https://teichai-qwen3-4b-thinking-2507-claude-4-5-opus.hf.space/",
        "User-Agent": "python-requests/2.x",
    }

    def _parse(data_text: str):
        """Return the decoded JSON event, or None when it is not valid JSON."""
        try:
            return json.loads(data_text)
        except ValueError:
            return None

    def _events(resp):
        """Yield decoded JSON events; a blank line ends an event, ``[DONE]`` ends the stream."""
        pending: List[str] = []
        for raw in resp.iter_lines():
            if raw is None:
                continue
            if isinstance(raw, (bytes, bytearray)):
                line = raw.decode("utf-8", errors="replace").strip()
            else:
                line = str(raw).strip()

            if line:
                if line.startswith("data:"):
                    pending.append(line[len("data:"):].lstrip())
                continue

            # Blank line: dispatch whatever has been buffered.
            if not pending:
                continue
            data_text = "".join(pending)
            pending = []
            if data_text == "[DONE]":
                return
            obj = _parse(data_text)
            if obj is not None:
                yield obj

        # Stream closed without a trailing blank line: do not lose the last event.
        if pending:
            data_text = "".join(pending)
            if data_text != "[DONE]":
                obj = _parse(data_text)
                if obj is not None:
                    yield obj

    # Transport errors propagate to the caller, as before.
    resp = RQ.post(API_URL, headers=headers, json=payload, stream=stream, timeout=timeout)

    in_reasoning = False
    try:
        if resp.status_code != 200:
            log.warning("QWEN: upstream returned HTTP %s", resp.status_code)
            return

        for event in _events(resp):
            if not isinstance(event, dict):
                continue

            if event.get("type") == "reasoning":
                if not in_reasoning:
                    in_reasoning = True
                    yield "<think>\n"
            elif in_reasoning:
                in_reasoning = False
                yield "\n</think>\n\n"

            # Events without content are skipped rather than raising KeyError.
            if "content" in event:
                yield event["content"]

        # Stream ended while still reasoning: close the marker.
        if in_reasoning:
            yield "\n</think>\n\n"
    finally:
        # Release the upstream connection even when the consumer stops early.
        close = getattr(resp, "close", None)
        if callable(close):
            try:
                close()
            except Exception:
                log.debug("QWEN: closing upstream response failed", exc_info=True)
| |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Provider registry: request "provider" key -> callable plus the model
# catalogue that callable serves.
# The "models" lists for "2" and "3" were swapped. QWEN talks to the Qwen3
# space (M3) and FREEGPT resolves its model against M2, so each function is
# now paired with its own catalogue. Routing by key is unchanged.
# NOTE(review): /model returns the catalogues as [M1, M2, M3], which does not
# follow the provider key order for "2"/"3" - confirm which order clients expect.
PROVIDERS: Dict[str, Dict[str, Any]] = {
    "1": {"__func__": Adarsh_Personal, "models": M1},
    "2": {"__func__": QWEN, "models": M3},
    "3": {"__func__": FREEGPT, "models": M2},
}

# Filled at startup with the callable and its sync/async/generator flags.
PROVIDER_META: Dict[str, Dict[str, Any]] = {}
|
|
class Config:
    """Server-wide defaults used when a request omits the matching field."""

    # Routing defaults.
    # NOTE(review): DEFAULT_MODEL does not appear in any of the model lists
    # visible in this file - confirm the upstream provider accepts it.
    DEFAULT_PROVIDER: str = "1"
    DEFAULT_MODEL: str = "llama-3.3-70b-versatile"

    # Generation defaults.
    DEFAULT_MAX_TOKENS: int = 512
    DEFAULT_TEMPERATURE: float = 0.7

    # Transport defaults: provider timeout in seconds, and whether replies
    # are streamed unless the request says otherwise.
    TIMEOUT: float = 30.0
    STREAM: bool = True
|
|
|
|
@dataclass
class ChatRequest:
    """Normalized view of a /chat request body."""

    # None when the client sent no key under any accepted alias.
    api_key: Optional[str]
    messages: List[Dict[str, Any]]
    model: Optional[str] = None
    provider: str = Config.DEFAULT_PROVIDER
    max_tokens: int = Config.DEFAULT_MAX_TOKENS
    temperature: float = Config.DEFAULT_TEMPERATURE
    stream: bool = Config.STREAM

    @staticmethod
    def _coerce_number(value: Any, cast: Any, default: Any) -> Any:
        """Return ``cast(value)``, or *default* when value is None or not convertible."""
        if value is None:
            return default
        try:
            return cast(value)
        except (TypeError, ValueError):
            return default

    @staticmethod
    def from_dict(payload: Dict[str, Any]) -> "ChatRequest":
        """Build a request from a decoded JSON body.

        Accepted aliases: ``api_key``/``key``/``apikey``,
        ``messages``/``message``/``msgs`` and ``model_name``/``model``.
        A payload that is not a JSON object yields a request with no key and
        no messages instead of raising, so the caller can reject it normally.
        """
        if not isinstance(payload, dict):
            payload = {}

        api_key = payload.get("api_key") or payload.get("key") or payload.get("apikey")
        messages = payload.get("messages") or payload.get("message") or payload.get("msgs")
        model = payload.get("model_name") or payload.get("model")
        provider = str(payload.get("provider") or Config.DEFAULT_PROVIDER)

        if messages is None:
            messages = []
        elif isinstance(messages, dict):
            messages = [messages]
        elif not isinstance(messages, list):
            # A bare string (or other scalar) is one user message. Passing it
            # through would make the providers iterate it character by character.
            messages = [{"role": "user", "content": str(messages)}]

        max_tokens = ChatRequest._coerce_number(
            payload.get("max_tokens", Config.DEFAULT_MAX_TOKENS), int, Config.DEFAULT_MAX_TOKENS
        )
        temperature = ChatRequest._coerce_number(
            payload.get("temperature", Config.DEFAULT_TEMPERATURE), float, Config.DEFAULT_TEMPERATURE
        )

        stream = payload.get("stream", Config.STREAM)
        if isinstance(stream, str):
            # "false" / "0" sent as strings must not count as truthy.
            stream = stream.strip().lower() not in ("", "0", "false", "no", "off")
        else:
            stream = bool(stream)

        return ChatRequest(
            api_key=api_key,
            messages=messages,
            model=model,
            provider=provider,
            max_tokens=max_tokens,
            temperature=temperature,
            stream=stream,
        )
|
|
|
|
# Process-wide aiohttp session; created on startup and closed on shutdown.
GLOBAL_AIOHTTP: Optional[aiohttp.ClientSession] = None


@app.on_event("startup")
async def on_startup():
    """Open the shared aiohttp session and classify every registered provider.

    For each entry in PROVIDERS the callable is inspected once, and
    PROVIDER_META records whether it is an async generator function, a
    coroutine function, a sync generator function, or a plain sync function.
    """
    global GLOBAL_AIOHTTP, PROVIDER_META
    logger.info("startup: create global aiohttp session and analyze providers")
    GLOBAL_AIOHTTP = aiohttp.ClientSession()

    for provider_id, spec in PROVIDERS.items():
        fn = spec["__func__"]
        is_agen = inspect.isasyncgenfunction(fn)
        is_coro = inspect.iscoroutinefunction(fn)
        is_gen = inspect.isgeneratorfunction(fn)
        PROVIDER_META[provider_id] = {
            "func": fn,
            "is_async_gen_fn": is_agen,
            "is_coroutine_fn": is_coro,
            "is_generator_fn": is_gen,
            "is_sync_fn": not (is_coro or is_agen or is_gen),
        }

    # Log the flags only; the callables themselves are left out.
    summary = {}
    for provider_id, meta in PROVIDER_META.items():
        summary[provider_id] = {name: flag for name, flag in meta.items() if name != "func"}
    logger.info("provider meta ready: %s", summary)
|
|
|
|
@app.on_event("shutdown")
async def on_shutdown():
    """Close the shared aiohttp session if it exists and is still open."""
    global GLOBAL_AIOHTTP
    logger.info("shutdown: close global aiohttp session")
    session = GLOBAL_AIOHTTP
    if not session or session.closed:
        return
    await session.close()
|
|
|
|
| async def _stream_sync_generator_in_thread(func, *args, **kwargs) -> AsyncGenerator[bytes, None]: |
| """ |
| Run a sync generator in a thread and stream items back via an asyncio.Queue. |
| This allows streaming without blocking the event loop. |
| """ |
| loop = asyncio.get_running_loop() |
| q: asyncio.Queue = asyncio.Queue(maxsize=32) |
| sentinel = object() |
|
|
| def worker(): |
| try: |
| gen = func(*args, **kwargs) |
| |
| if gen is None: |
| loop.call_soon_threadsafe(q.put_nowait, sentinel) |
| return |
| |
| for item in gen: |
| loop.call_soon_threadsafe(q.put_nowait, item) |
| except Exception as e: |
| |
| loop.call_soon_threadsafe(q.put_nowait, e) |
| finally: |
| loop.call_soon_threadsafe(q.put_nowait, sentinel) |
|
|
| |
| thread_task = loop.run_in_executor(None, worker) |
|
|
| |
| while True: |
| item = await q.get() |
| if item is sentinel: |
| break |
| if isinstance(item, Exception): |
| |
| raise item |
| if item is None: |
| continue |
| if isinstance(item, bytes): |
| yield item |
| elif isinstance(item, str): |
| yield item.encode("utf-8") |
| else: |
| yield str(item).encode("utf-8") |
|
|
| |
| await asyncio.shield(thread_task) |
|
|
|
|
def _encode_chunk(item: Any) -> Optional[bytes]:
    """Convert one provider item to bytes; None means "nothing to emit"."""
    if item is None:
        return None
    if isinstance(item, (bytes, bytearray)):
        return bytes(item)
    if isinstance(item, str):
        return item.encode("utf-8")
    return str(item).encode("utf-8")


async def _stream_result(res: Any) -> AsyncGenerator[bytes, None]:
    """Yield the bytes chunks of an already-obtained provider result.

    Handles async generators, sync generators/iterables (except str, bytes
    and dict, which count as single values) and plain scalar results.
    """
    if res is None:
        return

    if inspect.isasyncgen(res):
        async for item in res:
            chunk = _encode_chunk(item)
            if chunk is not None:
                yield chunk
        return

    if inspect.isgenerator(res) or (hasattr(res, "__iter__") and not isinstance(res, (str, bytes, dict))):
        for item in res:
            chunk = _encode_chunk(item)
            if chunk is not None:
                yield chunk
        return

    yield _encode_chunk(res)


async def _call_provider_and_stream(
    provider_key: str,
    messages: List[Dict],
    model: str,
    max_token: int,
    stream_flag: bool,
    timeout: float,
) -> AsyncGenerator[bytes, None]:
    """Invoke a provider and yield its output as bytes as soon as it arrives.

    The provider is called as ``func(Requests, messages=..., model=...,
    max_token=..., stream=..., timeout=...)``. How it is driven depends on
    the flags recorded in PROVIDER_META:

    * async generator function - iterated directly;
    * sync generator function - run in a worker thread;
    * coroutine function - awaited under ``timeout``, result then streamed;
    * plain function - run in a thread under ``timeout``, result then streamed.

    Timeouts and provider exceptions do not propagate. They are logged and
    emitted as a ``[server_timeout] ...`` / ``[server_error] ...`` chunk.

    Raises:
        ValueError: ``provider_key`` is not present in PROVIDER_META.
    """
    if provider_key not in PROVIDER_META:
        raise ValueError(f"Unknown provider '{provider_key}'")

    meta = PROVIDER_META[provider_key]
    func = meta["func"]

    kwargs = dict(messages=messages, model=model, max_token=max_token, stream=stream_flag, timeout=timeout)

    try:
        if meta["is_async_gen_fn"]:
            async for chunk in _stream_result(func(Requests, **kwargs)):
                yield chunk
            return

        if meta["is_generator_fn"]:
            # The generator body does blocking I/O, so it runs in a thread.
            async for chunk in _stream_sync_generator_in_thread(func, Requests, **kwargs):
                yield chunk
            return

        if meta["is_coroutine_fn"]:
            res = await asyncio.wait_for(func(Requests, **kwargs), timeout=timeout)
        else:
            # Plain sync callable: keep the blocking call off the event loop.
            res = await asyncio.wait_for(asyncio.to_thread(func, Requests, **kwargs), timeout=timeout)

        async for chunk in _stream_result(res):
            yield chunk

    except asyncio.TimeoutError:
        err = f"[server_timeout] provider {provider_key} exceeded {timeout}s\n"
        logger.warning(err.strip())
        yield err.encode("utf-8")
    except Exception as e:
        logger.exception("provider error")
        err = f"[server_error] {type(e).__name__}: {e}\n"
        yield err.encode("utf-8")
|
|
|
|
@app.post("/chat")
async def chat_endpoint(request: Request):
    """Proxy a chat request to the selected provider.

    The body must be a JSON object with an API key and messages (see
    ``ChatRequest.from_dict`` for the accepted field names). With ``stream``
    truthy the reply is an event stream of ``data: {"response": ...}`` events
    followed by a final ``[DONE]`` line. Otherwise it is a single JSON object
    ``{"text": ...}``.

    Raises:
        HTTPException: 400 for invalid JSON, a non-object body, a missing key
            or messages, or an unknown provider.
    """
    try:
        payload = _loads(await request.body())
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"invalid json: {e}") from e

    # A JSON array or scalar parses fine but is not a request; reject it here
    # instead of failing later with a 500.
    if not isinstance(payload, dict):
        raise HTTPException(status_code=400, detail="request body must be a JSON object")

    req = ChatRequest.from_dict(payload)
    # NOTE(review): the key is only checked for presence, never validated.
    if not req.api_key or not req.messages:
        raise HTTPException(status_code=400, detail="api_key and messages required")

    provider_key = req.provider
    # Reject unknown providers before the response starts. Inside the stream
    # the same condition surfaces as a ValueError after headers are sent.
    if provider_key not in PROVIDER_META:
        raise HTTPException(status_code=400, detail=f"unknown provider '{provider_key}'")

    def provider_chunks():
        return _call_provider_and_stream(
            provider_key=provider_key,
            messages=req.messages,
            model=req.model or Config.DEFAULT_MODEL,
            max_token=req.max_tokens,
            stream_flag=req.stream,
            timeout=Config.TIMEOUT,
        )

    def as_text(chunk) -> str:
        if isinstance(chunk, (bytes, bytearray)):
            return chunk.decode("utf-8", errors="ignore")
        return str(chunk)

    if req.stream:

        async def sse_stream():
            async for raw_chunk in provider_chunks():
                payload_obj = {"response": as_text(raw_chunk)}
                try:
                    json_str = _dumps(payload_obj)
                except Exception:
                    # Fall back to the stdlib encoder if the fast path rejects the text.
                    json_str = json.dumps(payload_obj)
                yield f"data: {json_str}\n\n".encode("utf-8")

            # NOTE(review): kept byte-for-byte for existing clients. This line
            # is not framed as an SSE event ("data: [DONE]\n\n").
            yield ("[DONE]\n").encode("utf-8")

        return StreamingResponse(sse_stream(), media_type="text/event-stream")

    collected = [as_text(chunk) async for chunk in provider_chunks()]
    return JSONResponse({"text": "".join(collected)})
|
|
|
|
@app.get("/model")
async def model():
    """Return the three model catalogues in the fixed order M1, M2, M3."""
    catalogues = [M1, M2, M3]
    return {"models": catalogues}
|
|
|
|
@app.get("/health")
async def health_check():
    """Liveness probe: always answers with status ok."""
    return dict(status="ok")
|
|