Spaces:

scikit-plots
/

ai

Running

App Files Files Community

ai / app.py

celik-muhammed

Upload 5 files

56647d5 verified about 9 hours ago

raw

history blame contribute delete

49.3 kB

	# scikit-plots/ai · _hf_spaces_proxy/app.py v6.0.0
	#
	# Thin OpenAI-compatible reverse proxy for sphinx-ai-assistant.
	#
	# THREE-PATH ROUTING (evaluated in order)
	# ─────────────────────────────────────────
	# Path 1 — BACKEND_URL set (explicit custom backend override)
	# Forward verbatim to BACKEND_URL.
	# HF_TOKEN injected as Bearer token when also set.
	# Read timeout: PROXY_TIMEOUT (default 600 s).
	#
	# Path 2 — Model namespace in HF_SPACES_MODEL_NAMESPACES
	# Model owner matches a custom namespace (default: "scikit-plots").
	# Forward to HF_SPACES_MODEL_URL (the scikit-plots/ai-model HF Space).
	# CPU inference on a 7B model takes 4-5 minutes.
	# Read timeout: PATH2_TIMEOUT (default 600 s).
	#
	# Path 3 — HF Serverless Inference API (default fallback)
	# Model has a registered HF Inference Provider (openai/, Qwen/, etc.).
	# Build {HF_BASE}/{model}/v1/chat/completions and inject HF_TOKEN.
	# Read timeout: PATH3_TIMEOUT (default 120 s).
	#
	# WHY PER-PATH TIMEOUTS MATTER (root cause of the "network error" in v4)
	# ─────────────────────────────────────────────────────────────────────────
	# v4.0.0 used a single PROXY_TIMEOUT=120 s applied to ALL paths.
	# The ai-model Space runs a 7B model on CPU basic hardware. Cold-start
	# inference requires ~50 s tokenizer load + ~50 s model load + ~4.5 min
	# generation. Every request to the ai-model Space timed out at 120 s and
	# the browser reported "Sorry, something went wrong: network error".
	# v5.0.0 fixes this by:
	# 1. Raising DEFAULT_PROXY_TIMEOUT from 120 to 600 s.
	# 2. Adding per-path timeouts so Path 3 (fast GPU) stays at 120 s
	# while Path 2 (slow CPU) gets the full 600 s.
	# 3. Using httpx per-request timeouts so a single shared client
	# serves both fast and slow paths without interference.
	#
	# ENVIRONMENT VARIABLES (Space → Settings → Repository secrets)
	# ─────────────────────────────────────────────────────────────
	# HF_TOKEN Required for Path 3; optional for Path 2.
	# HF_SPACES_MODEL_URL Path 2 destination URL.
	# Default: https://scikit-plots-ai-model.hf.space/v1/chat/completions
	# HF_SPACES_MODEL_NAMESPACES Comma-separated owner namespaces for Path 2.
	# Default: scikit-plots
	# BACKEND_URL Path 1 override (all requests go here when set).
	# HF_BASE HF Serverless API base URL.
	# Default: https://router.huggingface.co
	# DEFAULT_MODEL Fallback model when request body omits "model".
	# Default: scikit-plots/Qwen2.5-Coder-32B-Instruct
	# PROXY_TIMEOUT Path 1 read timeout in seconds. Default: 600.
	# PATH2_TIMEOUT Path 2 read timeout in seconds. Default: 600.
	# PATH3_TIMEOUT Path 3 read timeout in seconds. Default: 120.
	# PROXY_CONNECT_TIMEOUT TCP handshake timeout. Default: 10.
	# PROXY_WRITE_TIMEOUT Request body upload timeout. Default: 30.
	# PROXY_POOL_TIMEOUT Connection pool acquire timeout. Default: 10.
	# ALLOWED_ORIGINS Comma-separated CORS origins. Default: *.
	# MAX_BODY_BYTES Maximum accepted body size. Default: 10485760.
	#
	# Authors: The scikit-plots developers
	# SPDX-License-Identifier: BSD-3-Clause

	"""
	FastAPI reverse proxy for sphinx-ai-assistant (scikit-plots/ai HF Space).

	Routes browser POST requests through three ordered paths with independent
	per-path read timeouts:

	* Path 1 — ``BACKEND_URL`` set: explicit custom backend.
	* Path 2 — Model namespace in ``HF_SPACES_MODEL_NAMESPACES``:
	forward to ``HF_SPACES_MODEL_URL`` (the ``scikit-plots/ai-model`` Space,
	CPU inference — read timeout 600 s by default).
	* Path 3 — Default: HF Serverless Inference API (GPU, read timeout 120 s).

	Notes
	-----
	Developer note — per-path timeouts
	``_resolve_upstream_url`` returns ``(url, headers, read_timeout_s)``.
	``_forward`` builds an ``httpx.Timeout`` from read_timeout_s and
	the shared connect/write/pool values, then passes it per-request
	so the shared client never imposes a global ceiling. This means
	concurrent slow (Path 2) and fast (Path 3) requests never block each
	other.

	Developer note — shared HTTP client
	A single :class:`httpx.AsyncClient` is created during lifespan and
	shared across all requests. It is created with ``timeout=None`` so
	all timeout control lives in each request's own ``httpx.Timeout``
	object. Streaming uses ``client.stream()`` which closes the response
	body (not the client) on context exit, so concurrent SSE requests are
	safe.

	Developer note — explicit error handling
	``_forward`` catches ``httpx.ReadTimeout``, ``httpx.ConnectTimeout``,
	and ``httpx.RequestError`` individually and returns meaningful JSON
	errors with appropriate HTTP status codes so the browser widget can
	display a useful message instead of a generic "network error".
	"""

	from __future__ import annotations

	import asyncio
	import json
	import logging
	import os
	import uuid
	from collections.abc import AsyncGenerator
	from contextlib import asynccontextmanager
	from typing import Any

	import httpx
	from fastapi import Depends, FastAPI, HTTPException, Request
	from fastapi.middleware.cors import CORSMiddleware
	from fastapi.responses import JSONResponse, Response, StreamingResponse

	# _shared_logic.py must live alongside this file.
	try:
	from _shared_logic import ( # type: ignore[import]
	DEFAULT_HF_BASE,
	DEFAULT_HF_SPACES_MODEL_NAMESPACES,
	DEFAULT_HF_SPACES_MODEL_URL,
	DEFAULT_MAX_BODY_BYTES,
	DEFAULT_MODEL,
	DEFAULT_PATH2_READ_TIMEOUT,
	DEFAULT_PATH3_READ_TIMEOUT,
	DEFAULT_PROXY_TIMEOUT,
	PROXY_VERSION,
	_resolve_upstream_url,
	_safe_float,
	_safe_int,
	_token_log_fragment,
	_validate_env,
	)
	except ImportError:
	from .._shared_logic import ( # type: ignore[import]
	DEFAULT_HF_BASE,
	DEFAULT_HF_SPACES_MODEL_NAMESPACES,
	DEFAULT_HF_SPACES_MODEL_URL,
	DEFAULT_MAX_BODY_BYTES,
	DEFAULT_MODEL,
	DEFAULT_PATH2_READ_TIMEOUT,
	DEFAULT_PATH3_READ_TIMEOUT,
	DEFAULT_PROXY_TIMEOUT,
	PROXY_VERSION,
	_resolve_upstream_url,
	_safe_float,
	_safe_int,
	_token_log_fragment,
	_validate_env,
	)


	# ─────────────────────────────────────────────────────────────────────────────
	# Helpers — placed before configuration so they are available at module scope
	# ─────────────────────────────────────────────────────────────────────────────

	def _client_ip(request: Request) -> str:
	"""Extract the real client IP from the request.

	Parameters
	----------
	request : fastapi.Request
	Incoming HTTP request.

	Returns
	-------
	str
	Best-effort client IP string; ``"unknown"`` when unavailable.

	Notes
	-----
	Developer: HF Spaces sits behind a proxy so ``request.client.host``
	is the proxy IP, not the user IP. ``X-Forwarded-For`` is the correct
	source — take the FIRST value only (leftmost = original client;
	rightmost values can be spoofed by intermediaries).
	"""
	xff = request.headers.get("x-forwarded-for", "")
	if xff:
	return xff.split(",")[0].strip()
	if request.client:
	return request.client.host or "unknown"
	return "unknown"


	# ─────────────────────────────────────────────────────────────────────────────
	# Logging
	# ─────────────────────────────────────────────────────────────────────────────

	class _StructuredFormatter(logging.Formatter):
	"""Emit one JSON object per log record to stdout.

	Parameters
	----------
	args, *kwargs
	Forwarded to :class:`logging.Formatter`.

	Notes
	-----
	Developer: JSON format is required for machine-parseable log ingestion
	(HF Spaces log export, Datadog, etc.). Text-format lines require regex
	in log queries; JSON fields are natively queryable.

	The ``exc_info`` key is omitted entirely when no exception is attached
	so log consumers do not need to handle a null field on every record.
	"""

	def format(self, record: logging.LogRecord) -> str: # noqa: A003
	payload: dict = {
	"ts": self.formatTime(record, datefmt="%Y-%m-%dT%H:%M:%S"),
	"level": record.levelname,
	"logger": record.name,
	"event": record.getMessage(),
	}
	if record.exc_info:
	payload["exc_info"] = self.formatException(record.exc_info)
	return json.dumps(payload, ensure_ascii=False)


	# Install one JSON-formatting stream handler on the root logger. Assigning
	# ``handlers`` replaces any handlers that were installed earlier, so every
	# record propagated to the root logger is formatted by _StructuredFormatter.
	# NOTE(review): ``logging.StreamHandler()`` without an explicit stream writes
	# to ``sys.stderr``, not stdout — confirm that matches the intended log sink.
	_handler = logging.StreamHandler()
	_handler.setFormatter(_StructuredFormatter())
	logging.root.handlers = [_handler]
	logging.root.setLevel(logging.INFO)
	# Module logger used by the proxy code below.
	logger = logging.getLogger(__name__)


	# ─────────────────────────────────────────────────────────────────────────────
	# Configuration — read once at module import, never at request time
	# ─────────────────────────────────────────────────────────────────────────────

	#: Explicit custom backend URL (Path 1). An empty value leaves Path 1 unset.
	BACKEND_URL: str = os.environ.get("BACKEND_URL", "").strip()

	#: HuggingFace API token. Required for Path 3; optional for Path 2.
	HF_TOKEN: str = os.environ.get("HF_TOKEN", "").strip()

	#: HF Serverless Inference API base URL (no trailing slash).
	#: Surrounding whitespace is stripped and a blank value falls back to the
	#: default, mirroring how ``DEFAULT_MODEL`` is handled below; otherwise a
	#: blank ``HF_BASE`` secret would produce upstream URLs with no host.
	HF_BASE: str = (
	    os.environ.get("HF_BASE", DEFAULT_HF_BASE).strip().rstrip("/")
	    or DEFAULT_HF_BASE.rstrip("/")
	)

	#: Fallback model when request body omits ``model``.
	#: A blank environment value falls back to the imported default.
	DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", DEFAULT_MODEL).strip() or DEFAULT_MODEL

	#: Path 2 destination URL — the custom ai-model HF Space.
	HF_SPACES_MODEL_URL: str = os.environ.get(
	    "HF_SPACES_MODEL_URL", DEFAULT_HF_SPACES_MODEL_URL
	).strip()

	#: Raw comma-separated namespace list as read from the environment.
	_raw_namespaces: str = os.environ.get(
	    "HF_SPACES_MODEL_NAMESPACES",
	    ",".join(DEFAULT_HF_SPACES_MODEL_NAMESPACES),
	)
	#: Parsed model owner namespaces routed to HF_SPACES_MODEL_URL (Path 2).
	#: Blank entries are dropped; if nothing remains the defaults are used.
	HF_SPACES_MODEL_NAMESPACES: tuple[str, ...] = (
	    tuple(ns.strip() for ns in _raw_namespaces.split(",") if ns.strip())
	    or DEFAULT_HF_SPACES_MODEL_NAMESPACES
	)

	#: Maximum accepted request body size (bytes).
	MAX_BODY_BYTES: int = _safe_int(
	    os.environ.get("MAX_BODY_BYTES"),
	    DEFAULT_MAX_BODY_BYTES,
	)

	# ── Per-path read timeouts ────────────────────────────────────────────────────
	# All timeouts are parsed with ``_safe_float`` so that every setting accepts
	# fractional seconds and handles bad input the same way.
	# NOTE(review): assumes ``_safe_float(value, default)`` returns ``default``
	# for a missing or unparseable value, as its use for PATH2/PATH3 suggests —
	# confirm against ``_shared_logic``.

	#: Path 1 (BACKEND_URL) read timeout in seconds.
	_proxy_timeout_secs: float = float(
	    _safe_float(
	        os.environ.get("PROXY_TIMEOUT"),
	        DEFAULT_PROXY_TIMEOUT,
	    )
	)

	#: Path 2 (ai-model Space, CPU inference) read timeout in seconds.
	#: Default 600 s — covers 4-5 min CPU inference with 1 min headroom.
	_path2_timeout_secs: float = _safe_float(
	    os.environ.get("PATH2_TIMEOUT"),
	    DEFAULT_PATH2_READ_TIMEOUT,
	)

	#: Path 3 (HF Serverless API, GPU) read timeout in seconds.
	#: Default 120 s — generous margin for GPU-backed inference (30-90 s typical).
	_path3_timeout_secs: float = _safe_float(
	    os.environ.get("PATH3_TIMEOUT"),
	    DEFAULT_PATH3_READ_TIMEOUT,
	)

	# ── Shared phase timeouts (apply to all paths) ────────────────────────────────
	# Previously parsed with a bare ``float(...)``, which raised ``ValueError`` at
	# import time (taking the whole Space down) on a malformed secret.

	#: TCP handshake timeout in seconds.
	_connect_timeout_secs: float = float(
	    _safe_float(os.environ.get("PROXY_CONNECT_TIMEOUT"), 10.0)
	)
	#: Request body upload timeout in seconds.
	_write_timeout_secs: float = float(
	    _safe_float(os.environ.get("PROXY_WRITE_TIMEOUT"), 30.0)
	)
	#: Connection pool acquire timeout in seconds.
	_pool_timeout_secs: float = float(
	    _safe_float(os.environ.get("PROXY_POOL_TIMEOUT"), 10.0)
	)


	# ─────────────────────────────────────────────────────────────────────────────
	# CORS
	# ─────────────────────────────────────────────────────────────────────────────

	# ALLOWED_ORIGINS is either the literal wildcard "*" or a comma-separated
	# list of origins; blank list entries are discarded.
	_raw_origins: str = os.environ.get("ALLOWED_ORIGINS", "*").strip()
	if _raw_origins == "*":
	    _allowed_origins: list[str] = ["*"]
	else:
	    _allowed_origins = list(filter(None, map(str.strip, _raw_origins.split(","))))

	#: HuggingFace Dataset repo for training contributions.
	#: POST /v1/contribute answers 503 while this is empty.
	TRAINING_DATASET_REPO: str = os.environ.get("TRAINING_DATASET_REPO", "").strip()

	#: Current consent version string. Must match the JS constant _TRAINING_CONSENT_VERSION.
	#: Increment when the consent UI text changes materially; /v1/contribute
	#: rejects payloads whose ``consentVersion`` differs from this value.
	TRAINING_CONSENT_VERSION: str = "v1.0"

	#: Maximum records per contribution POST.
	MAX_CONTRIBUTION_RECORDS: int = 100

	#: In-memory per-IP rate-limit store for contribution endpoint.
	#: Keys: IP string. Values: (count, window_start_timestamp).
	#: Per-process only; guarded by ``_contrib_rl_lock``.
	_contrib_rl: dict[str, tuple[int, float]] = {}
	_contrib_rl_lock = asyncio.Lock()

	#: In-memory per-IP rate-limit store for share endpoint.
	#: Same (count, window_start_timestamp) value shape as ``_contrib_rl``.
	_share_rl: dict[str, tuple[int, float]] = {}
	_share_rl_lock = asyncio.Lock()

	#: In-memory conversation share store.
	#: Keys: UUID hex string. Values: share metadata dict including ``expiresAt_ts``.
	#: Ephemeral — clears on process restart. Stale entries are evicted lazily on write.
	_share_store: dict[str, dict] = {}
	_share_store_lock = asyncio.Lock()

	#: In-memory per-IP rate-limit store for feedback endpoint.
	#: NOTE(review): the feedback handler is not visible here; presumably it uses
	#: the same (count, window_start_timestamp) scheme — confirm.
	_feedback_rl: dict[str, tuple[int, float]] = {}
	_feedback_rl_lock = asyncio.Lock()


	# ─────────────────────────────────────────────────────────────────────────────
	# Startup validation — fail fast with actionable messages
	# ─────────────────────────────────────────────────────────────────────────────

	# Delegate hard validation of the routing configuration to _shared_logic.
	_validate_env(BACKEND_URL, HF_TOKEN, HF_SPACES_MODEL_URL)

	# With neither an explicit backend nor a token, only the custom-namespace
	# path can work; warn once at import so the operator sees it in the logs.
	if not (BACKEND_URL or HF_TOKEN):
	    _model_space_label = (
	        HF_SPACES_MODEL_URL if HF_SPACES_MODEL_URL else "<HF_SPACES_MODEL_URL not set>"
	    )
	    logger.warning(
	        "HF_TOKEN is not set. Requests to standard HF Inference API models "
	        "(e.g. openai/gpt-oss-20b, Qwen/*) will fail with 401 Unauthorized. "
	        "Only models in namespaces %s will be served via %s.",
	        list(HF_SPACES_MODEL_NAMESPACES),
	        _model_space_label,
	    )


	# ─────────────────────────────────────────────────────────────────────────────
	# Shared HTTP client — lifecycle managed by FastAPI lifespan
	# ─────────────────────────────────────────────────────────────────────────────

	#: Module-level reference to the shared httpx client.
	#: ``None`` until the FastAPI lifespan handler assigns it, and reset to
	#: ``None`` again when the application shuts down.
	_http_client: httpx.AsyncClient | None = None


	@asynccontextmanager
	async def _lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
	    """
	    Create and close the shared HTTP client on application startup / shutdown.

	    Parameters
	    ----------
	    app : FastAPI
	        The FastAPI application instance (unused; required by the lifespan
	        protocol).

	    Yields
	    ------
	    None
	        Control is handed to the running application while the client is open.

	    Notes
	    -----
	    Developer note — The client is created with ``timeout=None`` so
	    that every request supplies its own :class:`httpx.Timeout` object.
	    Without the explicit ``timeout=None`` httpx applies its own default
	    timeout to any call that forgets to pass one, which would silently cut
	    off slow CPU inference.

	    Developer note — The effective routing and timeout configuration is
	    logged once at startup; the token is logged only through
	    ``_token_log_fragment``.
	    """
	    global _http_client  # noqa: PLW0603
	    client = httpx.AsyncClient(timeout=None)
	    _http_client = client
	    logger.info(
	        "Proxy v%s started. HTTP client ready (timeout=per-request).",
	        PROXY_VERSION,
	    )
	    logger.info(
	        "Routing: backend_url=%r | hf_spaces_model_url=%r | "
	        "hf_spaces_namespaces=%r | hf_token=%s | default_model=%r",
	        BACKEND_URL or None,
	        HF_SPACES_MODEL_URL or None,
	        list(HF_SPACES_MODEL_NAMESPACES),
	        _token_log_fragment(HF_TOKEN),
	        DEFAULT_MODEL,
	    )
	    logger.info(
	        "Timeouts (seconds): path1=%s | path2=%s | path3=%s | "
	        "connect=%s | write=%s | pool=%s",
	        _proxy_timeout_secs,
	        _path2_timeout_secs,
	        _path3_timeout_secs,
	        _connect_timeout_secs,
	        _write_timeout_secs,
	        _pool_timeout_secs,
	    )
	    try:
	        yield
	    finally:
	        # Close the client created above, then clear the module reference so
	        # late callers get the explicit "not initialised" error.
	        await client.aclose()
	        _http_client = None
	        logger.info("Proxy shutdown. HTTP client closed.")


	# ─────────────────────────────────────────────────────────────────────────────
	# Application
	# ─────────────────────────────────────────────────────────────────────────────

	# The ASGI application. Startup/shutdown of the shared HTTP client is wired
	# in through ``lifespan``; the interactive documentation routes are turned
	# off by passing ``None`` for ``docs_url`` and ``redoc_url``.
	app = FastAPI(
	title="sphinx-ai-assistant proxy",
	description=(
	"Thin OpenAI-compatible reverse proxy for sphinx-ai-assistant. "
	"Routes to HF Serverless Inference API, a custom ai-model Space, "
	"or an explicit backend URL based on the model namespace."
	),
	version=PROXY_VERSION,
	lifespan=_lifespan,
	docs_url=None,
	redoc_url=None,
	)

	# CORS policy for the browser widget. Origins come from ALLOWED_ORIGINS
	# (parsed into ``_allowed_origins``); credentials are never allowed.
	app.add_middleware(
	CORSMiddleware,
	allow_origins=_allowed_origins,
	allow_methods=["GET", "POST", "OPTIONS"],
	# Authorization added for write endpoints (POST /v1/feedback, POST /v1/contribute)
	# that validate a Bearer token. Without this the browser preflight rejects
	# requests containing Authorization headers before the handler runs.
	allow_headers=["Content-Type", "Authorization"],
	allow_credentials=False,
	)


	# ─────────────────────────────────────────────────────────────────────────────
	# Helpers
	# ─────────────────────────────────────────────────────────────────────────────


	def _resolve_url(body: bytes) -> tuple[str, dict[str, str], float]:
	    """
	    Resolve the upstream target for *body* using the module configuration.

	    Binds the module-level settings to :func:`_resolve_upstream_url` so that
	    route handlers only have to supply the request body.

	    Parameters
	    ----------
	    body : bytes
	        Raw JSON request body from the browser.

	    Returns
	    -------
	    url : str
	        Fully-qualified upstream endpoint URL.
	    headers : dict[str, str]
	        HTTP headers for the upstream POST request.
	    read_timeout_s : float
	        Per-path read timeout in seconds.

	    See Also
	    --------
	    _shared_logic._resolve_upstream_url : Full three-path routing logic.
	    """
	    routing_config = {
	        "backend_url": BACKEND_URL,
	        "hf_token": HF_TOKEN,
	        "hf_base": HF_BASE,
	        "default_model": DEFAULT_MODEL,
	        "hf_spaces_model_url": HF_SPACES_MODEL_URL,
	        "hf_spaces_model_namespaces": HF_SPACES_MODEL_NAMESPACES,
	        "proxy_timeout": _proxy_timeout_secs,
	        "path2_read_timeout": _path2_timeout_secs,
	        "path3_read_timeout": _path3_timeout_secs,
	    }
	    return _resolve_upstream_url(body, **routing_config)


	def _make_timeout(read_s: float) -> httpx.Timeout:
	    """
	    Assemble the :class:`httpx.Timeout` used for one upstream request.

	    Parameters
	    ----------
	    read_s : float
	        Read timeout in seconds for this specific request.

	    Returns
	    -------
	    httpx.Timeout
	        Timeout with all four phases (connect, read, write, pool) set.

	    Notes
	    -----
	    Developer note — only the read phase differs per routing path; the
	    connect, write and pool phases reuse the module-wide settings.
	    """
	    phases = {
	        "connect": _connect_timeout_secs,
	        "read": read_s,
	        "write": _write_timeout_secs,
	        "pool": _pool_timeout_secs,
	    }
	    return httpx.Timeout(**phases)


	async def _validated_body(request: Request) -> bytes:
	    """
	    FastAPI dependency: read the request body while enforcing the size limit.

	    Parameters
	    ----------
	    request : Request
	        The incoming FastAPI request.

	    Returns
	    -------
	    bytes
	        The raw request body.

	    Raises
	    ------
	    HTTPException
	        HTTP 413 when the declared ``Content-Length`` or the bytes actually
	        received exceed :data:`MAX_BODY_BYTES`.

	    Notes
	    -----
	    Developer note — the body is consumed chunk by chunk via
	    ``request.stream()`` and the read is aborted as soon as the limit is
	    crossed. Calling ``request.body()`` first would buffer the entire
	    upload in memory before the check runs, so a chunked upload (no
	    ``Content-Length``) or an understated header could bypass the guard.
	    """
	    declared = _safe_int(request.headers.get("content-length"), -1)
	    if declared > MAX_BODY_BYTES:
	        # Cheap rejection before reading anything from the socket.
	        raise HTTPException(
	            status_code=413,
	            detail=(
	                f"Request body too large (Content-Length: {declared:,} bytes). "
	                f"Maximum allowed: {MAX_BODY_BYTES:,} bytes."
	            ),
	        )

	    chunks: list[bytes] = []
	    received = 0
	    async for chunk in request.stream():
	        received += len(chunk)
	        if received > MAX_BODY_BYTES:
	            raise HTTPException(
	                status_code=413,
	                detail=(
	                    f"Request body too large (more than {MAX_BODY_BYTES:,} bytes received). "
	                    f"Maximum allowed: {MAX_BODY_BYTES:,} bytes."
	                ),
	            )
	        chunks.append(chunk)
	    return b"".join(chunks)


	def _sse_error_frame(status: int, message: str, err_id: str | None = None) -> bytes:
	    """Encode an error as one complete, JSON-safe SSE ``data:`` frame."""
	    payload = {
	        "id": f"err-{err_id or uuid.uuid4().hex}",
	        "error": {"status": status, "message": message},
	    }
	    return f"data: {json.dumps(payload)}\n\n".encode()


	def _upstream_error_response(status_code: int, error_type: str, message: str) -> JSONResponse:
	    """Build the JSON error body returned for failed non-streaming requests."""
	    return JSONResponse(
	        status_code=status_code,
	        content={"error": {"type": error_type, "message": message}},
	    )


	async def _forward(body: bytes) -> Response:
	    """
	    Forward body to the resolved upstream and return the response.

	    Handles both non-streaming (JSON) and streaming (SSE) responses
	    by detecting ``"stream": true`` in the request body.

	    Parameters
	    ----------
	    body : bytes
	        Raw JSON request body from the browser.

	    Returns
	    -------
	    fastapi.Response
	        Non-streaming: the upstream body with its status code and
	        content-type. Streaming: a
	        :class:`~fastapi.responses.StreamingResponse` with status 200 that
	        relays upstream bytes, or a single SSE error frame on failure.

	    Raises
	    ------
	    RuntimeError
	        When the shared HTTP client has not been created by the lifespan
	        handler.

	    Notes
	    -----
	    Developer note — Error handling is explicit and specific:

	    * ``httpx.ReadTimeout`` → HTTP 504 with actionable timeout message.
	    * ``httpx.ConnectTimeout`` → HTTP 504 with connect-specific message.
	    * ``httpx.RequestError`` → HTTP 502 Bad Gateway.

	    The two timeout handlers must stay ahead of ``httpx.RequestError``
	    because they are subclasses of it.

	    Developer note — In streaming mode the HTTP status has already been
	    sent as 200 by the time an upstream failure is known, so failures are
	    reported in-band as an SSE frame whose JSON carries ``status`` and an
	    ``err-<uuid>`` id that is also written to the log for correlation.

	    Developer note — SSE error frames are serialised with ``json.dumps``
	    and emitted as a single ``bytes`` chunk. Hand-assembled JSON would
	    break when the interpolated URL contains a quote, and splitting one
	    event over several chunks can leave the browser with a partial frame.
	    """
	    client = _http_client
	    if client is None:
	        raise RuntimeError(
	            "HTTP client is not initialised. "
	            "FastAPI lifespan may not have started correctly."
	        )

	    url, headers, read_timeout_s = _resolve_url(body)
	    req_timeout = _make_timeout(read_timeout_s)

	    # Detect streaming intent before opening the upstream connection.
	    stream_requested: bool = False
	    try:
	        payload: Any = json.loads(body)
	        stream_requested = bool(payload.get("stream", False))
	    except (json.JSONDecodeError, ValueError, AttributeError, TypeError):
	        # Deliberate best-effort: an unparseable or non-object body is
	        # forwarded as a plain request and left for the upstream to reject.
	        pass

	    if stream_requested:

	        async def _sse_chunks() -> AsyncGenerator[bytes, None]:
	            """Async generator proxying upstream SSE frames to the browser."""
	            try:
	                async with client.stream(
	                    "POST", url, content=body, headers=headers, timeout=req_timeout
	                ) as upstream:
	                    if upstream.status_code != 200:  # noqa: PLR2004
	                        err_body = await upstream.aread()
	                        yield _sse_error_frame(
	                            upstream.status_code,
	                            err_body.decode(errors="replace")[:500],
	                        )
	                    else:
	                        async for chunk in upstream.aiter_bytes():
	                            yield chunk

	            except httpx.ReadTimeout:
	                err_id = uuid.uuid4().hex
	                logger.warning(
	                    "ReadTimeout after %.0f s on streaming request to %s [%s]",
	                    read_timeout_s,
	                    url,
	                    err_id,
	                )
	                yield _sse_error_frame(
	                    504,
	                    f"Upstream timed out after {read_timeout_s:.0f} s. "
	                    "CPU inference can take 4-5 minutes. "
	                    "If using the ai-model Space, the model may still be loading.",
	                    err_id,
	                )

	            except httpx.ConnectTimeout:
	                err_id = uuid.uuid4().hex
	                logger.warning(
	                    "ConnectTimeout on streaming request to %s [%s]", url, err_id
	                )
	                yield _sse_error_frame(
	                    504,
	                    f"Connection timed out reaching {url}. "
	                    "The HF Space may be starting up.",
	                    err_id,
	                )

	            except httpx.RequestError as exc:
	                err_id = uuid.uuid4().hex
	                logger.warning(
	                    "RequestError on streaming request to %s: %s [%s]",
	                    url,
	                    exc,
	                    err_id,
	                )
	                yield _sse_error_frame(
	                    502,
	                    f"Failed to reach upstream: {type(exc).__name__}",
	                    err_id,
	                )

	        return StreamingResponse(
	            _sse_chunks(),
	            status_code=200,
	            media_type="text/event-stream",
	            headers={
	                "Cache-Control": "no-cache",
	                "X-Accel-Buffering": "no",
	            },
	        )

	    # Non-streaming path: await the full upstream response.
	    try:
	        upstream = await client.post(
	            url, content=body, headers=headers, timeout=req_timeout
	        )
	    except httpx.ReadTimeout:
	        logger.warning(
	            "ReadTimeout after %.0f s on non-streaming request to %s",
	            read_timeout_s,
	            url,
	        )
	        return _upstream_error_response(
	            504,
	            "timeout_error",
	            f"Upstream timed out after {read_timeout_s:.0f} s. "
	            "CPU inference on the ai-model Space can take 4-5 minutes. "
	            "The model may still be loading — retry in a few minutes.",
	        )
	    except httpx.ConnectTimeout:
	        logger.warning("ConnectTimeout on non-streaming request to %s", url)
	        return _upstream_error_response(
	            504,
	            "timeout_error",
	            f"Connection timed out reaching {url}. "
	            "The HF Space may be cold-starting — retry in 30 seconds.",
	        )
	    except httpx.RequestError as exc:
	        logger.warning("RequestError on non-streaming request to %s: %s", url, exc)
	        return _upstream_error_response(
	            502,
	            "upstream_error",
	            f"Failed to reach upstream: {type(exc).__name__}",
	        )

	    # Relay the upstream answer unchanged, including non-2xx statuses.
	    return Response(
	        content=upstream.content,
	        status_code=upstream.status_code,
	        media_type=upstream.headers.get("content-type", "application/json"),
	    )


	# ─────────────────────────────────────────────────────────────────────────────
	# Routes
	# ─────────────────────────────────────────────────────────────────────────────


	@app.get("/")
	async def root() -> JSONResponse:
	    """
	    Status document served on ``GET /``.

	    Returns
	    -------
	    JSONResponse
	        HTTP 200 with the service name/version, the active routing
	        configuration, the configured timeouts, the CORS origins and a
	        list of the available endpoints.

	    Notes
	    -----
	    User note — The ``timeouts`` field lists values in seconds.
	    ``path1_s`` / ``path2_s`` / ``path3_s`` are the per-path read timeouts;
	    ``connect_s`` and ``write_s`` are shared by every path. The token
	    itself is never exposed — only whether one is set.
	    """
	    routing_info = {
	        "path_1_backend_url": BACKEND_URL or None,
	        "path_2_model_space_url": HF_SPACES_MODEL_URL or None,
	        "path_2_namespaces": list(HF_SPACES_MODEL_NAMESPACES),
	        "path_3_hf_api_base": HF_BASE,
	        "path_3_hf_token_set": bool(HF_TOKEN),
	    }
	    timeout_info = {
	        "path1_s": _proxy_timeout_secs,
	        "path2_s": _path2_timeout_secs,
	        "path3_s": _path3_timeout_secs,
	        "connect_s": _connect_timeout_secs,
	        "write_s": _write_timeout_secs,
	    }
	    endpoint_info = {
	        "chat": "POST /v1/chat/completions (primary)",
	        "share": "POST /v1/share (conversation share)",
	        "share_get": "GET /v1/share/{uuid} (retrieve shared snapshot)",
	        "feedback": "POST /v1/feedback (rating persistence)",
	        "training": "POST /v1/contribute (GDPR-gated training data)",
	        "alias": "POST / (path-agnostic alias)",
	        "health": "GET /health (liveness probe)",
	    }
	    status_doc = {
	        "status": "ok",
	        "service": f"sphinx-ai-assistant proxy v{PROXY_VERSION}",
	        "routing": routing_info,
	        "timeouts": timeout_info,
	        "cors_origins": _allowed_origins,
	        "endpoints": endpoint_info,
	    }
	    return JSONResponse(status_doc)


	@app.get("/health")
	async def health() -> JSONResponse:
	    """
	    Liveness probe: report that the process is up, plus the proxy version.

	    Returns
	    -------
	    JSONResponse
	        HTTP 200 with ``status`` and ``version``; no upstream is contacted.
	    """
	    liveness = {"status": "ok", "version": PROXY_VERSION}
	    return JSONResponse(liveness)


	@app.post("/v1/chat/completions")
	async def chat_completions(body: bytes = Depends(_validated_body)) -> Response:
	    """
	    OpenAI-compatible ``/v1/chat/completions`` endpoint (primary route).

	    Parameters
	    ----------
	    body : bytes
	        Raw request body, already size-checked by :func:`_validated_body`.

	    Returns
	    -------
	    fastapi.Response
	        Whatever :func:`_forward` produces for this body; SSE streaming is
	        kept when the body asks for ``"stream": true``.

	    Notes
	    -----
	    User note — Set ``endpoint`` in ``conf.py`` to::

	        "https://scikit-plots-ai.hf.space/v1/chat/completions"

	    User note — Model routing:

	    * ``scikit-plots/Qwen2.5-Coder-7B-Instruct`` → ai-model Space (Path 2,
	      CPU inference, up to 5 minutes per response).
	    * ``openai/gpt-oss-20b``, ``Qwen/Qwen2.5-Coder-7B-Instruct`` →
	      HF Serverless Inference API (Path 3, GPU, typically 30-90 s).

	    See Also
	    --------
	    chat_completions_alias : ``POST /`` path-agnostic alias.
	    """
	    upstream_response = await _forward(body)
	    return upstream_response


	@app.post("/")
	async def chat_completions_alias(body: bytes = Depends(_validated_body)) -> Response:
	    """
	    ``POST /`` alias that behaves exactly like ``POST /v1/chat/completions``.

	    Parameters
	    ----------
	    body : bytes
	        Raw request body, already size-checked by :func:`_validated_body`.

	    Returns
	    -------
	    fastapi.Response
	        Same result as :func:`chat_completions` for the same body.

	    Notes
	    -----
	    User note — The explicit ``/v1/chat/completions`` path is preferred.
	    The alias exists for ``conf.py`` setups whose ``endpoint`` is the bare
	    Space URL with no path suffix.
	    """
	    upstream_response = await _forward(body)
	    return upstream_response


	@app.post("/v1/contribute")
	async def contribute(request: Request) -> JSONResponse:
	    """Accept a training data contribution from the AI assistant browser widget.

	    Parameters
	    ----------
	    request : fastapi.Request
	        HTTP request. Body must be a JSON object conforming to the
	        contribution schema.

	    Returns
	    -------
	    fastapi.responses.JSONResponse
	        ``{"contributed": true, "rows": N}`` on success, where ``N`` is the
	        number of object records actually written.

	    Raises
	    ------
	    fastapi.HTTPException
	        400 when the body is not valid UTF-8 JSON.
	        413 when the body exceeds ``DEFAULT_MAX_BODY_BYTES``.
	        422 when the body is not a JSON object, consent is absent/false,
	        ``consentVersion`` is not current, ``schemaVersion`` is unsupported,
	        ``records`` is not a list, holds no JSON objects, or exceeds the
	        maximum allowed count.
	        429 when the IP rate limit is exceeded.
	        503 when the endpoint is not configured or the HF Dataset push fails.

	    Notes
	    -----
	    Developer: ``consentFlag`` and ``consentVersion`` are checked before any
	    other field validation. ``consentFlag`` must be the JSON literal
	    ``true``; a merely truthy value such as the string ``"false"`` is
	    rejected. A missing or mismatched consent version is a hard rejection so
	    that old cached pages cannot submit records under outdated consent text.

	    Developer: The in-memory rate-limit store (``_contrib_rl``) is per-process
	    only. Expired windows are pruned on every request so the store cannot
	    grow without bound. The limit is 5 requests per hour per client IP.

	    Developer: The push uses ``HfApi.create_commit``. It is called
	    synchronously inside an async handler and therefore blocks the event
	    loop for the duration of the round-trip to HF; move it to a worker
	    thread if throughput grows.

	    NOTE(review): the size guard uses ``DEFAULT_MAX_BODY_BYTES`` rather than
	    the configurable ``MAX_BODY_BYTES`` — confirm whether that is intended.
	    """
	    import time as _time

	    # Body size guard
	    raw = await request.body()
	    if len(raw) > DEFAULT_MAX_BODY_BYTES:
	        raise HTTPException(status_code=413, detail="Payload too large.")
	    try:
	        payload = json.loads(raw)
	    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
	        raise HTTPException(status_code=400, detail=f"Invalid JSON: {exc}") from exc
	    if not isinstance(payload, dict):
	        # Every check below uses ``payload.get``; a list or scalar body would
	        # otherwise surface as an unhandled AttributeError (HTTP 500).
	        raise HTTPException(
	            status_code=422,
	            detail="Request body must be a JSON object.",
	        )

	    # Consent guard — GDPR Article 7: explicit consent required
	    if payload.get("consentFlag") is not True:
	        raise HTTPException(
	            status_code=422,
	            detail="consentFlag must be true. Contribution requires explicit user consent.",
	        )
	    if payload.get("consentVersion") != TRAINING_CONSENT_VERSION:
	        raise HTTPException(
	            status_code=422,
	            detail=(
	                f"consentVersion {payload.get('consentVersion')!r} is not current. "
	                f"Expected {TRAINING_CONSENT_VERSION!r}. Reload the page and try again."
	            ),
	        )

	    # Schema version guard
	    supported_versions: frozenset[int] = frozenset({1})
	    schema_version = payload.get("schemaVersion")
	    try:
	        version_supported = schema_version in supported_versions
	    except TypeError:
	        # Unhashable JSON value (list/object) — treat as unsupported.
	        version_supported = False
	    if not version_supported:
	        raise HTTPException(
	            status_code=422,
	            detail=(
	                f"Unsupported schemaVersion {schema_version!r}. "
	                f"Supported: {sorted(supported_versions)}"
	            ),
	        )

	    records = payload.get("records", [])
	    if not isinstance(records, list):
	        raise HTTPException(status_code=422, detail="records must be a list.")
	    if len(records) > MAX_CONTRIBUTION_RECORDS:
	        raise HTTPException(
	            status_code=422,
	            detail=f"Too many records. Maximum {MAX_CONTRIBUTION_RECORDS} per request.",
	        )
	    # Only JSON objects can be merged with the metadata fields below.
	    valid_records = [rec for rec in records if isinstance(rec, dict)]
	    if not valid_records:
	        raise HTTPException(
	            status_code=422,
	            detail="records must contain at least one JSON object.",
	        )

	    # Per-IP rate limit: 5 contributions per hour
	    client_ip = _client_ip(request)
	    async with _contrib_rl_lock:
	        now = _time.time()
	        # Drop every expired window (including this client's, which resets
	        # its counter) so the store stays bounded.
	        expired = [
	            ip for ip, (_, started) in _contrib_rl.items() if now - started > 3600
	        ]
	        for ip in expired:
	            del _contrib_rl[ip]
	        count, window_start = _contrib_rl.get(client_ip, (0, now))
	        count += 1
	        _contrib_rl[client_ip] = (count, window_start)
	    if count > 5:
	        logger.warning(json.dumps({"event": "contribute.ratelimit", "ip": client_ip}))
	        raise HTTPException(
	            status_code=429,
	            detail="Rate limit exceeded. Maximum 5 contributions per hour.",
	            headers={"Retry-After": "3600"},
	        )

	    if not TRAINING_DATASET_REPO:
	        raise HTTPException(
	            status_code=503,
	            detail="Training endpoint not configured (TRAINING_DATASET_REPO not set).",
	        )
	    if not HF_TOKEN:
	        raise HTTPException(status_code=503, detail="HF_TOKEN not set.")

	    # Push to HF Dataset
	    from huggingface_hub import CommitOperationAdd, HfApi

	    api = HfApi(token=HF_TOKEN)
	    now_ms = int(_time.time() * 1000)
	    rows_jsonl = "\n".join(
	        json.dumps(
	            {
	                **rec,
	                "_sessionId": payload.get("sessionId", ""),
	                "_page": payload.get("page", ""),
	                "_model": payload.get("model"),
	                "_consentVersion": payload.get("consentVersion"),
	                "_ts": now_ms,
	            },
	            ensure_ascii=False,
	        )
	        for rec in valid_records
	    )
	    # The random suffix keeps two requests landing in the same millisecond
	    # from targeting the same path in the dataset repo.
	    filename = f"contributions/{now_ms}-{uuid.uuid4().hex[:8]}.jsonl"
	    try:
	        api.create_commit(
	            repo_id=TRAINING_DATASET_REPO,
	            repo_type="dataset",
	            operations=[
	                CommitOperationAdd(
	                    path_in_repo=filename,
	                    path_or_fileobj=rows_jsonl.encode(),
	                )
	            ],
	            commit_message=f"Add {len(valid_records)} feedback record(s)",
	        )
	    except Exception as exc:
	        # Boundary handler: any push failure is logged and reported as 503.
	        logger.error(json.dumps({"event": "contribute.hf_fail", "error": str(exc)}))
	        raise HTTPException(
	            status_code=503,
	            detail="Failed to store contribution. Try again later.",
	        ) from exc

	    logger.info(
	        json.dumps({"event": "contribute.write", "rows": len(valid_records), "ip": client_ip})
	    )
	    return JSONResponse({"contributed": True, "rows": len(valid_records)})


	@app.post("/v1/share")
	async def share(request: Request) -> JSONResponse:
	    """Accept a conversation snapshot for global sharing.

	    Parameters
	    ----------
	    request : fastapi.Request
	        HTTP request. Body must be a JSON object with at minimum a
	        non-empty string ``content`` field. Optional fields are
	        ``mimeType``, ``ext`` and ``title`` (strings) and ``ttlDays``
	        (integer number of days).

	    Returns
	    -------
	    fastapi.responses.JSONResponse
	        ``{"uuid": "<hex>", "url": "<share_url>", "expiresAt": "<ISO-8601>"}``
	        on success.

	    Raises
	    ------
	    fastapi.HTTPException
	        400 when the body is not valid JSON (including bodies that are not
	        decodable text or are nested too deeply to parse).
	        413 when the body exceeds :data:`~_shared_logic.DEFAULT_MAX_BODY_BYTES`.
	        422 when the body is not a JSON object, when ``content`` is missing
	        or empty, when ``mimeType``/``ext``/``title`` is not a string, or
	        when ``ttlDays`` is not convertible to an integer.
	        429 when the IP rate limit (10/hour) is exceeded.

	    Notes
	    -----
	    User note — Stored shares are ephemeral: they live only for the lifetime
	    of the process. A Space restart (or scale-to-zero cold start) clears all
	    shares. For persistent storage, set ``TRAINING_DATASET_REPO`` and use the
	    ``/v1/contribute`` endpoint instead.

	    User note — The returned ``url`` points to
	    ``GET /v1/share/{uuid}`` on this proxy. Distribute that link to share the
	    conversation.

	    Developer note — ``ttlDays`` is clamped to ``[1, 365]``; a missing or
	    falsy value defaults to 30. Expired entries are evicted lazily: on every
	    write, entries whose ``expiresAt_ts`` has passed are removed before the
	    new entry is stored, so no background task is needed.

	    Developer note — Rate limit (10/hour per IP) is intentionally more
	    permissive than ``/v1/contribute`` because share payloads are user-facing
	    outputs, not GDPR-sensitive training data.
	    """
	    import time as _time

	    # Body size guard
	    raw = await request.body()
	    if len(raw) > DEFAULT_MAX_BODY_BYTES:
	        raise HTTPException(status_code=413, detail="Payload too large.")

	    try:
	        payload = json.loads(raw)
	    except (ValueError, RecursionError) as exc:
	        # ValueError covers json.JSONDecodeError *and* the UnicodeDecodeError
	        # that json.loads raises for bytes that are not valid UTF-8/16/32;
	        # RecursionError covers pathologically nested documents.
	        raise HTTPException(status_code=400, detail=f"Invalid JSON: {exc}") from exc

	    # Valid JSON is not necessarily an object ("[]", "3", '"x"'); without this
	    # guard the .get() calls below would raise AttributeError (HTTP 500).
	    if not isinstance(payload, dict):
	        raise HTTPException(status_code=422, detail="Share body must be a JSON object.")

	    # Required field: content
	    content = payload.get("content", "")
	    if not content or not isinstance(content, str):
	        raise HTTPException(
	            status_code=422,
	            detail="content is required and must be a non-empty string.",
	        )

	    def _optional_str(key: str, default: str) -> str:
	        """Return ``payload[key]`` (or *default* when absent/falsy) as a str."""
	        value = payload.get(key) or default
	        if not isinstance(value, str):
	            raise HTTPException(status_code=422, detail=f"{key} must be a string.")
	        return value

	    mime_type: str = _optional_str("mimeType", "text/html;charset=utf-8")
	    ext: str = _optional_str("ext", ".html")
	    title: str = _optional_str("title", "Shared conversation")

	    try:
	        ttl_days: int = max(1, min(int(payload.get("ttlDays") or 30), 365))
	    except (TypeError, ValueError, OverflowError) as exc:
	        # int() rejects lists/objects (TypeError), non-numeric strings and NaN
	        # (ValueError) and Infinity (OverflowError).
	        raise HTTPException(
	            status_code=422,
	            detail="ttlDays must be an integer number of days.",
	        ) from exc

	    # Per-IP rate limit: 10 shares per hour
	    client_ip = _client_ip(request)
	    async with _share_rl_lock:
	        now = _time.time()
	        count, window_start = _share_rl.get(client_ip, (0, now))
	        if now - window_start > 3600:
	            count, window_start = 0, now
	        count += 1
	        _share_rl[client_ip] = (count, window_start)
	        if count > 10:
	            logger.warning(json.dumps({"event": "share.ratelimit", "ip": client_ip}))
	            raise HTTPException(
	                status_code=429,
	                detail="Rate limit exceeded. Maximum 10 shares per hour.",
	                headers={"Retry-After": "3600"},
	            )

	    share_id: str = uuid.uuid4().hex
	    now_ts = _time.time()
	    expires_ts = now_ts + ttl_days * 86400
	    expires_iso = _time.strftime("%Y-%m-%dT%H:%M:%SZ", _time.gmtime(expires_ts))

	    # Store in memory; evict expired entries lazily on each write.
	    async with _share_store_lock:
	        expired_keys = [k for k, v in _share_store.items() if v["expiresAt_ts"] < now_ts]
	        for k in expired_keys:
	            del _share_store[k]
	        _share_store[share_id] = {
	            "content": content,
	            "mimeType": mime_type,
	            "ext": ext,
	            "title": title,
	            "expiresAt_ts": expires_ts,
	            "expiresAt": expires_iso,
	        }

	    logger.info(
	        json.dumps(
	            {"event": "share.create", "id": share_id, "ip": client_ip, "ttl_days": ttl_days}
	        )
	    )

	    base_url = str(request.base_url).rstrip("/")
	    share_url = f"{base_url}/v1/share/{share_id}"

	    return JSONResponse({"uuid": share_id, "url": share_url, "expiresAt": expires_iso})


	@app.get("/v1/share/{share_id}")
	async def share_get(share_id: str) -> Response:
	    """Retrieve a shared conversation snapshot by UUID.

	    Parameters
	    ----------
	    share_id : str
	        UUID hex string returned by ``POST /v1/share``.

	    Returns
	    -------
	    fastapi.Response
	        The stored content with its stored MIME type, a
	        ``Content-Disposition: inline`` header and
	        ``X-Content-Type-Options: nosniff``.

	    Raises
	    ------
	    fastapi.HTTPException
	        404 when the UUID is not found.
	        410 when the share exists but has expired.

	    Notes
	    -----
	    User note — Shares are stored in process memory only. A Space
	    restart clears all shares, returning 404 for previously valid links.

	    Developer note — ``ext`` and ``mimeType`` originate from the client
	    that created the share, so they are sanitised here before being placed
	    in response headers: ``ext`` is reduced to ASCII alphanumerics plus
	    ``._-`` (max 16 characters) and a ``mimeType`` that is not a printable
	    ASCII string falls back to ``text/html;charset=utf-8``.
	    """
	    import time as _time

	    # Look up and, if necessary, evict under a single lock acquisition so the
	    # expiry decision and the removal cannot interleave with a writer.
	    async with _share_store_lock:
	        entry = _share_store.get(share_id)
	        expired = entry is not None and entry["expiresAt_ts"] < _time.time()
	        if expired:
	            _share_store.pop(share_id, None)

	    if entry is None:
	        raise HTTPException(status_code=404, detail="Share not found or expired.")

	    if expired:
	        raise HTTPException(status_code=410, detail="Share has expired.")

	    # Never interpolate client-supplied text into a header verbatim: quotes or
	    # CR/LF in ``ext`` would corrupt (or inject into) Content-Disposition.
	    raw_ext = entry.get("ext", ".html")
	    safe_ext = ""
	    if isinstance(raw_ext, str):
	        safe_ext = "".join(
	            ch for ch in raw_ext if ch.isascii() and (ch.isalnum() or ch in "._-")
	        )[:16]
	    if not safe_ext:
	        safe_ext = ".html"

	    # Same concern for Content-Type: accept only printable ASCII strings.
	    media_type = entry.get("mimeType", "text/html;charset=utf-8")
	    if not (
	        isinstance(media_type, str)
	        and media_type
	        and media_type.isascii()
	        and media_type.isprintable()
	    ):
	        media_type = "text/html;charset=utf-8"

	    # NOTE(review): the stored content is client-supplied and is served inline
	    # from this proxy's origin (HTML by default) — confirm that is acceptable
	    # or add a sandboxing Content-Security-Policy.
	    return Response(
	        content=entry["content"],
	        status_code=200,
	        media_type=media_type,
	        headers={
	            "Content-Disposition": f'inline; filename="share{safe_ext}"',
	            "X-Content-Type-Options": "nosniff",
	        },
	    )


	@app.post("/v1/feedback")
	async def feedback(request: Request) -> JSONResponse:
	    """Accept a thumbs-up/down feedback record from the AI assistant widget.

	    Parameters
	    ----------
	    request : fastapi.Request
	        HTTP request. Body must be a JSON object; all fields are optional.
	        Typical payload fields from the JS widget:

	        * ``ratingValue`` — ``"thumbs-up"`` or ``"thumbs-down"``
	        * ``ratingLabel`` — human-readable label (e.g. ``"Helpful"``)
	        * ``message`` — optional free-text comment from the user
	        * ``query`` — the question that was asked
	        * ``answer`` — the response that was rated
	        * ``model`` — model ID that produced the answer
	        * ``sessionId`` — browser session UUID
	        * ``page`` — originating documentation page URL
	        * ``ts`` — client-side timestamp (ms since epoch)

	    Returns
	    -------
	    fastapi.responses.JSONResponse
	        ``{"ok": true}`` on success.

	    Raises
	    ------
	    fastapi.HTTPException
	        400 when the body is not valid JSON (including bodies that are not
	        decodable text or are nested too deeply to parse).
	        413 when the body exceeds :data:`~_shared_logic.DEFAULT_MAX_BODY_BYTES`.
	        422 when the body is valid JSON but not a JSON object.
	        429 when the IP rate limit (30/hour) is exceeded.

	    Notes
	    -----
	    User note — Feedback is logged to the Space log stream but is NOT
	    persisted to a dataset. To persist feedback, enable the ``/v1/contribute``
	    endpoint by setting ``TRAINING_DATASET_REPO``.

	    Developer note — The JS widget sends feedback with ``keepalive: true``
	    so the request survives page unload. The handler therefore needs no
	    response body beyond ``{"ok": true}``; the widget does not read it.

	    Developer note — Rate limit (30/hour per IP) is set high because
	    users commonly rate multiple answers in a session. The limit prevents
	    programmatic flooding while allowing normal interactive use. The limit
	    is applied before the object-shape check, so syntactically valid but
	    non-object bodies also count against it.

	    Developer note — Only ``ratingValue``, ``model``, ``sessionId``,
	    ``page`` and ``ts`` are written to the log; other payload fields are
	    accepted but not logged.
	    """
	    import time as _time

	    # Body size guard
	    raw = await request.body()
	    if len(raw) > DEFAULT_MAX_BODY_BYTES:
	        raise HTTPException(status_code=413, detail="Payload too large.")

	    try:
	        payload = json.loads(raw)
	    except (ValueError, RecursionError) as exc:
	        # ValueError covers json.JSONDecodeError *and* the UnicodeDecodeError
	        # that json.loads raises for bytes that are not valid UTF-8/16/32;
	        # RecursionError covers pathologically nested documents. Without
	        # this, such bodies surfaced as HTTP 500 instead of 400.
	        raise HTTPException(status_code=400, detail=f"Invalid JSON: {exc}") from exc

	    # Per-IP rate limit: 30 per hour
	    client_ip = _client_ip(request)
	    async with _feedback_rl_lock:
	        now = _time.time()
	        count, window_start = _feedback_rl.get(client_ip, (0, now))
	        if now - window_start > 3600:
	            count, window_start = 0, now
	        count += 1
	        _feedback_rl[client_ip] = (count, window_start)
	        if count > 30:
	            logger.warning(json.dumps({"event": "feedback.ratelimit", "ip": client_ip}))
	            raise HTTPException(
	                status_code=429,
	                detail="Rate limit exceeded. Maximum 30 feedback submissions per hour.",
	                headers={"Retry-After": "3600"},
	            )

	    if not isinstance(payload, dict):
	        raise HTTPException(status_code=422, detail="Feedback body must be a JSON object.")

	    logger.info(
	        json.dumps(
	            {
	                "event": "feedback.receive",
	                "ip": client_ip,
	                "ratingValue": payload.get("ratingValue"),
	                "model": payload.get("model"),
	                "sessionId": payload.get("sessionId"),
	                "page": payload.get("page"),
	                "ts": payload.get("ts"),
	            }
	        )
	    )

	    return JSONResponse({"ok": True})