ndc8 committed
Commit 8d9c495 · 1 Parent(s): 3239c69
upd

backend_service.py CHANGED (+52 -93)
@@ -15,7 +15,6 @@ warnings.filterwarnings("ignore", message=".*rope_scaling.*")
 os.environ.setdefault("HF_HOME", "/tmp/.cache/huggingface")
 # Suppress advisory warnings from transformers (including deprecation warnings)
 os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
-# Define Hugging Face auth token from environment
 hf_token = os.environ.get("HF_TOKEN")
 import asyncio
 import logging
@@ -28,10 +27,11 @@ from fastapi import FastAPI, HTTPException, Depends, Request
 from fastapi.responses import StreamingResponse, JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field, field_validator
-
+
 import uvicorn
 import requests
 from PIL import Image
+from transformers import AutoTokenizer, AutoModelForCausalLM

 # Transformers imports (now required)
 try:
@@ -128,12 +128,13 @@ class CompletionRequest(BaseModel):
     max_tokens: Optional[int] = Field(default=512, ge=1, le=2048)
     temperature: Optional[float] = Field(default=0.7, ge=0.0, le=2.0)

+
 # Global variables for model management
-inference_client: Optional[InferenceClient] = None
-image_text_pipeline = None  # type: ignore
 current_model = "unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF"
 vision_model = "Salesforce/blip-image-captioning-base"  # Working model for image captioning
 tokenizer = None
+model = None
+image_text_pipeline = None  # type: ignore

 # Image processing utilities
 async def download_image(url: str) -> Image.Image:
@@ -173,23 +174,22 @@ def has_images(messages: List[ChatMessage]) -> bool:
                 return True
     return False

+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """Application lifespan manager for startup and shutdown events"""
-    global
-
-    # Startup
+    global tokenizer, model, image_text_pipeline
     logger.info("🚀 Starting AI Backend Service...")
     try:
-        #
-
-
-
-        #
+        # Load local tokenizer and model
+        tokenizer = AutoTokenizer.from_pretrained(current_model)
+        model = AutoModelForCausalLM.from_pretrained(current_model)
+        logger.info(f"✅ Loaded local model and tokenizer: {current_model}")
+        # Optionally, load image pipeline as before
         if transformers_available and pipeline:
             try:
                 logger.info(f"🖼️ Initializing image captioning pipeline with model: {vision_model}")
-                image_text_pipeline = pipeline("image-to-text", model=vision_model)
+                image_text_pipeline = pipeline("image-to-text", model=vision_model)
                 logger.info("✅ Image captioning pipeline loaded successfully")
             except Exception as e:
                 logger.warning(f"⚠️ Could not load image captioning pipeline: {e}")
@@ -197,37 +197,13 @@ async def lifespan(app: FastAPI):
         else:
             logger.warning("⚠️ Transformers not available, image processing disabled")
             image_text_pipeline = None
-
-        # Initialize tokenizer for better text handling
-        if transformers_available and AutoTokenizer:
-            try:
-                # Load tokenizer, using auth token if provided
-                if hf_token:
-                    tokenizer = AutoTokenizer.from_pretrained(
-                        current_model,
-                        token=hf_token
-                    )  # type: ignore
-                else:
-                    tokenizer = AutoTokenizer.from_pretrained(
-                        current_model
-                    )  # type: ignore
-                logger.info("✅ Tokenizer loaded successfully")
-            except Exception as e:
-                logger.warning(f"⚠️ Could not load tokenizer: {e}")
-                tokenizer = None
-        else:
-            logger.info("⚠️ Tokenizer initialization skipped")
-
     except Exception as e:
-        logger.error(f"❌ Failed to initialize
+        logger.error(f"❌ Failed to initialize local model: {e}")
         raise RuntimeError(f"Service initialization failed: {e}")
-
     yield
-
-    # Shutdown
     logger.info("🔄 Shutting down AI Backend Service...")
-    inference_client = None
     tokenizer = None
+    model = None
     image_text_pipeline = None

 # Initialize FastAPI app
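With this change the lifespan handler loads the tokenizer and causal-LM weights eagerly at startup and clears them on shutdown. Below is a minimal standalone sketch of that load path, using the same calls as the new startup code; it assumes the current_model checkpoint is loadable with AutoModelForCausalLM and fits in memory, which this diff does not itself demonstrate.

# Quick check, outside FastAPI, that the startup load in lifespan() can succeed
# (assumption: the checkpoint loads with AutoModelForCausalLM and fits in memory).
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF"  # current_model in this commit
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
print(type(tokenizer).__name__, type(model).__name__, model.device)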
@@ -247,11 +223,10 @@ app.add_middleware(
     allow_headers=["*"],
 )

-
-
-    if
-        raise HTTPException(status_code=503, detail="Service not ready -
-    return inference_client
+
+def ensure_model_ready():
+    if tokenizer is None or model is None:
+        raise HTTPException(status_code=503, detail="Service not ready - model not initialized")

 def convert_messages_to_prompt(messages: List[ChatMessage]) -> str:
     """Convert OpenAI messages format to a single prompt string"""
@@ -341,36 +316,30 @@ async def generate_multimodal_response(
         logger.error(f"Error in multimodal generation: {e}")
         return f"I'm having trouble processing the image. Error: {str(e)}"

-
-
+
+def generate_response_local(messages: List[ChatMessage], max_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.95) -> str:
+    """Generate response using local model and tokenizer with chat template."""
+    ensure_model_ready()
     try:
-        #
-
-
-
-
-
-
-
+        # Convert messages to OpenAI format for chat template
+        chat_messages = []
+        for m in messages:
+            chat_messages.append({"role": m.role, "content": m.content if isinstance(m.content, str) else extract_text_and_images(m.content)[0]})
+        inputs = tokenizer.apply_chat_template(
+            chat_messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
         )
-
-
+        inputs = inputs.to(model.device)
+        outputs = model.generate(**inputs, max_new_tokens=max_tokens, do_sample=True, temperature=temperature, top_p=top_p)
+        # Only decode the newly generated tokens
+        generated = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
+        return generated.strip()
     except Exception as e:
-        logger.
-
-        # Method 2: Try with minimal parameters
-        try:
-            response_text = client.text_generation(
-                prompt=prompt,
-                max_new_tokens=max_tokens,
-                temperature=temperature,
-                return_full_text=False
-            )
-            return response_text.strip() if response_text else "I apologize, but I couldn't generate a response."
-
-        except Exception as e2:
-            logger.error(f"All generation methods failed: {e2}")
-            return "I apologize, but I'm having trouble generating a response right now. Please try again."
+        logger.error(f"Local generation failed: {e}")
+        return "I apologize, but I'm having trouble generating a response right now. Please try again."

 async def generate_streaming_response(
     client: InferenceClient,
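The new generate_response_local helper formats the conversation with the tokenizer's chat template, generates synchronously on the local model, and decodes only the newly generated tokens. A sketch of calling it directly, e.g. from a REPL or test, follows; it assumes the module is importable as backend_service and that the lifespan startup has already populated tokenizer and model (otherwise ensure_model_ready() raises a 503 HTTPException).

# Hypothetical direct call to the new helper, outside the FastAPI request path.
# Assumes tokenizer/model were loaded as in lifespan(); if not,
# ensure_model_ready() raises HTTPException(503).
from backend_service import ChatMessage, generate_response_local

reply = generate_response_local(
    [ChatMessage(role="user", content="Summarize what this service does.")],
    max_tokens=128,
    temperature=0.7,
    top_p=0.95,
)
print(reply)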
@@ -491,10 +460,10 @@ async def list_models():

 # ...existing code...

+
 @app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
 async def create_chat_completion(
-    request: ChatCompletionRequest,
-    client: InferenceClient = Depends(get_inference_client)
+    request: ChatCompletionRequest
 ) -> ChatCompletionResponse:
     """Create a chat completion (OpenAI-compatible) with multimodal support."""
     try:
@@ -506,22 +475,10 @@ async def create_chat_completion(
                 raise HTTPException(status_code=503, detail="Image processing not available")
             response_text = await generate_multimodal_response(request.messages, request)
         else:
-
-            logger.info(f"Generated prompt: {prompt[:200]}...")
-            if request.stream:
-                return StreamingResponse(
-                    generate_streaming_response(client, prompt, request),
-                    media_type="text/plain",
-                    headers={
-                        "Cache-Control": "no-cache",
-                        "Connection": "keep-alive",
-                        "Content-Type": "text/plain; charset=utf-8"
-                    }
-                )  # type: ignore
+            logger.info(f"Generating local response for messages: {request.messages}")
             response_text = await asyncio.to_thread(
-
-
-                prompt,
+                generate_response_local,
+                request.messages,
                 request.max_tokens or 512,
                 request.temperature or 0.7,
                 request.top_p or 0.95
@@ -542,19 +499,21 @@ async def create_chat_completion(
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")


+
 @app.post("/v1/completions")
 async def create_completion(
-    request: CompletionRequest,
-    client: InferenceClient = Depends(get_inference_client)
+    request: CompletionRequest
 ) -> Dict[str, Any]:
     """Create a text completion (OpenAI-compatible)"""
     try:
         if not request.prompt:
             raise HTTPException(status_code=400, detail="Prompt cannot be empty")
+        ensure_model_ready()
+        # Use the prompt as a single user message
+        messages = [ChatMessage(role="user", content=request.prompt)]
         response_text = await asyncio.to_thread(
-
-
-            request.prompt,
+            generate_response_local,
+            messages,
             request.max_tokens or 512,
             request.temperature or 0.7,
             0.95
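With the InferenceClient dependency removed from both endpoints, requests now carry only the OpenAI-style JSON payload. A sketch of exercising the two routes with the requests library follows; the base URL/port and the exact set of required fields are assumptions, since the server launch code and the full request models are not part of this diff.

# Example client calls (assumption: the service listens on http://localhost:8000;
# field names mirror those referenced in the diff, other required fields may exist).
import requests

chat = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 128,
        "temperature": 0.7,
    },
)
print(chat.json())

completion = requests.post(
    "http://localhost:8000/v1/completions",
    json={"prompt": "Write a haiku about FastAPI.", "max_tokens": 64},
)
print(completion.json())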