ndc8 committed
Commit · 68f41f4 · 1 Parent(s): 83df634
upd
Browse files: backend_service.py (+59, -82)

backend_service.py CHANGED
@@ -7,6 +7,8 @@ Provides OpenAI-compatible chat completion endpoints
 import os
 os.environ.setdefault("HF_HOME", "/tmp/.cache/huggingface")
 os.environ.setdefault("TRANSFORMERS_CACHE", "/tmp/.cache/huggingface")
+# Define Hugging Face auth token from environment
+hf_token = os.environ.get("HF_TOKEN")
 import asyncio
 import logging
 import time
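The new HF_TOKEN lookup happens at import time, so the variable must already be set in the process environment when the service starts. A minimal sketch of a startup check along those lines (hypothetical, not part of this commit):

import os

hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    # Without a token, only public Hugging Face repos can be downloaded.
    print("HF_TOKEN is not set; gated or private models will be unavailable.")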
@@ -191,7 +193,16 @@ async def lifespan(app: FastAPI):
     # Initialize tokenizer for better text handling
     if transformers_available and AutoTokenizer:
         try:
-            tokenizer = AutoTokenizer.from_pretrained(current_model)  # type: ignore
+            # Load tokenizer, using auth token if provided
+            if hf_token:
+                tokenizer = AutoTokenizer.from_pretrained(
+                    current_model,
+                    use_auth_token=hf_token
+                )  # type: ignore
+            else:
+                tokenizer = AutoTokenizer.from_pretrained(
+                    current_model
+                )  # type: ignore
             logger.info("✅ Tokenizer loaded successfully")
         except Exception as e:
             logger.warning(f"⚠️ Could not load tokenizer: {e}")
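Since AutoTokenizer.from_pretrained accepts a token argument that may be None, the two branches above could in principle collapse into one call; recent transformers releases also prefer token= over the older use_auth_token=. A sketch of that equivalent form, assuming a recent transformers version and a placeholder model id (not what the commit itself does):

import os
from transformers import AutoTokenizer

current_model = "gpt2"  # hypothetical model id; the service defines current_model elsewhere
hf_token = os.environ.get("HF_TOKEN")

# token=None behaves like an unauthenticated download, so a single call covers both cases.
tokenizer = AutoTokenizer.from_pretrained(current_model, token=hf_token)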
@@ -469,33 +480,49 @@ async def list_models():
 
     return ModelsResponse(data=models)
 
-@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
+
+        # Clean up the response
+        response_text = response_text.strip() if response_text else "No response generated."
+
+        # Create OpenAI-compatible response
+        response = ChatCompletionResponse(
+            id=f"chatcmpl-{int(time.time())}",
+            created=int(time.time()),
+            model=request.model,
+            choices=[
+                ChatCompletionChoice(
+                    index=0,
+                    message=ChatMessage(role="assistant", content=response_text),
+                    finish_reason="stop"
+                )
+            ]
+        )
+
+        return response
+
+    except Exception as e:
+        logger.error(f"Error in chat completion: {e}")
+        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
+
+@app.post("/api/response")
+@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
 async def create_chat_completion(
     request: ChatCompletionRequest,
     client: InferenceClient = Depends(get_inference_client)
-):
-    """Create a chat completion (OpenAI-compatible) with multimodal support"""
+) -> ChatCompletionResponse:
+    """Create a chat completion (OpenAI-compatible) with multimodal support."""
     try:
-        # Validate request
         if not request.messages:
             raise HTTPException(status_code=400, detail="Messages cannot be empty")
-
-        # Check if this is a multimodal request (contains images)
         is_multimodal = has_images(request.messages)
-
         if is_multimodal:
-            # Handle multimodal request with image-text pipeline
             if not image_text_pipeline:
                 raise HTTPException(status_code=503, detail="Image processing not available")
-
             response_text = await generate_multimodal_response(request.messages, request)
         else:
-            # Handle text-only request with existing logic
             prompt = convert_messages_to_prompt(request.messages)
             logger.info(f"Generated prompt: {prompt[:200]}...")
-
             if request.stream:
-                # Return streaming response
                 return StreamingResponse(
                     generate_streaming_response(client, prompt, request),
                     media_type="text/plain",
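Because the endpoint keeps the OpenAI-compatible /v1/chat/completions contract and the response model built above, a client can call it with a standard chat payload. A minimal request sketch, assuming the service is reachable at localhost:8000 (host and port are not part of this diff):

import requests

payload = {
    "model": "any-model-id",  # echoed back in the response
    "messages": [{"role": "user", "content": "Hello, who are you?"}],
    "stream": False,
}
resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=60)
print(resp.json()["choices"][0]["message"]["content"])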
@@ -504,37 +531,26 @@ async def create_chat_completion(
                         "Connection": "keep-alive",
                         "Content-Type": "text/plain; charset=utf-8"
                     }
-                )
-
-            # Generate response
-            response_text = await asyncio.to_thread(
-                generate_response_safe,
-                client,
-                prompt,
-                request.max_tokens or 512,
-                request.temperature or 0.7,
-                request.top_p or 0.95
-            )
-
-        # Clean up the response
+                )  # type: ignore
+            response_text = await asyncio.to_thread(
+                generate_response_safe,
+                client,
+                prompt,
+                request.max_tokens or 512,
+                request.temperature or 0.7,
+                request.top_p or 0.95
+            )
         response_text = response_text.strip() if response_text else "No response generated."
-
-        # Create OpenAI-compatible response
-        response = ChatCompletionResponse(
+        return ChatCompletionResponse(
             id=f"chatcmpl-{int(time.time())}",
             created=int(time.time()),
             model=request.model,
-            choices=[
-                ChatCompletionChoice(
-                    index=0,
-                    message=ChatMessage(role="assistant", content=response_text),
-                    finish_reason="stop"
-                )
-            ]
+            choices=[ChatCompletionChoice(
+                index=0,
+                message=ChatMessage(role="assistant", content=response_text),
+                finish_reason="stop"
+            )]
         )
-
-        return response
-
     except Exception as e:
         logger.error(f"Error in chat completion: {e}")
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
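The non-streaming path now wraps the blocking generate_response_safe call in asyncio.to_thread so the FastAPI event loop is not blocked during generation. The pattern in isolation, with a stand-in function instead of the service's real generator:

import asyncio
import time

def blocking_generate(prompt: str) -> str:
    # Stand-in for a synchronous inference call such as generate_response_safe.
    time.sleep(1)
    return f"echo: {prompt}"

async def handler() -> str:
    # The blocking call runs in a worker thread; the event loop keeps serving other requests.
    return await asyncio.to_thread(blocking_generate, "hi")

print(asyncio.run(handler()))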
@@ -548,17 +564,14 @@ async def create_completion(
     try:
         if not request.prompt:
             raise HTTPException(status_code=400, detail="Prompt cannot be empty")
-
-        # Generate response
         response_text = await asyncio.to_thread(
             generate_response_safe,
             client,
             request.prompt,
             request.max_tokens or 512,
             request.temperature or 0.7,
-            0.95
+            0.95
         )
-
         return {
             "id": f"cmpl-{int(time.time())}",
             "object": "text_completion",
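create_completion keeps a hard-coded top_p of 0.95 instead of reading it from the request. A client-side sketch of the legacy completion call this handler serves, assuming the conventional /v1/completions route (the decorator itself is outside this hunk) and a local service:

import requests

payload = {"model": "any-model-id", "prompt": "Once upon a time", "max_tokens": 64}
resp = requests.post("http://localhost:8000/v1/completions", json=payload, timeout=60)
print(resp.json()["choices"][0])  # a text_completion-style choice with finish_reason "stop"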
@@ -570,57 +583,21 @@ async def create_completion(
                 "finish_reason": "stop"
             }]
         }
-
     except Exception as e:
         logger.error(f"Error in completion: {e}")
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
 
 @app.post("/api/response")
-async def api_response(request: Request):
+async def api_response(request: Request) -> JSONResponse:
     """Endpoint to receive and send responses via API."""
     try:
         data = await request.json()
         message = data.get("message", "No message provided")
-        response = {
+        return JSONResponse(content={
             "status": "success",
             "received_message": message,
             "response_message": f"You sent: {message}"
-        }
-        return JSONResponse(content=response)
+        })
     except Exception as e:
         logger.error(f"Error processing API response: {e}")
         raise HTTPException(status_code=500, detail="Internal server error")
-
-@app.exception_handler(Exception)
-async def global_exception_handler(request: Any, exc: Exception) -> JSONResponse:
-    """Global exception handler"""
-    logger.error(f"Unhandled exception: {exc}")
-    return JSONResponse(
-        status_code=500,
-        content={"detail": f"Internal server error: {str(exc)}"}
-    )
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser(description="AI Backend Service")
-    parser.add_argument("--host", default="0.0.0.0", help="Host to bind to")
-    parser.add_argument("--port", type=int, default=8000, help="Port to bind to")
-    parser.add_argument("--model", default=current_model, help="HuggingFace model to use")
-    parser.add_argument("--reload", action="store_true", help="Enable auto-reload for development")
-
-    args = parser.parse_args()
-
-    if args.model != current_model:
-        current_model = args.model
-        logger.info(f"Using model: {current_model}")
-
-    logger.info(f"🚀 Starting AI Backend Service on {args.host}:{args.port}")
-
-    uvicorn.run(
-        "backend_service:app",
-        host=args.host,
-        port=args.port,
-        reload=args.reload,
-        log_level="info"
-    )
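The reworked /api/response handler now builds the JSONResponse inline from the incoming message field. A round-trip sketch with requests, again assuming the service runs on localhost:8000:

import requests

resp = requests.post("http://localhost:8000/api/response", json={"message": "ping"}, timeout=10)
print(resp.json())
# Expected: {"status": "success", "received_message": "ping", "response_message": "You sent: ping"}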