Daankular's picture
download
raw
1.38 kB
from __future__ import annotations
from typing import Any
from shared.prompt_enhancer.qwen35_vl import _prepare_multimodal_vllm_prompt
# Default system instruction for image question answering; used as the
# fallback when no (truthy) system prompt is passed by the caller.
VISION_QA_SYSTEM_PROMPT = (
    "Answer the user's question about the provided image accurately and concisely. "
    "If the answer is uncertain, say so."
)
def build_image_question_prompt(
    caption_model: Any,
    processor: Any,
    image: Any,
    question: str,
    system_prompt: str | None = None,
):
    """Build a multimodal prompt that asks ``question`` about ``image``.

    A chat conversation (optional system turn plus one user turn holding the
    image and the question text) is rendered through the processor's chat
    template, run through the processor together with the image, and the
    processor output is handed to ``_prepare_multimodal_vllm_prompt``.

    Args:
        caption_model: Forwarded unchanged to ``_prepare_multimodal_vllm_prompt``.
        processor: Object exposing ``apply_chat_template`` and callable with
            ``text=``/``images=`` keyword arguments.
        image: Image placed in the user turn and passed to the processor.
        question: Question text; coerced to ``str`` and stripped.
        system_prompt: Optional system instruction. A falsy value falls back
            to ``VISION_QA_SYSTEM_PROMPT``. If the chosen prompt is empty
            after stripping, no system turn is emitted.

    Returns:
        Whatever ``_prepare_multimodal_vllm_prompt`` returns for
        ``caption_model`` and the processor output.

    Raises:
        ValueError: If ``question`` is empty after coercion and stripping.
    """
    question_text = str(question or "").strip()
    if not question_text:
        raise ValueError("Vision question is empty.")

    system_text = str(system_prompt or VISION_QA_SYSTEM_PROMPT).strip()

    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": question_text},
        ],
    }
    # The system turn is only included when there is actual text for it.
    conversation = (
        [{"role": "system", "content": system_text}, user_turn]
        if system_text
        else [user_turn]
    )

    chat_text = processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )
    processor_output = processor(
        text=[chat_text],
        images=[image],
        return_tensors="pt",
        padding=True,
        return_mm_token_type_ids=True,
    )
    return _prepare_multimodal_vllm_prompt(caption_model, processor_output)
# Names exported by ``from <module> import *``.
__all__ = [
    "VISION_QA_SYSTEM_PROMPT",
    "build_image_question_prompt",
]

Xet Storage Details

Size:
1.38 kB
·
Xet hash:
dd0846608157b538b80632f31ef53bcac18451f76b7cfc1c64a4af3592c13497

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.