Buckets:

Daankular
/

models

8 days ago

1.38 kB

	from __future__ import annotations

	from typing import Any

	from shared.prompt_enhancer.qwen35_vl import _prepare_multimodal_vllm_prompt


	# Default system instruction for image question answering; used by
	# build_image_question_prompt when the caller passes no system prompt.
	VISION_QA_SYSTEM_PROMPT = (
	    "Answer the user's question about the provided image accurately and concisely. "
	    "If the answer is uncertain, say so."
	)


	def build_image_question_prompt(caption_model: Any, processor: Any, image: Any, question: str, system_prompt: str \| None = None):
	question = str(question or "").strip()
	if len(question) == 0:
	raise ValueError("Vision question is empty.")
	messages = []
	system_prompt = str(system_prompt or VISION_QA_SYSTEM_PROMPT).strip()
	if len(system_prompt) > 0:
	messages.append({"role": "system", "content": system_prompt})
	messages.append(
	{
	"role": "user",
	"content": [
	{"type": "image", "image": image},
	{"type": "text", "text": question},
	],
	}
	)
	text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=False)
	model_inputs = processor(
	text=[text],
	images=[image],
	return_tensors="pt",
	padding=True,
	return_mm_token_type_ids=True,
	)
	return _prepare_multimodal_vllm_prompt(caption_model, model_inputs)


	# Public names exported by this module.
	__all__ = [
	    "VISION_QA_SYSTEM_PROMPT",
	    "build_image_question_prompt",
	]

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.