Instructions to use amete7/qvla with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use amete7/qvla with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="amete7/qvla", trust_remote_code=True)
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)

# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("amete7/qvla", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use amete7/qvla with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "amete7/qvla"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "amete7/qvla",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker

docker model run hf.co/amete7/qvla

SGLang

How to use amete7/qvla with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "amete7/qvla" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "amete7/qvla",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "amete7/qvla" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "amete7/qvla",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Docker Model Runner
How to use amete7/qvla with Docker Model Runner:
```
docker model run hf.co/amete7/qvla
```

qvla / preprocessing_molmo.py

Atharva Mete

vla added but giving nans in loss

57b4d23 over 1 year ago

raw

history blame contribute delete

7.29 kB

	"""
	Processor class for Molmo.
	"""

	from typing import Optional

	import PIL
	from PIL import ImageOps
	from PIL.Image import Image

	try:
	from typing import Unpack
	except ImportError:
	from typing_extensions import Unpack

	import numpy as np
	import torch

	from transformers.image_utils import ImageInput
	from transformers.processing_utils import (
	TextKwargs,
	ProcessingKwargs,
	ProcessorMixin,
	)

	from transformers.tokenization_utils_base import TextInput, PreTokenizedInput
	from transformers.utils import logging

	from transformers import AutoTokenizer
	from .image_preprocessing_molmo import MolmoImagesKwargs, MolmoImageProcessor
	from typing import List, Union

	logger = logging.get_logger(__name__)


	# Special tokens that mark image structure inside the token stream.
	DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
	DEFAULT_IM_START_TOKEN = "<im_start>"
	DEFAULT_IM_END_TOKEN = "<im_end>"
	DEFAULT_IM_COL_TOKEN = "<im_col>"

	# Placeholder tokens for the individual modalities. They are written with
	# plain "|" characters: a backslash before "|" is not a valid Python escape
	# sequence, so it would stay in the string (and trigger a warning on newer
	# Python versions) and the text would no longer encode to a single token.
	IMAGE_PROMPT = "<|image|>"
	PROPRIO_PROMPT = "<|proprio|>"
	SKILL_PROMPT = "<|skill|>"

	# Order matters: get_special_token_ids() zips this tuple against the ids
	# produced by encoding the concatenated tokens.
	EXTRA_TOKENS = (
	    DEFAULT_IM_START_TOKEN,
	    DEFAULT_IM_END_TOKEN,
	    DEFAULT_IMAGE_PATCH_TOKEN,
	    DEFAULT_IM_COL_TOKEN,
	    IMAGE_PROMPT,
	    PROPRIO_PROMPT,
	    SKILL_PROMPT,
	)

	# Accepted forms for the ``proprio`` argument of MolmoProcessor.process.
	# Only ``len(proprio)`` is consumed there.
	ProprioInput = Union[
	    np.ndarray, "torch.Tensor", List[np.ndarray], List["torch.Tensor"]
	]

	def get_special_token_ids(tokenizer):
	    """Map every entry of ``EXTRA_TOKENS`` to its id in ``tokenizer``.

	    All tokens are concatenated and encoded in one call (without adding
	    special tokens); the resulting ids are then paired positionally with
	    ``EXTRA_TOKENS``.

	    Raises:
	        AssertionError: if the encoding does not yield exactly one id per
	            token, i.e. at least one token is not a single vocabulary entry.
	    """
	    joined = "".join(EXTRA_TOKENS)
	    token_ids = tokenizer.encode(joined, add_special_tokens=False)
	    assert len(token_ids) == len(EXTRA_TOKENS)
	    return dict(zip(EXTRA_TOKENS, token_ids))


	class MolmoTextKwargs(TextKwargs, total=False):
	"""Text-side keyword arguments of the Molmo processor, extending ``TextKwargs``."""
	# Not read anywhere in this file -- presumably consumed elsewhere; TODO confirm.
	style: Optional[str]
	# Not read anywhere in this file -- presumably consumed elsewhere; TODO confirm.
	system_prompt: Optional[str]
	# Prompt template selector; MolmoProcessor.get_tokens_input accepts
	# "none"/None, "role" and "robot" and raises NotImplementedError otherwise.
	message_format: Optional[str]
	# When truthy, a single space is prepended to the prompt before tokenizing.
	always_start_with_space: Optional[bool]
	# Forwarded as ``sequence_length`` to image_processor.multimodal_preprocess.
	sequence_length: Optional[int]


	class MolmoProcessorKwargs(ProcessingKwargs, total=False):
	"""Keyword-argument schema passed to ``_merge_kwargs`` by ``MolmoProcessor.process``."""
	text_kwargs: MolmoTextKwargs
	images_kwargs: MolmoImagesKwargs
	# Default values per kwarg group; MolmoProcessor.process reads the merged
	# result back as output_kwargs["text_kwargs"] / output_kwargs["images_kwargs"].
	_defaults = {
	# The whole group is splatted into image_processor.multimodal_preprocess.
	"images_kwargs": {
	"max_crops": 12,
	"overlap_margins": [4, 4],
	"base_image_input_size": [336, 336],
	"image_token_length_w": 12,
	"image_token_length_h": 12,
	"image_patch_size": 14,
	"image_padding_mask": True,
	},
	"text_kwargs": {
	"style": "long_caption",
	"system_prompt": "none",
	# "robot" makes get_tokens_input append the proprio placeholders to the prompt.
	"message_format": "robot",
	"always_start_with_space": True,
	"sequence_length": 1536,
	"padding": False,
	},
	}


	class MolmoProcessor(ProcessorMixin):
	    """Processor combining a Molmo image processor with a Qwen2 tokenizer.

	    ``process`` turns a text prompt (or pre-tokenized ids) plus optional
	    images and proprioception inputs into the arrays produced by
	    ``image_processor.multimodal_preprocess``, prepends a BOS token, records
	    where the proprio placeholder tokens ended up and returns everything as
	    torch tensors.
	    """

	    attributes = ["image_processor", "tokenizer"]
	    image_processor_class = "AutoImageProcessor"
	    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")

	    def __init__(self, image_processor: MolmoImageProcessor = None, tokenizer: AutoTokenizer = None, **kwargs):
	        """Store the two sub-processors via ``ProcessorMixin``.

	        Args:
	            image_processor: image processor exposing ``multimodal_preprocess``.
	            tokenizer: tokenizer used to encode prompts and special tokens.
	            **kwargs: accepted but not used by this class.
	        """
	        super().__init__(image_processor, tokenizer)
	        # Filled lazily by the ``special_token_ids`` property.
	        self._special_tokens = None

	    @property
	    def special_token_ids(self):
	        """Mapping from each ``EXTRA_TOKENS`` entry to its token id (computed once)."""
	        if self._special_tokens is None:
	            self._special_tokens = get_special_token_ids(self.tokenizer)
	        return self._special_tokens

	    def get_tokens_input(self, prompt, message_format, always_start_with_space, num_proprio):
	        """Apply the message template to ``prompt`` and tokenize it.

	        Args:
	            prompt: the user text.
	            message_format: ``"none"``/``None`` leaves the prompt untouched,
	                ``"role"`` wraps it as ``"User: ... Assistant:"`` and
	                ``"robot"`` additionally appends ``num_proprio`` copies of
	                ``PROPRIO_PROMPT`` before ``" Assistant:"``.
	            always_start_with_space: if truthy, prepend one space.
	            num_proprio: number of proprio placeholders for the robot format.

	        Returns:
	            The ids returned by ``tokenizer.encode`` (no special tokens added).

	        Raises:
	            NotImplementedError: for any other ``message_format``.
	        """
	        if message_format == "none" or message_format is None:
	            pass
	        elif message_format == "role":
	            prompt = "User: " + prompt + " Assistant:"
	        elif message_format == "robot":
	            # This adds the proprio observations after the prompt.
	            prompt = "User: " + prompt + PROPRIO_PROMPT*num_proprio + " Assistant:"
	        else:
	            raise NotImplementedError(f"Message format {message_format} not implemented")

	        if always_start_with_space:
	            prompt = " " + prompt

	        tokens = self.tokenizer.encode(prompt, add_special_tokens=False)

	        return tokens

	    @staticmethod
	    def _to_rgb_array(image):
	        """Convert one PIL image or array-like image into a numpy array."""
	        if isinstance(image, Image):
	            # Handle images with EXIF orientation tags, which PIL will ignore by default
	            # https://github.com/python-pillow/Pillow/issues/4703
	            # The transposed image must be the one that is used afterwards,
	            # and it is computed before ``convert`` so the orientation
	            # metadata is still attached to the image being inspected.
	            image = ImageOps.exif_transpose(image)
	            return np.array(image.convert("RGB"))
	        assert len(image.shape) == 3 and image.shape[-1] == 3, (
	            "array images must have three dimensions with 3 channels last"
	        )
	        return image.astype(np.uint8)

	    def process(
	        self,
	        text: TextInput = None,
	        images: ImageInput = None,
	        proprio: ProprioInput = None,
	        *,
	        tokens: Optional[PreTokenizedInput] = None,
	        **kwargs: Unpack[MolmoProcessorKwargs],
	    ):
	        """Build model inputs from text, images and proprio observations.

	        Args:
	            text: prompt text; only used when ``tokens`` is not given.
	            images: a single image or a list/tuple of images (PIL images or
	                arrays whose last dimension has size 3), or ``None``.
	            proprio: proprioception inputs; only ``len(proprio)`` is used
	                here, to decide how many placeholders the prompt receives.
	            tokens: pre-tokenized prompt ids that bypass ``get_tokens_input``.
	            **kwargs: overrides merged with ``MolmoProcessorKwargs`` defaults.

	        Returns:
	            The mapping produced by ``image_processor.multimodal_preprocess``
	            with a BOS id prepended to ``"input_ids"``, ``"image_input_idx"``
	            (when present) shifted accordingly, an added ``"proprio_idx"``
	            entry holding the positions of the proprio placeholder tokens,
	            and every value converted with ``torch.from_numpy``.
	        """
	        output_kwargs = self._merge_kwargs(
	            MolmoProcessorKwargs,
	            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
	            **kwargs,
	        )
	        text_kwargs = output_kwargs["text_kwargs"]

	        num_proprio = len(proprio) if proprio is not None else 0

	        if tokens is None:
	            tokens = self.get_tokens_input(
	                text,
	                text_kwargs["message_format"],
	                text_kwargs["always_start_with_space"],
	                num_proprio,
	            )

	        special_ids = self.special_token_ids
	        proprio_token_id = special_ids[PROPRIO_PROMPT]

	        if images is not None:
	            if not isinstance(images, (list, tuple)):
	                images = [images]
	            images = [self._to_rgb_array(image) for image in images]
	            # For now only support inserting images at the start
	            image_idx = [-1]*len(images)
	        else:
	            image_idx = None

	        out = self.image_processor.multimodal_preprocess(
	            images=images,
	            image_idx=image_idx,
	            tokens=np.asarray(tokens).astype(np.int32),
	            sequence_length=text_kwargs["sequence_length"],
	            image_patch_token_id=special_ids[DEFAULT_IMAGE_PATCH_TOKEN],
	            image_col_token_id=special_ids[DEFAULT_IM_COL_TOKEN],
	            image_start_token_id=special_ids[DEFAULT_IM_START_TOKEN],
	            image_end_token_id=special_ids[DEFAULT_IM_END_TOKEN],
	            **output_kwargs["images_kwargs"]
	        )

	        # Prepend BOS
	        # qwen2 and olmo do not have a BOS, and instead use EOS as a generic separator token.
	        # Compare against None explicitly so that a legitimate BOS id of 0
	        # is not mistaken for "missing".
	        bos = self.tokenizer.bos_token_id
	        if bos is None:
	            bos = self.tokenizer.eos_token_id
	        out["input_ids"] = np.pad(out["input_ids"], [[1, 0]], constant_values=bos)
	        if "image_input_idx" in out:
	            # Shift patch mapping up by one since we added BOS; negative
	            # entries are left untouched.
	            image_input_idx = out["image_input_idx"]
	            out["image_input_idx"] = np.where(image_input_idx < 0, image_input_idx, image_input_idx + 1)

	        # Located after the BOS shift, so the indices refer to the final sequence.
	        out["proprio_idx"] = np.where(out["input_ids"] == proprio_token_id)[0]

	        # Only values of existing keys are replaced, so iterating while
	        # assigning is safe.
	        for k, v in out.items():
	            out[k] = torch.from_numpy(v)

	        return out


	# Register the custom processor class at import time via
	# register_for_auto_class() -- presumably so the transformers auto classes
	# can resolve it when loading with trust_remote_code; TODO confirm.
	MolmoProcessor.register_for_auto_class()