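"""Streamlit demo: image Q&A with a Qwen3-VL-8B vision-language model.

Launch with:

    streamlit run app.py
"""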
import streamlit as st
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq

st.set_page_config(page_title="Qwen3-VL-8B Inference", layout="wide")
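
# st.cache_resource caches the model and processor across Streamlit reruns,
# so the weights are downloaded and moved to the GPU only once per process.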
@st.cache_resource
def load_model(model_id):
    """Load model and processor with float16 optimization"""
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForVision2Seq.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    return model, processor


def main():
    st.title("🖼️ Qwen3-VL-8B Vision-Language Model")
    st.markdown("Upload an image and ask questions about it or provide instructions!")

    # Model configuration
    model_id = "reverseforward/qwenmeasurement"  # Replace with your model ID

    try:
        model, processor = load_model(model_id)
    except Exception as e:
        st.error(f"Error loading model: {e}")
        st.info("Make sure the model ID is correct and that you can reach the Hugging Face Hub")
        return

    # Create two columns for layout
    col1, col2 = st.columns([1, 1])

    with col1:
        st.subheader("📤 Upload Image")
        uploaded_file = st.file_uploader("Choose an image", type=["jpg", "jpeg", "png", "webp"])
        if uploaded_file is not None:
            # Convert to RGB so PNG/WebP images with an alpha channel don't break preprocessing
            image = Image.open(uploaded_file).convert("RGB")
            st.image(image, use_container_width=True, caption="Uploaded Image")
        else:
            image = None
            st.info("Please upload an image to continue")

    with col2:
        st.subheader("💬 Input Text")
        text_input = st.text_area(
            "Ask a question or provide instructions about the image:",
            placeholder="e.g., What objects are in this image? Describe them in detail.",
            height=150,
        )

    st.divider()

    # Generate response
    if st.button("🚀 Generate Response", type="primary"):
        if image is None:
            st.warning("Please upload an image first!")
        elif not text_input.strip():
            st.warning("Please enter a text prompt!")
        else:
            with st.spinner("Processing... This may take a moment"):
                try:
                    # Prepare inputs
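                    # Chat format: a single user turn carrying a typed image
                    # part and a typed text part, as the processor expects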
                    messages = [
                        {
                            "role": "user",
                            "content": [
                                {"type": "image", "image": image},
                                {"type": "text", "text": text_input},
                            ],
                        }
                    ]
                    # Render the chat messages into the model's prompt template,
                    # appending the assistant turn marker for generation
                    text = processor.apply_chat_template(
                        messages,
                        tokenize=False,
                        add_generation_prompt=True,
                    )
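                    # Tokenize the prompt and preprocess the image into tensors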
                    inputs = processor(
                        text=text,
                        images=[image],
                        return_tensors="pt",
                        padding=True,
                    )
                    # Move tensors to the model's device; cast floating-point
                    # tensors (e.g. pixel_values) to float16 to match the weights
                    inputs = {
                        k: v.to(model.device, dtype=torch.float16)
                        if v.dtype in (torch.float32, torch.float64)
                        else v.to(model.device)
                        for k, v in inputs.items()
                    }
                    # Generate
                    with torch.no_grad():
                        output_ids = model.generate(
                            **inputs,
                            max_new_tokens=1024,
                            do_sample=True,  # required; temperature/top_p are ignored under greedy decoding
                            temperature=0.7,
                            top_p=0.95,
                        )
                    # Decode only the newly generated tokens, skipping the prompt
                    response = processor.decode(
                        output_ids[0][inputs["input_ids"].shape[1]:],
                        skip_special_tokens=True,
                    )
                    st.success("✅ Generation complete!")
                    st.subheader("📝 Response")
                    st.write(response)
                except Exception as e:
                    st.error(f"Error during generation: {e}")
                    st.info("Check your model configuration and GPU memory")


if __name__ == "__main__":
    main()