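# Streamlit app for image + text inference with a Qwen3-VL-8B vision-language
# checkpoint. Assuming this file is saved as app.py, run it locally with:
#   streamlit run app.py
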
import streamlit as st
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq

st.set_page_config(page_title="Qwen3-VL-8B Inference", layout="wide")
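# Note: st.set_page_config must be the first Streamlit command the script
# executes, so keep this call above any other st.* calls.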


@st.cache_resource  # cache so the model loads once per session, not on every rerun
def load_model(model_id):
    """Load the model and processor with float16 weights."""
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForVision2Seq.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto",  # requires the accelerate package
        trust_remote_code=True
    )
    return model, processor
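
# Note: which Auto class resolves a Qwen3-VL checkpoint depends on the
# installed transformers version; on recent releases the architecture may map
# to AutoModelForImageTextToText instead. Treat the class choice above as the
# original app's assumption about its environment.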


def main():
    st.title("🖼️ Qwen3-VL-8B Vision-Language Model")
    st.markdown("Upload an image and ask questions about it or provide instructions!")

    # Model configuration
    model_id = "reverseforward/qwenmeasurement"  # Replace with your model ID

    try:
        model, processor = load_model(model_id)
    except Exception as e:
        st.error(f"Error loading model: {e}")
        st.info("Make sure the model ID is correct and the app can reach the Hugging Face Hub")
        return

    # Two-column layout: image on the left, prompt on the right
    col1, col2 = st.columns([1, 1])

    with col1:
        st.subheader("📤 Upload Image")
        uploaded_file = st.file_uploader("Choose an image", type=["jpg", "jpeg", "png", "webp"])
        if uploaded_file is not None:
            # Convert to RGB so paletted or RGBA uploads don't break the processor
            image = Image.open(uploaded_file).convert("RGB")
            st.image(image, use_container_width=True, caption="Uploaded Image")
        else:
            image = None
            st.info("Please upload an image to continue")

    with col2:
        st.subheader("💬 Input Text")
        text_input = st.text_area(
            "Ask a question or provide instructions about the image:",
            placeholder="e.g., What objects are in this image? Describe them in detail.",
            height=150
        )

    st.divider()

    # Generate response
    if st.button("🚀 Generate Response", type="primary"):
        if image is None:
            st.warning("Please upload an image first!")
        elif not text_input.strip():
            st.warning("Please enter a text prompt!")
        else:
            with st.spinner("Processing... This may take a moment"):
                try:
                    # Prepare inputs: one user turn carrying both image and text
                    messages = [
                        {
                            "role": "user",
                            "content": [
                                {"type": "image", "image": image},
                                {"type": "text", "text": text_input}
                            ]
                        }
                    ]
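                    # apply_chat_template renders `messages` into the model's
                    # chat prompt string; add_generation_prompt=True appends the
                    # assistant header so generation continues as the reply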
                    text = processor.apply_chat_template(
                        messages,
                        tokenize=False,
                        add_generation_prompt=True
                    )
                    # Tokenize the prompt and preprocess the image together
                    inputs = processor(
                        text=text,
                        images=[image],
                        return_tensors="pt",
                        padding=True
                    )
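                    # Cast only floating-point tensors (e.g. pixel_values) to
                    # float16 to match the model weights; integer tensors such
                    # as input_ids and attention_mask must keep their dtype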
                    # Move everything to the model's device
                    inputs = {
                        k: v.to(model.device).to(torch.float16)
                        if v.dtype in (torch.float32, torch.float64)
                        else v.to(model.device)
                        for k, v in inputs.items()
                    }
                    # Generate; do_sample=True is required for temperature/top_p
                    # to take effect (otherwise decoding is greedy)
                    with torch.no_grad():
                        output_ids = model.generate(
                            **inputs,
                            max_new_tokens=1024,
                            do_sample=True,
                            temperature=0.7,
                            top_p=0.95
                        )
                    # Decode only the newly generated tokens, skipping the prompt
                    response = processor.decode(
                        output_ids[0][inputs["input_ids"].shape[1]:],
                        skip_special_tokens=True
                    )
| st.success("β Generation complete!") | |
| st.subheader("π Response") | |
| st.write(response) | |
| except Exception as e: | |
| st.error(f"Error during generation: {e}") | |
| st.info("Check your model configuration and GPU memory") | |


if __name__ == "__main__":
    main()
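
# Minimal dependency sketch (inferred from the imports above; accelerate is
# needed for device_map="auto"; versions are assumptions, not pinned here):
#   streamlit
#   torch
#   transformers
#   accelerate
#   pillow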