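"""Streamlit demo: upload an image and query a Qwen3-VL-8B vision-language
model, loaded from the Hugging Face Hub in float16 with device_map="auto"."""
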
import streamlit as st
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq

st.set_page_config(page_title="Qwen3-VL-8B Inference", layout="wide")

@st.cache_resource
def load_model(model_id):
    """Load model and processor with float16 optimization"""
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForVision2Seq.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    return model, processor

def main():
    st.title("πŸ–ΌοΈ Qwen3-VL-8B Vision-Language Model")
    st.markdown("Upload an image and ask questions about it or provide instructions!")
    
    # Model configuration
    model_id = "reverseforward/qwenmeasurement"  # Replace with your model ID
    
    try:
        model, processor = load_model(model_id)
    except Exception as e:
        st.error(f"Error loading model: {e}")
        st.info("Make sure your model ID is correct and you have internet access to HuggingFace Hub")
        return
    
    # Create two columns for layout
    col1, col2 = st.columns([1, 1])
    
    with col1:
        st.subheader("πŸ“€ Upload Image")
        uploaded_file = st.file_uploader("Choose an image", type=["jpg", "jpeg", "png", "webp"])
        
        if uploaded_file is not None:
            image = Image.open(uploaded_file).convert("RGB")  # normalize mode (e.g., RGBA PNGs)
            st.image(image, use_container_width=True, caption="Uploaded Image")
        else:
            image = None
            st.info("Please upload an image to continue")
    
    with col2:
        st.subheader("πŸ’¬ Input Text")
        text_input = st.text_area(
            "Ask a question or provide instructions about the image:",
            placeholder="e.g., What objects are in this image? Describe them in detail.",
            height=150
        )
    
    st.divider()
    
    # Generate response
    if st.button("πŸš€ Generate Response", type="primary"):
        if image is None:
            st.warning("Please upload an image first!")
        elif not text_input.strip():
            st.warning("Please enter a text prompt!")
        else:
            with st.spinner("Processing... This may take a moment"):
                try:
                    # Prepare inputs
                    messages = [
                        {
                            "role": "user",
                            "content": [
                                {"type": "image", "image": image},
                                {"type": "text", "text": text_input}
                            ]
                        }
                    ]
                    
                    # Build the chat-formatted prompt string expected by the processor
                    text = processor.apply_chat_template(
                        messages,
                        tokenize=False,
                        add_generation_prompt=True
                    )
                    
                    inputs = processor(
                        text=text,
                        images=[image],
                        return_tensors="pt",
                        padding=True
                    )
                    
                    # Move tensors to the model device; cast float tensors to
                    # float16 to match the model weights
                    inputs = {
                        k: v.to(model.device, dtype=torch.float16)
                        if v.dtype in (torch.float32, torch.float64)
                        else v.to(model.device)
                        for k, v in inputs.items()
                    }
                    
                    # Generate; sampling must be enabled for temperature/top_p to apply
                    with torch.no_grad():
                        output_ids = model.generate(
                            **inputs,
                            max_new_tokens=1024,
                            do_sample=True,
                            temperature=0.7,
                            top_p=0.95
                        )
                    
                    # Decode only the newly generated tokens, skipping the prompt
                    response = processor.decode(
                        output_ids[0][inputs["input_ids"].shape[1]:],
                        skip_special_tokens=True
                    )
                    
                    st.success("βœ… Generation complete!")
                    st.subheader("πŸ“ Response")
                    st.write(response)
                    
                except Exception as e:
                    st.error(f"Error during generation: {e}")
                    st.info("Check your model configuration and GPU memory")

if __name__ == "__main__":
    main()
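
# Usage sketch (assumptions: this file is saved as app.py and a CUDA GPU is
# available; the 8B weights alone take roughly 16 GB of VRAM in float16):
#   pip install streamlit torch transformers accelerate pillow
#   streamlit run app.py
# accelerate is required because the model is loaded with device_map="auto".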