Update app.py
Browse files
app.py
CHANGED
|
@@ -1,19 +1,18 @@
|
|
| 1 |
-
import gradio as gr
|
| 2 |
import torch
|
| 3 |
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
|
| 4 |
from qwen_vl_utils import process_vision_info
|
| 5 |
from PIL import Image
|
| 6 |
import cv2
|
| 7 |
import numpy as np
|
|
|
|
| 8 |
import spaces
|
| 9 |
|
| 10 |
# Load the model and processor
|
| 11 |
-
@spaces.GPU
|
| 12 |
def load_model():
|
| 13 |
model = Qwen2VLForConditionalGeneration.from_pretrained(
|
| 14 |
"Qwen/Qwen2-VL-2B-Instruct",
|
| 15 |
torch_dtype=torch.float16
|
| 16 |
-
)
|
| 17 |
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
|
| 18 |
return model, processor
|
| 19 |
|
|
@@ -41,7 +40,8 @@ def process_image(image, prompt):
|
|
| 41 |
padding=True,
|
| 42 |
return_tensors="pt",
|
| 43 |
).to("cuda")
|
| 44 |
-
|
|
|
|
| 45 |
with torch.no_grad():
|
| 46 |
generated_ids = model.generate(**inputs, max_new_tokens=256)
|
| 47 |
generated_ids_trimmed = [
|
|
@@ -54,8 +54,8 @@ def process_image(image, prompt):
|
|
| 54 |
return output_text[0]
|
| 55 |
|
| 56 |
@spaces.GPU
|
| 57 |
-
def process_video(
|
| 58 |
-
cap = cv2.VideoCapture(
|
| 59 |
frames = []
|
| 60 |
frame_count = 0
|
| 61 |
|
|
@@ -100,9 +100,7 @@ def process_video(video_path, prompt, max_frames=16, frame_interval=30, max_reso
|
|
| 100 |
return_tensors="pt",
|
| 101 |
).to("cuda")
|
| 102 |
|
| 103 |
-
|
| 104 |
-
torch.cuda.empty_cache()
|
| 105 |
-
|
| 106 |
with torch.no_grad():
|
| 107 |
generated_ids = model.generate(**inputs, max_new_tokens=256)
|
| 108 |
generated_ids_trimmed = [
|
|
@@ -119,22 +117,24 @@ def process_content(content, prompt):
|
|
| 119 |
if content is None:
|
| 120 |
return "Please upload an image or video file."
|
| 121 |
|
| 122 |
-
if
|
| 123 |
-
return process_image(content, prompt)
|
| 124 |
elif content.name.lower().endswith(('.mp4', '.avi', '.mov')):
|
| 125 |
-
return process_video(content
|
| 126 |
else:
|
| 127 |
return "Unsupported file type. Please provide an image or video file."
|
| 128 |
|
|
|
|
| 129 |
iface = gr.Interface(
|
| 130 |
fn=process_content,
|
| 131 |
inputs=[
|
| 132 |
-
gr.File(label="Upload Image or Video"
|
| 133 |
-
gr.Textbox(label="Enter your prompt
|
| 134 |
],
|
| 135 |
outputs="text",
|
| 136 |
-
title="Image and Video Description
|
| 137 |
-
description="Upload an image or video and
|
| 138 |
)
|
| 139 |
|
| 140 |
-
|
|
|
|
|
|
|
|
|
| 1 |
import torch
|
| 2 |
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
|
| 3 |
from qwen_vl_utils import process_vision_info
|
| 4 |
from PIL import Image
|
| 5 |
import cv2
|
| 6 |
import numpy as np
|
| 7 |
+
import gradio as gr
|
| 8 |
import spaces
|
| 9 |
|
| 10 |
# Load the model and processor
|
|
|
|
| 11 |
def load_model():
    """Load the Qwen2-VL vision-language model and its processor.

    Returns:
        tuple: ``(model, processor)`` — the fp16 Qwen2-VL-2B-Instruct model
        and its matching ``AutoProcessor``, ready for inference.
    """
    checkpoint = "Qwen/Qwen2-VL-2B-Instruct"
    # fp16 halves memory versus fp32; adequate for inference-only use.
    vlm = Qwen2VLForConditionalGeneration.from_pretrained(
        checkpoint,
        torch_dtype=torch.float16,
    )
    proc = AutoProcessor.from_pretrained(checkpoint)
    return vlm, proc
|
| 18 |
|
|
|
|
| 40 |
padding=True,
|
| 41 |
return_tensors="pt",
|
| 42 |
).to("cuda")
|
| 43 |
+
|
| 44 |
+
model.to("cuda")
|
| 45 |
with torch.no_grad():
|
| 46 |
generated_ids = model.generate(**inputs, max_new_tokens=256)
|
| 47 |
generated_ids_trimmed = [
|
|
|
|
| 54 |
return output_text[0]
|
| 55 |
|
| 56 |
@spaces.GPU
|
| 57 |
+
def process_video(video, prompt, max_frames=16, frame_interval=30, max_resolution=224):
|
| 58 |
+
cap = cv2.VideoCapture(video.name)
|
| 59 |
frames = []
|
| 60 |
frame_count = 0
|
| 61 |
|
|
|
|
| 100 |
return_tensors="pt",
|
| 101 |
).to("cuda")
|
| 102 |
|
| 103 |
+
model.to("cuda")
|
|
|
|
|
|
|
| 104 |
with torch.no_grad():
|
| 105 |
generated_ids = model.generate(**inputs, max_new_tokens=256)
|
| 106 |
generated_ids_trimmed = [
|
|
|
|
| 117 |
def process_content(content, prompt):
    """Route an uploaded file to the image or video pipeline by extension.

    Args:
        content: A Gradio file object exposing a ``.name`` path, or None
            when nothing was uploaded.
        prompt: The user's text prompt forwarded to the model.

    Returns:
        str: The model's text output, or a human-readable error message
        for missing/unsupported uploads.
    """
    if content is None:
        return "Please upload an image or video file."

    # Generalized beyond png/jpg and mp4/avi/mov: PIL and cv2.VideoCapture
    # (the backends used downstream) handle these formats as well.
    image_exts = (".png", ".jpg", ".jpeg", ".bmp", ".gif", ".webp", ".tiff")
    video_exts = (".mp4", ".avi", ".mov", ".mkv", ".webm")

    name = content.name.lower()
    if name.endswith(image_exts):
        return process_image(Image.open(content.name), prompt)
    elif name.endswith(video_exts):
        return process_video(content, prompt)
    else:
        return "Unsupported file type. Please provide an image or video file."
|
| 126 |
|
| 127 |
+
# Gradio interface: one file-upload widget plus a free-text prompt, wired to
# process_content, which returns plain text (a description or an error message).
_inputs = [
    gr.File(label="Upload Image or Video"),
    gr.Textbox(label="Enter your prompt"),
]

iface = gr.Interface(
    fn=process_content,
    inputs=_inputs,
    outputs="text",
    title="Image and Video Description",
    description="Upload an image or video and enter a prompt to get a description or analysis.",
)

# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    iface.launch()
|