prithivMLmods committed on
Commit
91a6110
·
verified ·
1 Parent(s): ab576d0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -119
app.py CHANGED
@@ -10,9 +10,7 @@ from typing import Iterable
10
  import gradio as gr
11
  import spaces
12
  import torch
13
- import numpy as np
14
  from PIL import Image, ImageOps
15
- import cv2
16
  import requests
17
 
18
  from transformers import (
@@ -192,24 +190,6 @@ def normalize_values(text, target_max=500):
192
  normalized_text = re.sub(pattern, process_match, text)
193
  return normalized_text
194
 
195
- def downsample_video(video_path):
196
- """Downsample a video to evenly spaced frames, returning PIL images with timestamps."""
197
- vidcap = cv2.VideoCapture(video_path)
198
- total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
199
- fps = vidcap.get(cv2.CAP_PROP_FPS)
200
- frames = []
201
- frame_indices = np.linspace(0, total_frames - 1, min(total_frames, 10), dtype=int)
202
- for i in frame_indices:
203
- vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
204
- success, image = vidcap.read()
205
- if success:
206
- image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
207
- pil_image = Image.fromarray(image)
208
- timestamp = round(i / fps, 2)
209
- frames.append((pil_image, timestamp))
210
- vidcap.release()
211
- return frames
212
-
213
  @spaces.GPU
214
  def generate_image(model_name: str, text: str, image: Image.Image,
215
  max_new_tokens: int = 1024,
@@ -286,84 +266,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
286
  else:
287
  yield buffer, cleaned_output
288
 
289
- @spaces.GPU
290
- def generate_video(model_name: str, text: str, video_path: str,
291
- max_new_tokens: int = 1024,
292
- temperature: float = 0.6,
293
- top_p: float = 0.9,
294
- top_k: int = 50,
295
- repetition_penalty: float = 1.2):
296
- """Generate responses for video input using the selected model."""
297
- if model_name == "Nanonets-OCR-s":
298
- processor, model = processor_m, model_m
299
- elif model_name == "MonkeyOCR-Recognition":
300
- processor, model = processor_g, model_g
301
- elif model_name == "SmolDocling-256M-preview":
302
- processor, model = processor_x, model_x
303
- elif model_name == "Typhoon-OCR-7B":
304
- processor, model = processor_l, model_l
305
- elif model_name == "Thyme-RL":
306
- processor, model = processor_n, model_n
307
- else:
308
- yield "Invalid model selected.", "Invalid model selected."
309
- return
310
-
311
- if video_path is None:
312
- yield "Please upload a video.", "Please upload a video."
313
- return
314
-
315
- frames = downsample_video(video_path)
316
- images = [frame for frame, _ in frames]
317
-
318
- if model_name == "SmolDocling-256M-preview":
319
- if "OTSL" in text or "code" in text:
320
- images = [add_random_padding(img) for img in images]
321
- if "OCR at text at" in text or "Identify element" in text or "formula" in text:
322
- text = normalize_values(text, target_max=500)
323
-
324
- messages = [
325
- {
326
- "role": "user",
327
- "content": [{"type": "image"} for _ in images] + [
328
- {"type": "text", "text": text}
329
- ]
330
- }
331
- ]
332
- prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
333
- inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
334
-
335
- streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
336
- generation_kwargs = {
337
- **inputs,
338
- "streamer": streamer,
339
- "max_new_tokens": max_new_tokens,
340
- "temperature": temperature,
341
- "top_p": top_p,
342
- "top_k": top_k,
343
- "repetition_penalty": repetition_penalty,
344
- }
345
- thread = Thread(target=model.generate, kwargs=generation_kwargs)
346
- thread.start()
347
-
348
- buffer = ""
349
- for new_text in streamer:
350
- buffer += new_text.replace("<|im_end|>", "")
351
- yield buffer, buffer
352
-
353
- if model_name == "SmolDocling-256M-preview":
354
- cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
355
- if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
356
- if "<chart>" in cleaned_output:
357
- cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
358
- cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
359
- doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
360
- doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
361
- markdown_output = doc.export_to_markdown()
362
- yield buffer, markdown_output
363
- else:
364
- yield buffer, cleaned_output
365
-
366
- # Define examples for image and video inference
367
  image_examples = [
368
  ["Reconstruct the doc [table] as it is.", "images/0.png"],
369
  ["Describe the image!", "images/8.png"],
@@ -372,26 +275,16 @@ image_examples = [
372
  ["Convert this page to docling", "images/3.png"],
373
  ]
374
 
375
- video_examples = [
376
- ["Explain the video in detail.", "videos/1.mp4"],
377
- ["Explain the video in detail.", "videos/2.mp4"]
378
- ]
379
-
380
  with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
381
  gr.Markdown("# **Multimodal OCR2**", elem_id="main-title")
382
  with gr.Row():
383
  with gr.Column(scale=2):
384
- with gr.Tabs():
385
- with gr.TabItem("Image Inference"):
386
- image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
387
- image_upload = gr.Image(type="pil", label="Upload Image", height=290)
388
- image_submit = gr.Button("Submit", variant="primary")
389
- gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
390
- with gr.TabItem("Video Inference"):
391
- video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
392
- video_upload = gr.Video(label="Upload Video (<= 30s)", height=290)
393
- video_submit = gr.Button("Submit", variant="primary")
394
- gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
395
  with gr.Accordion("Advanced options", open=False):
396
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
397
  temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
@@ -416,11 +309,6 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
416
  inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
417
  outputs=[raw_output, formatted_output]
418
  )
419
- video_submit.click(
420
- fn=generate_video,
421
- inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
422
- outputs=[raw_output, formatted_output]
423
- )
424
 
425
  if __name__ == "__main__":
426
  demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)
 
10
  import gradio as gr
11
  import spaces
12
  import torch
 
13
  from PIL import Image, ImageOps
 
14
  import requests
15
 
16
  from transformers import (
 
190
  normalized_text = re.sub(pattern, process_match, text)
191
  return normalized_text
192
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  @spaces.GPU
194
  def generate_image(model_name: str, text: str, image: Image.Image,
195
  max_new_tokens: int = 1024,
 
266
  else:
267
  yield buffer, cleaned_output
268
 
269
+ # Define examples for image inference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  image_examples = [
271
  ["Reconstruct the doc [table] as it is.", "images/0.png"],
272
  ["Describe the image!", "images/8.png"],
 
275
  ["Convert this page to docling", "images/3.png"],
276
  ]
277
 
 
 
 
 
 
278
  with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
279
  gr.Markdown("# **Multimodal OCR2**", elem_id="main-title")
280
  with gr.Row():
281
  with gr.Column(scale=2):
282
+ gr.Markdown("### Image Inference")
283
+ image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
284
+ image_upload = gr.Image(type="pil", label="Upload Image", height=290)
285
+ image_submit = gr.Button("Submit", variant="primary")
286
+ gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
287
+
 
 
 
 
 
288
  with gr.Accordion("Advanced options", open=False):
289
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
290
  temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
 
309
  inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
310
  outputs=[raw_output, formatted_output]
311
  )
 
 
 
 
 
312
 
313
  if __name__ == "__main__":
314
  demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)