Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -211,7 +211,7 @@ def generate_response(model_name: str, text: str, media_input, media_type: str,
|
|
| 211 |
|
| 212 |
buffer = ""
|
| 213 |
for new_text in streamer:
|
| 214 |
-
buffer += new_text.replace("", "")
|
| 215 |
yield buffer, buffer
|
| 216 |
|
| 217 |
if model_name == "SmolDocling-256M-preview":
|
|
@@ -221,11 +221,12 @@ def generate_response(model_name: str, text: str, media_input, media_type: str,
|
|
| 221 |
# For other models, the formatted output is just the cleaned buffer
|
| 222 |
yield buffer, buffer.strip()
|
| 223 |
|
| 224 |
-
def generate_image_wrapper(
|
| 225 |
-
yield from generate_response(
|
|
|
|
|
|
|
|
|
|
| 226 |
|
| 227 |
-
def generate_video_wrapper(*args):
|
| 228 |
-
yield from generate_response(*args, media_type="video")
|
| 229 |
|
| 230 |
# --- Examples ---
|
| 231 |
image_examples = [
|
|
@@ -389,6 +390,11 @@ css = """
|
|
| 389 |
|
| 390 |
# --- Gradio Interface ---
|
| 391 |
with gr.Blocks(css=css) as demo:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 392 |
gr.Markdown("# **[Multimodal OCR2](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
|
| 393 |
|
| 394 |
with gr.Row():
|
|
@@ -423,25 +429,8 @@ with gr.Blocks(css=css) as demo:
|
|
| 423 |
elem_classes="doc-item"
|
| 424 |
)
|
| 425 |
|
| 426 |
-
# Examples
|
| 427 |
-
gr.Markdown("### Examples")
|
| 428 |
-
with gr.Row():
|
| 429 |
-
with gr.Column():
|
| 430 |
-
gr.Examples(
|
| 431 |
-
examples=image_examples,
|
| 432 |
-
inputs=[image_query, image_upload],
|
| 433 |
-
label="Image Examples"
|
| 434 |
-
)
|
| 435 |
-
with gr.Column():
|
| 436 |
-
gr.Examples(
|
| 437 |
-
examples=video_examples,
|
| 438 |
-
inputs=[video_query, video_upload],
|
| 439 |
-
label="Video Examples"
|
| 440 |
-
)
|
| 441 |
-
|
| 442 |
-
# File upload and controls
|
| 443 |
with gr.Group(elem_classes="upload-controls"):
|
| 444 |
-
# File upload area
|
| 445 |
with gr.Column(elem_classes="file-upload"):
|
| 446 |
file_upload = gr.File(
|
| 447 |
label="Upload files (image/video)",
|
|
@@ -449,7 +438,6 @@ with gr.Blocks(css=css) as demo:
|
|
| 449 |
elem_classes="file-upload"
|
| 450 |
)
|
| 451 |
|
| 452 |
-
# Model dropdown
|
| 453 |
model_dropdown = gr.Dropdown(
|
| 454 |
choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Thyme-RL", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
|
| 455 |
value="Nanonets-OCR-s",
|
|
@@ -457,8 +445,29 @@ with gr.Blocks(css=css) as demo:
|
|
| 457 |
elem_classes="model-dropdown"
|
| 458 |
)
|
| 459 |
|
| 460 |
-
# Submit button
|
| 461 |
submit_btn = gr.Button("→", size="lg", elem_classes="submit-btn")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 462 |
|
| 463 |
# Advanced options (hidden by default)
|
| 464 |
with gr.Accordion("Advanced Options", open=False):
|
|
@@ -468,13 +477,6 @@ with gr.Blocks(css=css) as demo:
|
|
| 468 |
top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
|
| 469 |
repetition_penalty = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
|
| 470 |
|
| 471 |
-
# Query input
|
| 472 |
-
query_input = gr.Textbox(
|
| 473 |
-
label="Enter your query",
|
| 474 |
-
placeholder="Describe the image, extract text, convert to markdown...",
|
| 475 |
-
elem_classes="query-input"
|
| 476 |
-
)
|
| 477 |
-
|
| 478 |
# Output area
|
| 479 |
with gr.Group(elem_classes="output-area"):
|
| 480 |
gr.Markdown("### Output")
|
|
@@ -485,13 +487,6 @@ with gr.Blocks(css=css) as demo:
|
|
| 485 |
elem_classes="output-text"
|
| 486 |
)
|
| 487 |
|
| 488 |
-
# Initialize state variables
|
| 489 |
-
image_query = gr.State("")
|
| 490 |
-
video_query = gr.State("")
|
| 491 |
-
image_upload = gr.State(None)
|
| 492 |
-
video_upload = gr.State(None)
|
| 493 |
-
media_type = gr.State("image")
|
| 494 |
-
|
| 495 |
# --- Event Handlers ---
|
| 496 |
def handle_file_upload(file):
|
| 497 |
if file is None:
|
|
@@ -506,40 +501,34 @@ with gr.Blocks(css=css) as demo:
|
|
| 506 |
file_upload.change(
|
| 507 |
fn=handle_file_upload,
|
| 508 |
inputs=[file_upload],
|
| 509 |
-
outputs=[
|
| 510 |
)
|
| 511 |
|
| 512 |
-
def
|
| 513 |
-
|
| 514 |
-
return f"Using {model_name}"
|
| 515 |
-
|
| 516 |
-
model_dropdown.change(
|
| 517 |
-
fn=handle_model_selection,
|
| 518 |
-
inputs=[model_dropdown],
|
| 519 |
-
outputs=[]
|
| 520 |
-
)
|
| 521 |
-
|
| 522 |
-
def generate_wrapper(text, img, vid, model, max_tokens, temp, top_p, top_k, rep_penalty, m_type):
|
| 523 |
if m_type == "image" and img is not None:
|
| 524 |
-
|
| 525 |
elif m_type == "video" and vid is not None:
|
| 526 |
-
|
| 527 |
else:
|
| 528 |
-
yield "Please upload a valid file", "Please upload a valid file"
|
|
|
|
|
|
|
|
|
|
| 529 |
|
| 530 |
submit_btn.click(
|
| 531 |
fn=generate_wrapper,
|
| 532 |
inputs=[
|
| 533 |
query_input,
|
| 534 |
-
|
| 535 |
-
|
| 536 |
model_dropdown,
|
| 537 |
max_new_tokens,
|
| 538 |
temperature,
|
| 539 |
top_p,
|
| 540 |
top_k,
|
| 541 |
repetition_penalty,
|
| 542 |
-
|
| 543 |
],
|
| 544 |
outputs=[raw_output, raw_output]
|
| 545 |
)
|
|
|
|
| 211 |
|
| 212 |
buffer = ""
|
| 213 |
for new_text in streamer:
|
| 214 |
+
buffer += new_text.replace("<|end_of_text|>", "")
|
| 215 |
yield buffer, buffer
|
| 216 |
|
| 217 |
if model_name == "SmolDocling-256M-preview":
|
|
|
|
| 221 |
# For other models, the formatted output is just the cleaned buffer
|
| 222 |
yield buffer, buffer.strip()
|
| 223 |
|
| 224 |
+
def generate_image_wrapper(text, img, model, max_tokens, temp, top_p_val, top_k_val, rep_penalty):
|
| 225 |
+
yield from generate_response(model, text, img, "image", max_tokens, temp, top_p_val, top_k_val, rep_penalty)
|
| 226 |
+
|
| 227 |
+
def generate_video_wrapper(text, vid, model, max_tokens, temp, top_p_val, top_k_val, rep_penalty):
|
| 228 |
+
yield from generate_response(model, text, vid, "video", max_tokens, temp, top_p_val, top_k_val, rep_penalty)
|
| 229 |
|
|
|
|
|
|
|
| 230 |
|
| 231 |
# --- Examples ---
|
| 232 |
image_examples = [
|
|
|
|
| 390 |
|
| 391 |
# --- Gradio Interface ---
|
| 392 |
with gr.Blocks(css=css) as demo:
|
| 393 |
+
# Initialize state variables that hold data
|
| 394 |
+
image_upload_state = gr.State(None)
|
| 395 |
+
video_upload_state = gr.State(None)
|
| 396 |
+
media_type_state = gr.State("image")
|
| 397 |
+
|
| 398 |
gr.Markdown("# **[Multimodal OCR2](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
|
| 399 |
|
| 400 |
with gr.Row():
|
|
|
|
| 429 |
elem_classes="doc-item"
|
| 430 |
)
|
| 431 |
|
| 432 |
+
# Define input components before they are referenced by gr.Examples
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
with gr.Group(elem_classes="upload-controls"):
|
|
|
|
| 434 |
with gr.Column(elem_classes="file-upload"):
|
| 435 |
file_upload = gr.File(
|
| 436 |
label="Upload files (image/video)",
|
|
|
|
| 438 |
elem_classes="file-upload"
|
| 439 |
)
|
| 440 |
|
|
|
|
| 441 |
model_dropdown = gr.Dropdown(
|
| 442 |
choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Thyme-RL", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
|
| 443 |
value="Nanonets-OCR-s",
|
|
|
|
| 445 |
elem_classes="model-dropdown"
|
| 446 |
)
|
| 447 |
|
|
|
|
| 448 |
submit_btn = gr.Button("→", size="lg", elem_classes="submit-btn")
|
| 449 |
+
|
| 450 |
+
query_input = gr.Textbox(
|
| 451 |
+
label="Enter your query",
|
| 452 |
+
placeholder="Describe the image, extract text, convert to markdown...",
|
| 453 |
+
elem_classes="query-input"
|
| 454 |
+
)
|
| 455 |
+
|
| 456 |
+
# Examples section
|
| 457 |
+
gr.Markdown("### Examples")
|
| 458 |
+
with gr.Row():
|
| 459 |
+
with gr.Column():
|
| 460 |
+
gr.Examples(
|
| 461 |
+
examples=image_examples,
|
| 462 |
+
inputs=[query_input, file_upload], # Corrected inputs
|
| 463 |
+
label="Image Examples"
|
| 464 |
+
)
|
| 465 |
+
with gr.Column():
|
| 466 |
+
gr.Examples(
|
| 467 |
+
examples=video_examples,
|
| 468 |
+
inputs=[query_input, file_upload], # Corrected inputs
|
| 469 |
+
label="Video Examples"
|
| 470 |
+
)
|
| 471 |
|
| 472 |
# Advanced options (hidden by default)
|
| 473 |
with gr.Accordion("Advanced Options", open=False):
|
|
|
|
| 477 |
top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
|
| 478 |
repetition_penalty = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
|
| 479 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 480 |
# Output area
|
| 481 |
with gr.Group(elem_classes="output-area"):
|
| 482 |
gr.Markdown("### Output")
|
|
|
|
| 487 |
elem_classes="output-text"
|
| 488 |
)
|
| 489 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 490 |
# --- Event Handlers ---
|
| 491 |
def handle_file_upload(file):
|
| 492 |
if file is None:
|
|
|
|
| 501 |
file_upload.change(
|
| 502 |
fn=handle_file_upload,
|
| 503 |
inputs=[file_upload],
|
| 504 |
+
outputs=[media_type_state, image_upload_state, video_upload_state] # Corrected outputs
|
| 505 |
)
|
| 506 |
|
| 507 |
+
def generate_wrapper(text, img, vid, model, max_tokens, temp, top_p_val, top_k_val, rep_penalty, m_type):
|
| 508 |
+
media_input = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 509 |
if m_type == "image" and img is not None:
|
| 510 |
+
media_input = img
|
| 511 |
elif m_type == "video" and vid is not None:
|
| 512 |
+
media_input = vid
|
| 513 |
else:
|
| 514 |
+
yield "Please upload a valid file.", "Please upload a valid file."
|
| 515 |
+
return
|
| 516 |
+
|
| 517 |
+
yield from generate_response(model, text, media_input, m_type, max_tokens, temp, top_p_val, top_k_val, rep_penalty)
|
| 518 |
|
| 519 |
submit_btn.click(
|
| 520 |
fn=generate_wrapper,
|
| 521 |
inputs=[
|
| 522 |
query_input,
|
| 523 |
+
image_upload_state, # Corrected input state
|
| 524 |
+
video_upload_state, # Corrected input state
|
| 525 |
model_dropdown,
|
| 526 |
max_new_tokens,
|
| 527 |
temperature,
|
| 528 |
top_p,
|
| 529 |
top_k,
|
| 530 |
repetition_penalty,
|
| 531 |
+
media_type_state # Corrected input state
|
| 532 |
],
|
| 533 |
outputs=[raw_output, raw_output]
|
| 534 |
)
|