Spaces:

prithivMLmods
/

Multimodal-OCR2

Running on Zero

App Files Community

prithivMLmods committed on Sep 25

Commit

43c3626

verified ·

1 Parent(s): 9916e82

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -57

app.py CHANGED Viewed

@@ -211,7 +211,7 @@ def generate_response(model_name: str, text: str, media_input, media_type: str,
     buffer = ""
     for new_text in streamer:
-        buffer += new_text.replace("", "")
         yield buffer, buffer
     if model_name == "SmolDocling-256M-preview":
@@ -221,11 +221,12 @@ def generate_response(model_name: str, text: str, media_input, media_type: str,
         # For other models, the formatted output is just the cleaned buffer
         yield buffer, buffer.strip()
-def generate_image_wrapper(*args):
-    yield from generate_response(*args, media_type="image")
-def generate_video_wrapper(*args):
-    yield from generate_response(*args, media_type="video")
 # --- Examples ---
 image_examples = [
@@ -389,6 +390,11 @@ css = """
 # --- Gradio Interface ---
 with gr.Blocks(css=css) as demo:
     gr.Markdown("# **[Multimodal OCR2](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
     with gr.Row():
@@ -423,25 +429,8 @@ with gr.Blocks(css=css) as demo:
                                 elem_classes="doc-item"
                             )
-                # Examples section
-                gr.Markdown("### Examples")
-                with gr.Row():
-                    with gr.Column():
-                        gr.Examples(
-                            examples=image_examples,
-                            inputs=[image_query, image_upload],
-                            label="Image Examples"
-                        )
-                    with gr.Column():
-                        gr.Examples(
-                            examples=video_examples,
-                            inputs=[video_query, video_upload],
-                            label="Video Examples"
-                        )
-                # File upload and controls
                 with gr.Group(elem_classes="upload-controls"):
-                    # File upload area
                     with gr.Column(elem_classes="file-upload"):
                         file_upload = gr.File(
                             label="Upload files (image/video)",
@@ -449,7 +438,6 @@ with gr.Blocks(css=css) as demo:
                             elem_classes="file-upload"
                         )
-                    # Model dropdown
                     model_dropdown = gr.Dropdown(
                         choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Thyme-RL", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
                         value="Nanonets-OCR-s",
@@ -457,8 +445,29 @@ with gr.Blocks(css=css) as demo:
                         elem_classes="model-dropdown"
                     )
-                    # Submit button
                     submit_btn = gr.Button("→", size="lg", elem_classes="submit-btn")
                 # Advanced options (hidden by default)
                 with gr.Accordion("Advanced Options", open=False):
@@ -468,13 +477,6 @@ with gr.Blocks(css=css) as demo:
                     top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                     repetition_penalty = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
-                # Query input
-                query_input = gr.Textbox(
-                    label="Enter your query",
-                    placeholder="Describe the image, extract text, convert to markdown...",
-                    elem_classes="query-input"
-                )
                 # Output area
                 with gr.Group(elem_classes="output-area"):
                     gr.Markdown("### Output")
@@ -485,13 +487,6 @@ with gr.Blocks(css=css) as demo:
                         elem_classes="output-text"
                     )
-    # Initialize state variables
-    image_query = gr.State("")
-    video_query = gr.State("")
-    image_upload = gr.State(None)
-    video_upload = gr.State(None)
-    media_type = gr.State("image")
     # --- Event Handlers ---
     def handle_file_upload(file):
         if file is None:
@@ -506,40 +501,34 @@ with gr.Blocks(css=css) as demo:
     file_upload.change(
         fn=handle_file_upload,
         inputs=[file_upload],
-        outputs=[media_type, image_upload, video_upload]
     )
-    def handle_model_selection(model_name):
-        # This function could be used to update the UI based on model selection
-        return f"Using {model_name}"
-    model_dropdown.change(
-        fn=handle_model_selection,
-        inputs=[model_dropdown],
-        outputs=[]
-    )
-    def generate_wrapper(text, img, vid, model, max_tokens, temp, top_p, top_k, rep_penalty, m_type):
         if m_type == "image" and img is not None:
-            yield from generate_image_wrapper(text, img, model, max_tokens, temp, top_p, top_k, rep_penalty)
         elif m_type == "video" and vid is not None:
-            yield from generate_video_wrapper(text, vid, model, max_tokens, temp, top_p, top_k, rep_penalty)
         else:
-            yield "Please upload a valid file", "Please upload a valid file"
     submit_btn.click(
         fn=generate_wrapper,
         inputs=[
             query_input,
-            image_upload,
-            video_upload,
             model_dropdown,
             max_new_tokens,
             temperature,
             top_p,
             top_k,
             repetition_penalty,
-            media_type
         ],
         outputs=[raw_output, raw_output]
     )

     buffer = ""
     for new_text in streamer:
+        buffer += new_text.replace("<|end_of_text|>", "")
         yield buffer, buffer
     if model_name == "SmolDocling-256M-preview":
         # For other models, the formatted output is just the cleaned buffer
         yield buffer, buffer.strip()
+def generate_image_wrapper(text, img, model, max_tokens, temp, top_p_val, top_k_val, rep_penalty):
+    yield from generate_response(model, text, img, "image", max_tokens, temp, top_p_val, top_k_val, rep_penalty)
+def generate_video_wrapper(text, vid, model, max_tokens, temp, top_p_val, top_k_val, rep_penalty):
+    yield from generate_response(model, text, vid, "video", max_tokens, temp, top_p_val, top_k_val, rep_penalty)
 # --- Examples ---
 image_examples = [
 # --- Gradio Interface ---
 with gr.Blocks(css=css) as demo:
+    # Initialize state variables that hold data
+    image_upload_state = gr.State(None)
+    video_upload_state = gr.State(None)
+    media_type_state = gr.State("image")
     gr.Markdown("# **[Multimodal OCR2](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
     with gr.Row():
                                 elem_classes="doc-item"
                             )
+                # Define input components before they are referenced by gr.Examples
                 with gr.Group(elem_classes="upload-controls"):
                     with gr.Column(elem_classes="file-upload"):
                         file_upload = gr.File(
                             label="Upload files (image/video)",
                             elem_classes="file-upload"
                         )
                     model_dropdown = gr.Dropdown(
                         choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Thyme-RL", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
                         value="Nanonets-OCR-s",
                         elem_classes="model-dropdown"
                     )
                     submit_btn = gr.Button("→", size="lg", elem_classes="submit-btn")
+                query_input = gr.Textbox(
+                    label="Enter your query",
+                    placeholder="Describe the image, extract text, convert to markdown...",
+                    elem_classes="query-input"
+                )
+                # Examples section
+                gr.Markdown("### Examples")
+                with gr.Row():
+                    with gr.Column():
+                        gr.Examples(
+                            examples=image_examples,
+                            inputs=[query_input, file_upload], # Corrected inputs
+                            label="Image Examples"
+                        )
+                    with gr.Column():
+                        gr.Examples(
+                            examples=video_examples,
+                            inputs=[query_input, file_upload], # Corrected inputs
+                            label="Video Examples"
+                        )
                 # Advanced options (hidden by default)
                 with gr.Accordion("Advanced Options", open=False):
                     top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                     repetition_penalty = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
                 # Output area
                 with gr.Group(elem_classes="output-area"):
                     gr.Markdown("### Output")
                         elem_classes="output-text"
                     )
     # --- Event Handlers ---
     def handle_file_upload(file):
         if file is None:
     file_upload.change(
         fn=handle_file_upload,
         inputs=[file_upload],
+        outputs=[media_type_state, image_upload_state, video_upload_state] # Corrected outputs
     )
+    def generate_wrapper(text, img, vid, model, max_tokens, temp, top_p_val, top_k_val, rep_penalty, m_type):
+        media_input = None
         if m_type == "image" and img is not None:
+            media_input = img
         elif m_type == "video" and vid is not None:
+            media_input = vid
         else:
+            yield "Please upload a valid file.", "Please upload a valid file."
+            return
+        yield from generate_response(model, text, media_input, m_type, max_tokens, temp, top_p_val, top_k_val, rep_penalty)
     submit_btn.click(
         fn=generate_wrapper,
         inputs=[
             query_input,
+            image_upload_state, # Corrected input state
+            video_upload_state, # Corrected input state
             model_dropdown,
             max_new_tokens,
             temperature,
             top_p,
             top_k,
             repetition_penalty,
+            media_type_state # Corrected input state
         ],
         outputs=[raw_output, raw_output]
     )