import gradio as gr

from model import ToyModel

"""
Model specification
"""
vision_model_path = 'openai/clip-vit-base-patch32'
language_model_path = 'openai-community/gpt2'
model = ToyModel(vision_model_path, language_model_path)


def chat(image_input, text_input):
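    """Run one multimodal turn: answer `text_input` about `image_input`.

    Note: the Temperature slider defined in the UI below is not wired into
    this handler; using it would require threading its value through to
    `model.chat`.
    """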
    text_output = model.chat(image_input, text_input)
    return image_input, text_output


"""
Gradio 
"""


def gradio_taskselect(idx):
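    """Map a Task Shortcuts row index to its prompt prefix and usage hint."""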
    prompt_list = [
        '',
        '[grounding] describe this image in detail',
        '[refer] ',
        '[detection] ',
        '[identify] what is this ',
        '[vqa] '
    ]
    instruct_list = [
        '**Hint:** Type in whatever you want',
        '**Hint:** Send the command to generate a grounded image description',
        '**Hint:** Type in a phrase about an object in the image and send the command',
        '**Hint:** Type in a caption or phrase, and see object locations in the image',
        '**Hint:** Draw a bounding box on the uploaded image, then send the command. Click the "clear" button at the '
        'top right of the image before drawing a new one',
        '**Hint:** Send a question to get a short answer',
    ]
    return prompt_list[idx], instruct_list[idx]


title = """<h1 align="center">RS-Visual Perception Demo</h1>"""
description = 'Welcome to our RS-Visual Perception Demo!'

introduction = '''
How to use the task shortcuts:
1. Grounding: CLICK **Send** to generate a grounded image description.
2. Refer: Input a referring object and CLICK **Send**.
3. Detection: Write a caption or phrase, and CLICK **Send** to see object locations in the image.
4. Identify: Draw a bounding box on the uploaded image window and CLICK **Send** to identify the object inside it. (CLICK the "clear" button before drawing a new box.)
5. VQA: Input a visual question and CLICK **Send**.
6. No Tag: Input whatever you want and CLICK **Send** without any tag.
You can also simply chat in free form!
'''

with gr.Blocks() as demo:
    gr.Markdown(title)
    gr.Markdown(description)

    with gr.Row():
        # scale must be an integer in recent Gradio releases; a 1:2 ratio with
        # the output column preserves the original half-width layout.
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="Input Image")

            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.5,
                value=0.6,
                step=0.1,
                interactive=True,
                label="Temperature",
            )

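            # gr.Dataset here serves as a row of clickable task shortcuts;
            # type="index" passes the clicked row's index to the handler.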
            dataset = gr.Dataset(
                components=[gr.Textbox(visible=False)],
                samples=[['No Tag'], ['Grounding'], ['Refer'], ['Detection'], ['Identify'], ['VQA']],
                type="index",
                label='Task Shortcuts',
            )
            task_inst = gr.Markdown('**Hint:** Upload your image and chat')
            text_input = gr.Textbox(label='Input text', placeholder='Upload your image and chat', interactive=True)
            submit_button = gr.Button("Send", variant='primary', size='sm', scale=1)

            gr.Markdown(introduction)

        with gr.Column(scale=2):
            image_output = gr.Image(type="pil", label='Output image')
            text_output = gr.Textbox(label='Output text', interactive=True)

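    # Clicking an example below pre-fills the image and text inputs;
    # press Enter or Send to run it through the model.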
    with gr.Row():
        with gr.Column():
            gr.Examples(examples=[
                ["examples_v2/office.jpg", "[grounding] describe this image in detail"],
                ["examples_v2/sofa.jpg", "[detection] sofas"],
                ["examples_v2/2000x1372_wmkn_0012149409555.jpg", "[refer] the world cup"],
                ["examples_v2/KFC-20-for-20-Nuggets.jpg", "[identify] what is this {<4><50><30><65>}"],
            ], inputs=[image_input, text_input], fn=chat,
                outputs=[image_output, text_output])
        with gr.Column():
            gr.Examples(examples=[
                ["examples_v2/glip_test.jpg", "[vqa] where should I hide in this room when playing hide and seek"],
                ["examples_v2/float.png", "Please write a poem about the image"],
                ["examples_v2/thief.png", "Is the weapon fateful"],
                ["examples_v2/cockdial.png", "What might happen in this image in the next second"],
            ], inputs=[image_input, text_input], fn=chat,
                outputs=[image_output, text_output])

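    # Event wiring: selecting a task shortcut swaps in its prompt prefix
    # and usage hint without touching the model.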
    dataset.click(
        gradio_taskselect,
        inputs=[dataset],
        outputs=[text_input, task_inst],
        show_progress="hidden",
        postprocess=False,
        queue=False,
    )

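    # Pressing Enter in the textbox and clicking Send both run the same
    # chat handler on the current image/text pair.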
    text_input.submit(
        chat,
        inputs=[image_input, text_input],
        outputs=[image_output, text_output],
    )

    submit_button.click(
        chat,
        inputs=[image_input, text_input],
        outputs=[image_output, text_output],
    )

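# Start the demo server; pass share=True to demo.launch() for a public link.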
demo.launch()