From b1cd705f1b92c49e8e9bdd8886edbdd6d060a8fb Mon Sep 17 00:00:00 2001
From: "Thomas Dhome Casanova (from Dev Box)"
Date: Wed, 22 Jan 2025 21:38:59 -0800
Subject: [PATCH] add in omniparser_url box

---
Review notes (commentary area below the "---" marker; not part of the commit message):
- sampling_loop_sync: omniparser_url now defaults to "localhost:8000" (the same default
  the UI uses). A required parameter placed after defaulted ones is only legal when the
  signature is keyword-only, which this hunk does not show; the default is valid either
  way and keeps callers that do not pass the new argument working.
- sampling_loop_sync: the base URL is stripped of surrounding whitespace and trailing
  slashes, and "http://" is prepended only when no scheme is present, so both
  "localhost:8000" and "http://localhost:8000/" resolve to ".../send_text/".
- The loop.py hunk headers and the diffstat were recounted for the added lines. The blob
  ids on the loop.py "index" line are carried over from the original patch and no longer
  describe the post-image.
- Indentation and blank context lines were restored from a whitespace-collapsed copy of
  the patch; verify them against the target tree before applying.

 demo/gradio/app.py                    | 16 +++++++++++++++-
 demo/gradio/computer_use_demo/loop.py | 17 ++++++++---------
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/demo/gradio/app.py b/demo/gradio/app.py
index bc55797..94b478f 100644
--- a/demo/gradio/app.py
+++ b/demo/gradio/app.py
@@ -66,6 +66,8 @@ def setup_state(state):
         state["only_n_most_recent_images"] = 2
     if 'chatbot_messages' not in state:
         state['chatbot_messages'] = []
+    if "omniparser_url" not in state:
+        state["omniparser_url"] = "localhost:8000"
 
 async def main(state):
     """Render loop for Gradio"""
@@ -207,7 +209,8 @@ def process_input(user_input, state):
         api_response_callback=partial(_api_response_callback, response_state=state["responses"]),
         api_key=state["api_key"],
         only_n_most_recent_images=state["only_n_most_recent_images"],
-        selected_screen=0
+        selected_screen=0,
+        omniparser_url=state["omniparser_url"]
     ):
         if loop_msg is None:
             yield state['chatbot_messages']
@@ -271,6 +274,13 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
                     placeholder="Paste your API key here",
                     interactive=True,
                 )
+            with gr.Row():
+                omniparser_url = gr.Textbox(
+                    label="OmniParser Base URL",
+                    value="localhost:8000",
+                    placeholder="Enter OmniParser base URL (e.g. localhost:8000)",
+                    interactive=True
+                )
             # hide_images = gr.Checkbox(label="Hide screenshots", value=False)
 
         with gr.Row():
@@ -341,10 +351,14 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
         state["api_key"] = api_key_value
         state[f'{state["provider"]}_api_key'] = api_key_value
 
+    def update_omniparser_url(url_value, state):
+        state["omniparser_url"] = url_value
+
     model.change(fn=update_model, inputs=[model, state], outputs=[provider, api_key])
     only_n_images.change(fn=update_only_n_images, inputs=[only_n_images, state], outputs=None)
     provider.change(fn=update_provider, inputs=[provider, state], outputs=api_key)
     api_key.change(fn=update_api_key, inputs=[api_key, state], outputs=None)
+    omniparser_url.change(fn=update_omniparser_url, inputs=[omniparser_url, state], outputs=None)
 
     submit_button.click(process_input, [chat_input, state], chatbot)
 
diff --git a/demo/gradio/computer_use_demo/loop.py b/demo/gradio/computer_use_demo/loop.py
index 449e37e..3e4e3bf 100644
--- a/demo/gradio/computer_use_demo/loop.py
+++ b/demo/gradio/computer_use_demo/loop.py
@@ -46,16 +46,20 @@ def sampling_loop_sync(
     api_key: str,
     only_n_most_recent_images: int | None = 2,
     max_tokens: int = 4096,
-    selected_screen: int = 0
+    selected_screen: int = 0,
+    omniparser_url: str = "localhost:8000"
 ):
     """
     Synchronous agentic sampling loop for the assistant/tool interaction of computer use.
     """
     print('in sampling_loop_sync, model:', model)
+    # Accept "host:port" as well as a full URL: only prepend a scheme when none is given.
+    base_url = (omniparser_url or "").strip().rstrip("/")
+    if base_url and "://" not in base_url:
+        base_url = f"http://{base_url}"
+    omniparser = OmniParser(url=f"{base_url}/send_text/" if base_url else None,
+                            selected_screen=selected_screen,)
     if model == "claude-3-5-sonnet-20241022":
-        omniparser = OmniParser(url="http://localhost:8000/send_text/",
-                                selected_screen=selected_screen,)
-
         # Register Actor and Executor
         actor = AnthropicActor(
             model=model,
@@ -75,11 +79,6 @@ def sampling_loop_sync(
         )
 
     elif model == "omniparser + gpt-4o" or model == "omniparser + phi35v":
-        # omniparser = OmniParser(url="http://localhost:8000/send_text/",
-        #                         selected_screen=selected_screen,)
-        omniparser = OmniParser(url=None,
-                                selected_screen=selected_screen,)
-
         actor = VLMAgent(
             model=model,
             provider=provider,