diff --git a/.gitignore b/.gitignore index 2ad14a7..40c4d8c 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ weights/icon_detect_v1_5_2/ .gradio __pycache__/ debug.ipynb +util/__pycache__/ diff --git a/demo/gradio/app.py b/demo/gradio/app.py index 842738f..d005c49 100644 --- a/demo/gradio/app.py +++ b/demo/gradio/app.py @@ -1,5 +1,6 @@ """ Entrypoint for Gradio, see https://gradio.app/ +python app.py --windows_host_url xxxx:8006/ --omniparser_host_url localhost:8000 """ import platform @@ -15,6 +16,7 @@ from pathlib import Path from typing import cast, Dict from PIL import Image import socket +import argparse import gradio as gr from anthropic import APIResponse @@ -39,6 +41,18 @@ Welcome to the OmniParser+X Demo! X = [GPT-4o/4o-mini, Claude, Phi, Llama]. Let Type a message and press submit to start OmniParser+X. Press the trash icon in the chat to clear the message history. ''' +def parse_arguments(): + parser = argparse.ArgumentParser(description="Gradio App") + parser.add_argument("--windows_host_url", type=str, default='GCRSANDBOX336.redmond.corp.microsoft.com:8006/') # http://gcrsandbox336.redmond.corp.microsoft.com/ + parser.add_argument("--omniparser_host_url", type=str, default="localhost:8000") + return parser.parse_args() +args = parse_arguments() +windows_host_url = args.windows_host_url +omniparser_host_url = args.omniparser_host_url +print(f"Windows host URL: {windows_host_url}") +print(f"OmniParser host URL: {omniparser_host_url}") + + class Sender(StrEnum): USER = "user" BOT = "assistant" @@ -68,8 +82,8 @@ def setup_state(state): state["only_n_most_recent_images"] = 2 if 'chatbot_messages' not in state: state['chatbot_messages'] = [] - if "omniparser_url" not in state: - state["omniparser_url"] = "localhost:8000" + # if "omniparser_url" not in state: + # state["omniparser_url"] = "localhost:8000" async def main(state): """Render loop for Gradio""" @@ -211,7 +225,7 @@ def process_input(user_input, state): 
api_response_callback=partial(_api_response_callback, response_state=state["responses"]), api_key=state["api_key"], only_n_most_recent_images=state["only_n_most_recent_images"], - omniparser_url=state["omniparser_url"] + omniparser_url=omniparser_host_url #state["omniparser_url"] ): if loop_msg is None: yield state['chatbot_messages'] @@ -275,13 +289,13 @@ with gr.Blocks(theme=gr.themes.Default()) as demo: placeholder="Paste your API key here", interactive=True, ) - with gr.Row(): - omniparser_url = gr.Textbox( - label="OmniParser Base URL", - value="localhost:8000", - placeholder="Enter OmniParser base URL (e.g. localhost:8000)", - interactive=True - ) + # with gr.Row(): + # omniparser_url = gr.Textbox( + # label="OmniParser Base URL", + # value="localhost:8000", + # placeholder="Enter OmniParser base URL (e.g. localhost:8000)", + # interactive=True + # ) # hide_images = gr.Checkbox(label="Hide screenshots", value=False) with gr.Row(): @@ -294,11 +308,20 @@ with gr.Blocks(theme=gr.themes.Default()) as demo: with gr.Column(scale=1): chatbot = gr.Chatbot(label="Chatbot History", autoscroll=True, height=580) with gr.Column(scale=3): - iframe = gr.HTML( - f'', - container=False, - elem_classes="no-padding" - ) + if not windows_host_url: + iframe = gr.HTML( + f'', + container=False, + elem_classes="no-padding" + ) + else: + # machine_fqdn = socket.getfqdn() + # print('machine_fqdn:', machine_fqdn) + iframe = gr.HTML( + f'', + container=False, + elem_classes="no-padding" + ) def update_model(model_selection, state): state["model"] = model_selection @@ -350,8 +373,8 @@ with gr.Blocks(theme=gr.themes.Default()) as demo: state["api_key"] = api_key_value state[f'{state["provider"]}_api_key'] = api_key_value - def update_omniparser_url(url_value, state): - state["omniparser_url"] = url_value + # def update_omniparser_url(url_value, state): + # state["omniparser_url"] = url_value def clear_chat(state): # Reset message-related state @@ -365,7 +388,7 @@ with 
gr.Blocks(theme=gr.themes.Default()) as demo: only_n_images.change(fn=update_only_n_images, inputs=[only_n_images, state], outputs=None) provider.change(fn=update_provider, inputs=[provider, state], outputs=api_key) api_key.change(fn=update_api_key, inputs=[api_key, state], outputs=None) - omniparser_url.change(fn=update_omniparser_url, inputs=[omniparser_url, state], outputs=None) + # omniparser_url.change(fn=update_omniparser_url, inputs=[omniparser_url, state], outputs=None) chatbot.clear(fn=clear_chat, inputs=[state], outputs=[chatbot]) submit_button.click(process_input, [chat_input, state], chatbot) diff --git a/demo/gradio/computer_use_demo/loop.py b/demo/gradio/computer_use_demo/loop.py index c4129e5..c400ec4 100644 --- a/demo/gradio/computer_use_demo/loop.py +++ b/demo/gradio/computer_use_demo/loop.py @@ -52,7 +52,7 @@ def sampling_loop_sync( Synchronous agentic sampling loop for the assistant/tool interaction of computer use. """ print('in sampling_loop_sync, model:', model) - omniparser = OmniParser(url=f"http://{omniparser_url}/send_text/" if omniparser_url and omniparser_url != "localhost:8000" else None) + omniparser = OmniParser(url=f"http://{omniparser_url}/send_text/" if omniparser_url else None) if model == "claude-3-5-sonnet-20241022": # Register Actor and Executor actor = AnthropicActor( diff --git a/demo/gradio/computer_use_demo/omniparser_agent/vlm_agent.py b/demo/gradio/computer_use_demo/omniparser_agent/vlm_agent.py index 92a7901..35f57c9 100644 --- a/demo/gradio/computer_use_demo/omniparser_agent/vlm_agent.py +++ b/demo/gradio/computer_use_demo/omniparser_agent/vlm_agent.py @@ -132,7 +132,15 @@ class VLMAgent: sender="bot") self.output_callback(f'Set of Marks Screenshot for {colorful_text_vlm}:\n', sender="bot") screen_info = str(parsed_screen['screen_info']) - self.output_callback(f'Screen Info for {colorful_text_vlm}:\n{screen_info}', sender="bot") + # self.output_callback(f'Screen Info for {colorful_text_vlm}:\n{screen_info}', 
sender="bot") + self.output_callback( + f'
' + f' Screen Info for {colorful_text_vlm}' + f'
{screen_info}
' + f'
', sender="bot" + ) + screenshot_uuid = parsed_screen['screenshot_uuid'] screen_width, screen_height = parsed_screen['width'], parsed_screen['height'] @@ -155,7 +163,7 @@ class VLMAgent: planner_messages[-1]["content"].append(f"{OUTPUT_DIR}/screenshot_{screenshot_uuid}.png") planner_messages[-1]["content"].append(f"{OUTPUT_DIR}/screenshot_som_{screenshot_uuid}.png") - print(f"Sending messages to VLMPlanner : {planner_messages}") + # print(f"Sending messages to VLMPlanner : {planner_messages}") start = time.time() if "gpt" in self.model: vlm_response, token_usage = run_oai_interleaved( diff --git a/demo/remote_request.py b/demo/remote_request.py index 84dd3a9..7efafc3 100644 --- a/demo/remote_request.py +++ b/demo/remote_request.py @@ -1,4 +1,7 @@ # uvicorn remote_request:app --host 0.0.0.0 --port 8000 --reload +''' +python -m remote_request --som_model_path ../weights/icon_detect_v1_5/model_v1_5.pt --caption_model_name florence2 --caption_model_path ../weights/icon_caption_florence --device cuda --BOX_TRESHOLD 0.05 +''' import sys import os @@ -12,14 +15,31 @@ import base64 import io from fastapi import FastAPI from pydantic import BaseModel +import argparse -config = { - 'som_model_path': '../weights/icon_detect_v1_5/model_v1_5.pt', - 'device': 'cpu', - 'caption_model_name': 'florence2', - 'caption_model_path': '../weights/icon_caption_florence', - 'BOX_TRESHOLD': 0.05 -} +def parse_arguments(): + parser = argparse.ArgumentParser(description='Omniparser API') + parser.add_argument('--som_model_path', type=str, default='../weights/icon_detect_v1_5/model_v1_5.pt', help='Path to the som model') + parser.add_argument('--caption_model_name', type=str, default='florence2', help='Name of the caption model') + parser.add_argument('--caption_model_path', type=str, default='../weights/icon_caption_florence', help='Path to the caption model') + parser.add_argument('--device', type=str, default='cpu', help='Device to run the model') + 
parser.add_argument('--BOX_TRESHOLD', type=float, default=0.05, help='Threshold for box detection') + parser.add_argument('--host', type=str, default='0.0.0.0', help='Host for the API') + parser.add_argument('--port', type=int, default=8000, help='Port for the API') + args = parser.parse_args() + return args + +args = parse_arguments() +config = vars(args) + + +# config = { +# 'som_model_path': '../weights/icon_detect_v1_5/model_v1_5.pt', +# 'device': 'cpu', +# 'caption_model_name': 'florence2', +# 'caption_model_path': '../weights/icon_caption_florence', +# 'BOX_TRESHOLD': 0.05 +# } class Omniparser(object): @@ -74,4 +94,9 @@ async def send_text(item: Item): @app.get("/") async def root(): - return {"message": "Omniparser API ready"} \ No newline at end of file + return {"message": "Omniparser API ready"} + + +if __name__ == "__main__": + import uvicorn + uvicorn.run("remote_request:app", host=args.host, port=args.port, reload=True) \ No newline at end of file diff --git a/util/__pycache__/__init__.cpython-312.pyc b/util/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index 9b6f52f..0000000 Binary files a/util/__pycache__/__init__.cpython-312.pyc and /dev/null differ diff --git a/util/__pycache__/__init__.cpython-39.pyc b/util/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 9cb17e3..0000000 Binary files a/util/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/util/__pycache__/action_matching.cpython-39.pyc b/util/__pycache__/action_matching.cpython-39.pyc deleted file mode 100644 index aac3447..0000000 Binary files a/util/__pycache__/action_matching.cpython-39.pyc and /dev/null differ diff --git a/util/__pycache__/box_annotator.cpython-312.pyc b/util/__pycache__/box_annotator.cpython-312.pyc deleted file mode 100644 index 0b3df34..0000000 Binary files a/util/__pycache__/box_annotator.cpython-312.pyc and /dev/null differ diff --git a/util/__pycache__/box_annotator.cpython-39.pyc 
b/util/__pycache__/box_annotator.cpython-39.pyc deleted file mode 100644 index c091943..0000000 Binary files a/util/__pycache__/box_annotator.cpython-39.pyc and /dev/null differ