From 5171b092483ab3e74ca50b9357e225f9f3571f18 Mon Sep 17 00:00:00 2001 From: yadong-lu Date: Wed, 26 Mar 2025 13:33:44 -0700 Subject: [PATCH] supprt local data logging --- README.md | 2 + .../agent/vlm_agent_with_orchestrator.py | 46 +++++++++++++------ omnitool/gradio/app_new.py | 25 +++++----- 3 files changed, 45 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 91f706d..ee19547 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@

Logo

+ [![arXiv](https://img.shields.io/badge/Paper-green)](https://arxiv.org/abs/2408.00203) [![License](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) @@ -12,6 +13,7 @@ **OmniParser** is a comprehensive method for parsing user interface screenshots into structured and easy-to-understand elements, which significantly enhances the ability of GPT-4V to generate actions that can be accurately grounded in the corresponding regions of the interface. ## News +- [2025/3] We support local logging of trajectories so that you can use OmniParser+OmniTool to build a training data pipeline for your favorite agent in your domain. [Documentation WIP] - [2025/3] We are gradually adding multi agents orchstration and improving user interface in OmniTool for better experience. - [2025/2] We release OmniParser V2 [checkpoints](https://huggingface.co/microsoft/OmniParser-v2.0). [Watch Video](https://1drv.ms/v/c/650b027c18d5a573/EWXbVESKWo9Buu6OYCwg06wBeoM97C6EOTG6RjvWLEN1Qg?e=alnHGC) - [2025/2] We introduce OmniTool: Control a Windows 11 VM with OmniParser + your vision model of choice. OmniTool supports out of the box the following large language models - OpenAI (4o/o1/o3-mini), DeepSeek (R1), Qwen (2.5VL) or Anthropic Computer Use. 
[Watch Video](https://1drv.ms/v/c/650b027c18d5a573/EehZ7RzY69ZHn-MeQHrnnR4BCj3by-cLLpUVlxMjF4O65Q?e=8LxMgX) diff --git a/omnitool/gradio/agent/vlm_agent_with_orchestrator.py b/omnitool/gradio/agent/vlm_agent_with_orchestrator.py index 650a8e3..74d554a 100644 --- a/omnitool/gradio/agent/vlm_agent_with_orchestrator.py +++ b/omnitool/gradio/agent/vlm_agent_with_orchestrator.py @@ -17,7 +17,7 @@ from agent.llm_utils.groqclient import run_groq_interleaved from agent.llm_utils.utils import is_image_path import time import re - +import os OUTPUT_DIR = "./tmp/outputs" ORCHESTRATOR_LEDGER_PROMPT = """ Recall we are working on the following request: @@ -73,7 +73,7 @@ class VLMOrchestratedAgent: max_tokens: int = 4096, only_n_most_recent_images: int | None = None, print_usage: bool = True, - save_folder: str = "./uploads", + save_folder: str = None, ): if model == "omniparser + gpt-4o" or model == "omniparser + gpt-4o-orchestrated": self.model = "gpt-4o-2024-11-20" @@ -95,22 +95,20 @@ class VLMOrchestratedAgent: self.max_tokens = max_tokens self.only_n_most_recent_images = only_n_most_recent_images self.output_callback = output_callback - self.save_folder = Path(save_folder).absolute() + self.save_folder = save_folder - # Create save folder if it doesn't exist - self.save_folder.mkdir(parents=True, exist_ok=True) - self.print_usage = print_usage self.total_token_usage = 0 self.total_cost = 0 self.step_count = 0 + self.plan, self.ledger = None, None self.system = '' def __call__(self, messages: list, parsed_screen: list[str, list, dict]): if self.step_count == 0: plan = self._initialize_task(messages) - self.output_callback(f'-- Plan: {plan} --', sender="bot") + self.output_callback(f'-- Plan: {plan} --', ) # update messages with the plan messages.append({"role": "assistant", "content": plan}) else: @@ -122,13 +120,18 @@ class VLMOrchestratedAgent: f'
{updated_ledger}
' f' ' f'', - sender="bot" ) # update messages with the ledger messages.append({"role": "assistant", "content": updated_ledger}) + self.ledger = updated_ledger self.step_count += 1 - image_base64 = parsed_screen['original_screenshot_base64'] + # save the image to the output folder + with open(f"{self.save_folder}/screenshot_{self.step_count}.png", "wb") as f: + f.write(base64.b64decode(parsed_screen['original_screenshot_base64'])) + with open(f"{self.save_folder}/som_screenshot_{self.step_count}.png", "wb") as f: + f.write(base64.b64decode(parsed_screen['som_image_base64'])) + latency_omniparser = parsed_screen['latency'] screen_info = str(parsed_screen['screen_info']) screenshot_uuid = parsed_screen['screenshot_uuid'] @@ -196,7 +199,7 @@ class VLMOrchestratedAgent: latency_vlm = time.time() - start # Update step counter with both latencies - self.output_callback(f'Step {self.step_count} | OmniParser: {latency_omniparser:.2f}s | LLM: {latency_vlm:.2f}s', sender="bot") + self.output_callback(f'Step {self.step_count} | OmniParser: {latency_omniparser:.2f}s | LLM: {latency_vlm:.2f}s', ) print(f"{vlm_response}") @@ -226,7 +229,7 @@ class VLMOrchestratedAgent: except: print(f"Error parsing: {vlm_response_json}") pass - self.output_callback(f'', sender="bot") + self.output_callback(f'', ) # Display screen info in a collapsible dropdown self.output_callback( @@ -236,7 +239,6 @@ class VLMOrchestratedAgent: f'
{screen_info}
' f' ' f'', - sender="bot" ) vlm_plan_str = "" @@ -267,6 +269,21 @@ class VLMOrchestratedAgent: name='computer', type='tool_use') response_content.append(sim_content_block) response_message = BetaMessage(id=f'toolu_{uuid.uuid4()}', content=response_content, model='', role='assistant', type='message', stop_reason='tool_use', usage=BetaUsage(input_tokens=0, output_tokens=0)) + + # save the intermediate step trajectory to the save folder + step_trajectory = { + "screenshot_path": f"{self.save_folder}/screenshot_{self.step_count}.png", + "som_screenshot_path": f"{self.save_folder}/som_screenshot_{self.step_count}.png", + "screen_info": screen_info, + "latency_omniparser": latency_omniparser, + "latency_vlm": latency_vlm, + "vlm_response_json": vlm_response_json, + 'ledger': self.ledger, + } + with open(f"{self.save_folder}/trajectory.json", "a") as f: + f.write(json.dumps(step_trajectory)) + f.write("\n") + return response_message, vlm_response_json def _api_response_callback(self, response: APIResponse): @@ -376,9 +393,8 @@ IMPORTANT NOTES: plan = extract_data(vlm_response, "json") # Create a filename with timestamp - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - plan_filename = f"plan_{timestamp}.json" - plan_path = self.save_folder / plan_filename + plan_filename = f"plan.json" + plan_path = os.path.join(self.save_folder, plan_filename) # Save the plan to a file try: diff --git a/omnitool/gradio/app_new.py b/omnitool/gradio/app_new.py index a9e5e6d..d67ae18 100644 --- a/omnitool/gradio/app_new.py +++ b/omnitool/gradio/app_new.py @@ -1,4 +1,7 @@ """ +The app contains: +- a new UI for the OmniParser AI Agent. 
+- python app_new.py --windows_host_url localhost:8006 --omniparser_server_url localhost:8000 """ @@ -28,10 +31,6 @@ import base64 CONFIG_DIR = Path("~/.anthropic").expanduser() API_KEY_FILE = CONFIG_DIR / "api_key" -UPLOAD_FOLDER = Path("./uploads").absolute() - -# Create uploads directory if it doesn't exist -UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True) INTRO_TEXT = '''
@@ -46,13 +45,13 @@ def parse_arguments(): parser = argparse.ArgumentParser(description="Gradio App") parser.add_argument("--windows_host_url", type=str, default='localhost:8006') parser.add_argument("--omniparser_server_url", type=str, default="localhost:8000") - parser.add_argument("--upload_folder", type=str, default="./uploads") + parser.add_argument("--run_folder", type=str, default="./tmp/outputs") return parser.parse_args() args = parse_arguments() # Update upload folder from args if provided -UPLOAD_FOLDER = Path(args.upload_folder).absolute() -UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True) +RUN_FOLDER = Path(os.path.join(args.run_folder, datetime.now().strftime('%Y%m%d_%H%M'))) +RUN_FOLDER.mkdir(parents=True, exist_ok=True) class Sender(StrEnum): USER = "user" @@ -63,8 +62,8 @@ class Sender(StrEnum): def load_existing_files(): """Load all existing files from the uploads folder""" files = [] - if UPLOAD_FOLDER.exists(): - for file_path in UPLOAD_FOLDER.iterdir(): + if RUN_FOLDER.exists(): + for file_path in RUN_FOLDER.iterdir(): if file_path.is_file(): files.append(str(file_path)) return files @@ -277,7 +276,7 @@ def process_input(user_input, state): only_n_most_recent_images=state["only_n_most_recent_images"], max_tokens=16384, omniparser_url=args.omniparser_server_url, - save_folder=str(UPLOAD_FOLDER) + save_folder=str(RUN_FOLDER) ): if loop_msg is None or state.get("stop"): # Detect and add new files to the state @@ -434,7 +433,7 @@ def handle_file_upload(files, state): for file in files: # Get the file name and create a path in the upload directory file_name = Path(file.name).name - file_path = UPLOAD_FOLDER / file_name + file_path = RUN_FOLDER / file_name # Save the file shutil.copy(file.name, file_path) @@ -471,9 +470,9 @@ def toggle_view(view_mode, file_path=None, state=None): def detect_new_files(state): """Detect new files in the uploads folder and add them to the state""" new_files_count = 0 - if UPLOAD_FOLDER.exists(): + if RUN_FOLDER.exists(): 
current_files = set(state['uploaded_files']) - for file_path in UPLOAD_FOLDER.iterdir(): + for file_path in RUN_FOLDER.iterdir(): if file_path.is_file(): file_path_str = str(file_path) if file_path_str not in current_files: