support local data logging
This commit is contained in:
@@ -3,6 +3,7 @@
|
|||||||
<p align="center">
|
<p align="center">
|
||||||
<img src="imgs/logo.png" alt="Logo">
|
<img src="imgs/logo.png" alt="Logo">
|
||||||
</p>
|
</p>
|
||||||
|
<!-- <a href="https://trendshift.io/repositories/12975" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12975" alt="microsoft%2FOmniParser | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a> -->
|
||||||
|
|
||||||
[](https://arxiv.org/abs/2408.00203)
|
[](https://arxiv.org/abs/2408.00203)
|
||||||
[](https://opensource.org/licenses/MIT)
|
[](https://opensource.org/licenses/MIT)
|
||||||
@@ -12,6 +13,7 @@
|
|||||||
**OmniParser** is a comprehensive method for parsing user interface screenshots into structured and easy-to-understand elements, which significantly enhances the ability of GPT-4V to generate actions that can be accurately grounded in the corresponding regions of the interface.
|
**OmniParser** is a comprehensive method for parsing user interface screenshots into structured and easy-to-understand elements, which significantly enhances the ability of GPT-4V to generate actions that can be accurately grounded in the corresponding regions of the interface.
|
||||||
|
|
||||||
## News
|
## News
|
||||||
|
- [2025/3] We support local logging of trajectories so that you can use OmniParser+OmniTool to build a training data pipeline for your favorite agent in your domain. [Documentation WIP]
|
||||||
- [2025/3] We are gradually adding multi-agent orchestration and improving the user interface in OmniTool for a better experience.
|
- [2025/3] We are gradually adding multi-agent orchestration and improving the user interface in OmniTool for a better experience.
|
||||||
- [2025/2] We release OmniParser V2 [checkpoints](https://huggingface.co/microsoft/OmniParser-v2.0). [Watch Video](https://1drv.ms/v/c/650b027c18d5a573/EWXbVESKWo9Buu6OYCwg06wBeoM97C6EOTG6RjvWLEN1Qg?e=alnHGC)
|
- [2025/2] We release OmniParser V2 [checkpoints](https://huggingface.co/microsoft/OmniParser-v2.0). [Watch Video](https://1drv.ms/v/c/650b027c18d5a573/EWXbVESKWo9Buu6OYCwg06wBeoM97C6EOTG6RjvWLEN1Qg?e=alnHGC)
|
||||||
- [2025/2] We introduce OmniTool: Control a Windows 11 VM with OmniParser + your vision model of choice. OmniTool supports out of the box the following large language models - OpenAI (4o/o1/o3-mini), DeepSeek (R1), Qwen (2.5VL) or Anthropic Computer Use. [Watch Video](https://1drv.ms/v/c/650b027c18d5a573/EehZ7RzY69ZHn-MeQHrnnR4BCj3by-cLLpUVlxMjF4O65Q?e=8LxMgX)
|
- [2025/2] We introduce OmniTool: Control a Windows 11 VM with OmniParser + your vision model of choice. OmniTool supports out of the box the following large language models - OpenAI (4o/o1/o3-mini), DeepSeek (R1), Qwen (2.5VL) or Anthropic Computer Use. [Watch Video](https://1drv.ms/v/c/650b027c18d5a573/EehZ7RzY69ZHn-MeQHrnnR4BCj3by-cLLpUVlxMjF4O65Q?e=8LxMgX)
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ from agent.llm_utils.groqclient import run_groq_interleaved
|
|||||||
from agent.llm_utils.utils import is_image_path
|
from agent.llm_utils.utils import is_image_path
|
||||||
import time
|
import time
|
||||||
import re
|
import re
|
||||||
|
import os
|
||||||
OUTPUT_DIR = "./tmp/outputs"
|
OUTPUT_DIR = "./tmp/outputs"
|
||||||
ORCHESTRATOR_LEDGER_PROMPT = """
|
ORCHESTRATOR_LEDGER_PROMPT = """
|
||||||
Recall we are working on the following request:
|
Recall we are working on the following request:
|
||||||
@@ -73,7 +73,7 @@ class VLMOrchestratedAgent:
|
|||||||
max_tokens: int = 4096,
|
max_tokens: int = 4096,
|
||||||
only_n_most_recent_images: int | None = None,
|
only_n_most_recent_images: int | None = None,
|
||||||
print_usage: bool = True,
|
print_usage: bool = True,
|
||||||
save_folder: str = "./uploads",
|
save_folder: str = None,
|
||||||
):
|
):
|
||||||
if model == "omniparser + gpt-4o" or model == "omniparser + gpt-4o-orchestrated":
|
if model == "omniparser + gpt-4o" or model == "omniparser + gpt-4o-orchestrated":
|
||||||
self.model = "gpt-4o-2024-11-20"
|
self.model = "gpt-4o-2024-11-20"
|
||||||
@@ -95,22 +95,20 @@ class VLMOrchestratedAgent:
|
|||||||
self.max_tokens = max_tokens
|
self.max_tokens = max_tokens
|
||||||
self.only_n_most_recent_images = only_n_most_recent_images
|
self.only_n_most_recent_images = only_n_most_recent_images
|
||||||
self.output_callback = output_callback
|
self.output_callback = output_callback
|
||||||
self.save_folder = Path(save_folder).absolute()
|
self.save_folder = save_folder
|
||||||
|
|
||||||
# Create save folder if it doesn't exist
|
|
||||||
self.save_folder.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
self.print_usage = print_usage
|
self.print_usage = print_usage
|
||||||
self.total_token_usage = 0
|
self.total_token_usage = 0
|
||||||
self.total_cost = 0
|
self.total_cost = 0
|
||||||
self.step_count = 0
|
self.step_count = 0
|
||||||
|
self.plan, self.ledger = None, None
|
||||||
|
|
||||||
self.system = ''
|
self.system = ''
|
||||||
|
|
||||||
def __call__(self, messages: list, parsed_screen: list[str, list, dict]):
|
def __call__(self, messages: list, parsed_screen: list[str, list, dict]):
|
||||||
if self.step_count == 0:
|
if self.step_count == 0:
|
||||||
plan = self._initialize_task(messages)
|
plan = self._initialize_task(messages)
|
||||||
self.output_callback(f'-- Plan: {plan} --', sender="bot")
|
self.output_callback(f'-- Plan: {plan} --', )
|
||||||
# update messages with the plan
|
# update messages with the plan
|
||||||
messages.append({"role": "assistant", "content": plan})
|
messages.append({"role": "assistant", "content": plan})
|
||||||
else:
|
else:
|
||||||
@@ -122,13 +120,18 @@ class VLMOrchestratedAgent:
|
|||||||
f' <pre>{updated_ledger}</pre>'
|
f' <pre>{updated_ledger}</pre>'
|
||||||
f' </div>'
|
f' </div>'
|
||||||
f'</details>',
|
f'</details>',
|
||||||
sender="bot"
|
|
||||||
)
|
)
|
||||||
# update messages with the ledger
|
# update messages with the ledger
|
||||||
messages.append({"role": "assistant", "content": updated_ledger})
|
messages.append({"role": "assistant", "content": updated_ledger})
|
||||||
|
self.ledger = updated_ledger
|
||||||
|
|
||||||
self.step_count += 1
|
self.step_count += 1
|
||||||
image_base64 = parsed_screen['original_screenshot_base64']
|
# save the image to the output folder
|
||||||
|
with open(f"{self.save_folder}/screenshot_{self.step_count}.png", "wb") as f:
|
||||||
|
f.write(base64.b64decode(parsed_screen['original_screenshot_base64']))
|
||||||
|
with open(f"{self.save_folder}/som_screenshot_{self.step_count}.png", "wb") as f:
|
||||||
|
f.write(base64.b64decode(parsed_screen['som_image_base64']))
|
||||||
|
|
||||||
latency_omniparser = parsed_screen['latency']
|
latency_omniparser = parsed_screen['latency']
|
||||||
screen_info = str(parsed_screen['screen_info'])
|
screen_info = str(parsed_screen['screen_info'])
|
||||||
screenshot_uuid = parsed_screen['screenshot_uuid']
|
screenshot_uuid = parsed_screen['screenshot_uuid']
|
||||||
@@ -196,7 +199,7 @@ class VLMOrchestratedAgent:
|
|||||||
latency_vlm = time.time() - start
|
latency_vlm = time.time() - start
|
||||||
|
|
||||||
# Update step counter with both latencies
|
# Update step counter with both latencies
|
||||||
self.output_callback(f'<i>Step {self.step_count} | OmniParser: {latency_omniparser:.2f}s | LLM: {latency_vlm:.2f}s</i>', sender="bot")
|
self.output_callback(f'<i>Step {self.step_count} | OmniParser: {latency_omniparser:.2f}s | LLM: {latency_vlm:.2f}s</i>', )
|
||||||
|
|
||||||
print(f"{vlm_response}")
|
print(f"{vlm_response}")
|
||||||
|
|
||||||
@@ -226,7 +229,7 @@ class VLMOrchestratedAgent:
|
|||||||
except:
|
except:
|
||||||
print(f"Error parsing: {vlm_response_json}")
|
print(f"Error parsing: {vlm_response_json}")
|
||||||
pass
|
pass
|
||||||
self.output_callback(f'<img src="data:image/png;base64,{img_to_show_base64}">', sender="bot")
|
self.output_callback(f'<img src="data:image/png;base64,{img_to_show_base64}">', )
|
||||||
|
|
||||||
# Display screen info in a collapsible dropdown
|
# Display screen info in a collapsible dropdown
|
||||||
self.output_callback(
|
self.output_callback(
|
||||||
@@ -236,7 +239,6 @@ class VLMOrchestratedAgent:
|
|||||||
f' <pre>{screen_info}</pre>'
|
f' <pre>{screen_info}</pre>'
|
||||||
f' </div>'
|
f' </div>'
|
||||||
f'</details>',
|
f'</details>',
|
||||||
sender="bot"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
vlm_plan_str = ""
|
vlm_plan_str = ""
|
||||||
@@ -267,6 +269,21 @@ class VLMOrchestratedAgent:
|
|||||||
name='computer', type='tool_use')
|
name='computer', type='tool_use')
|
||||||
response_content.append(sim_content_block)
|
response_content.append(sim_content_block)
|
||||||
response_message = BetaMessage(id=f'toolu_{uuid.uuid4()}', content=response_content, model='', role='assistant', type='message', stop_reason='tool_use', usage=BetaUsage(input_tokens=0, output_tokens=0))
|
response_message = BetaMessage(id=f'toolu_{uuid.uuid4()}', content=response_content, model='', role='assistant', type='message', stop_reason='tool_use', usage=BetaUsage(input_tokens=0, output_tokens=0))
|
||||||
|
|
||||||
|
# save the intermediate step trajectory to the save folder
|
||||||
|
step_trajectory = {
|
||||||
|
"screenshot_path": f"{self.save_folder}/screenshot_{self.step_count}.png",
|
||||||
|
"som_screenshot_path": f"{self.save_folder}/som_screenshot_{self.step_count}.png",
|
||||||
|
"screen_info": screen_info,
|
||||||
|
"latency_omniparser": latency_omniparser,
|
||||||
|
"latency_vlm": latency_vlm,
|
||||||
|
"vlm_response_json": vlm_response_json,
|
||||||
|
'ledger': self.ledger,
|
||||||
|
}
|
||||||
|
with open(f"{self.save_folder}/trajectory.json", "a") as f:
|
||||||
|
f.write(json.dumps(step_trajectory))
|
||||||
|
f.write("\n")
|
||||||
|
|
||||||
return response_message, vlm_response_json
|
return response_message, vlm_response_json
|
||||||
|
|
||||||
def _api_response_callback(self, response: APIResponse):
|
def _api_response_callback(self, response: APIResponse):
|
||||||
@@ -376,9 +393,8 @@ IMPORTANT NOTES:
|
|||||||
plan = extract_data(vlm_response, "json")
|
plan = extract_data(vlm_response, "json")
|
||||||
|
|
||||||
# Create a filename with timestamp
|
# Create a filename with timestamp
|
||||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
plan_filename = f"plan.json"
|
||||||
plan_filename = f"plan_{timestamp}.json"
|
plan_path = os.path.join(self.save_folder, plan_filename)
|
||||||
plan_path = self.save_folder / plan_filename
|
|
||||||
|
|
||||||
# Save the plan to a file
|
# Save the plan to a file
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -1,4 +1,7 @@
|
|||||||
"""
|
"""
|
||||||
|
The app contains:
|
||||||
|
- a new UI for the OmniParser AI Agent.
|
||||||
|
-
|
||||||
python app_new.py --windows_host_url localhost:8006 --omniparser_server_url localhost:8000
|
python app_new.py --windows_host_url localhost:8006 --omniparser_server_url localhost:8000
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -28,10 +31,6 @@ import base64
|
|||||||
|
|
||||||
CONFIG_DIR = Path("~/.anthropic").expanduser()
|
CONFIG_DIR = Path("~/.anthropic").expanduser()
|
||||||
API_KEY_FILE = CONFIG_DIR / "api_key"
|
API_KEY_FILE = CONFIG_DIR / "api_key"
|
||||||
UPLOAD_FOLDER = Path("./uploads").absolute()
|
|
||||||
|
|
||||||
# Create uploads directory if it doesn't exist
|
|
||||||
UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
INTRO_TEXT = '''
|
INTRO_TEXT = '''
|
||||||
<div style="text-align: center; margin-bottom: 10px;">
|
<div style="text-align: center; margin-bottom: 10px;">
|
||||||
@@ -46,13 +45,13 @@ def parse_arguments():
|
|||||||
parser = argparse.ArgumentParser(description="Gradio App")
|
parser = argparse.ArgumentParser(description="Gradio App")
|
||||||
parser.add_argument("--windows_host_url", type=str, default='localhost:8006')
|
parser.add_argument("--windows_host_url", type=str, default='localhost:8006')
|
||||||
parser.add_argument("--omniparser_server_url", type=str, default="localhost:8000")
|
parser.add_argument("--omniparser_server_url", type=str, default="localhost:8000")
|
||||||
parser.add_argument("--upload_folder", type=str, default="./uploads")
|
parser.add_argument("--run_folder", type=str, default="./tmp/outputs")
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
args = parse_arguments()
|
args = parse_arguments()
|
||||||
|
|
||||||
# Update upload folder from args if provided
|
# Create a timestamped run folder under the --run_folder argument
|
||||||
UPLOAD_FOLDER = Path(args.upload_folder).absolute()
|
RUN_FOLDER = Path(os.path.join(args.run_folder, datetime.now().strftime('%Y%m%d_%H%M')))
|
||||||
UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
|
RUN_FOLDER.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
class Sender(StrEnum):
|
class Sender(StrEnum):
|
||||||
USER = "user"
|
USER = "user"
|
||||||
@@ -63,8 +62,8 @@ class Sender(StrEnum):
|
|||||||
def load_existing_files():
|
def load_existing_files():
|
||||||
"""Load all existing files from the uploads folder"""
|
"""Load all existing files from the uploads folder"""
|
||||||
files = []
|
files = []
|
||||||
if UPLOAD_FOLDER.exists():
|
if RUN_FOLDER.exists():
|
||||||
for file_path in UPLOAD_FOLDER.iterdir():
|
for file_path in RUN_FOLDER.iterdir():
|
||||||
if file_path.is_file():
|
if file_path.is_file():
|
||||||
files.append(str(file_path))
|
files.append(str(file_path))
|
||||||
return files
|
return files
|
||||||
@@ -277,7 +276,7 @@ def process_input(user_input, state):
|
|||||||
only_n_most_recent_images=state["only_n_most_recent_images"],
|
only_n_most_recent_images=state["only_n_most_recent_images"],
|
||||||
max_tokens=16384,
|
max_tokens=16384,
|
||||||
omniparser_url=args.omniparser_server_url,
|
omniparser_url=args.omniparser_server_url,
|
||||||
save_folder=str(UPLOAD_FOLDER)
|
save_folder=str(RUN_FOLDER)
|
||||||
):
|
):
|
||||||
if loop_msg is None or state.get("stop"):
|
if loop_msg is None or state.get("stop"):
|
||||||
# Detect and add new files to the state
|
# Detect and add new files to the state
|
||||||
@@ -434,7 +433,7 @@ def handle_file_upload(files, state):
|
|||||||
for file in files:
|
for file in files:
|
||||||
# Get the file name and create a path in the upload directory
|
# Get the file name and create a path in the upload directory
|
||||||
file_name = Path(file.name).name
|
file_name = Path(file.name).name
|
||||||
file_path = UPLOAD_FOLDER / file_name
|
file_path = RUN_FOLDER / file_name
|
||||||
|
|
||||||
# Save the file
|
# Save the file
|
||||||
shutil.copy(file.name, file_path)
|
shutil.copy(file.name, file_path)
|
||||||
@@ -471,9 +470,9 @@ def toggle_view(view_mode, file_path=None, state=None):
|
|||||||
def detect_new_files(state):
|
def detect_new_files(state):
|
||||||
"""Detect new files in the uploads folder and add them to the state"""
|
"""Detect new files in the uploads folder and add them to the state"""
|
||||||
new_files_count = 0
|
new_files_count = 0
|
||||||
if UPLOAD_FOLDER.exists():
|
if RUN_FOLDER.exists():
|
||||||
current_files = set(state['uploaded_files'])
|
current_files = set(state['uploaded_files'])
|
||||||
for file_path in UPLOAD_FOLDER.iterdir():
|
for file_path in RUN_FOLDER.iterdir():
|
||||||
if file_path.is_file():
|
if file_path.is_file():
|
||||||
file_path_str = str(file_path)
|
file_path_str = str(file_path)
|
||||||
if file_path_str not in current_files:
|
if file_path_str not in current_files:
|
||||||
|
|||||||
Reference in New Issue
Block a user