Add a collapsible drop-down for OmniParser output in the chat; add args for app.py

2025-01-28 21:33:58 -08:00
parent 16570a9bf3
commit 7ea2239e10
10 changed files with 86 additions and 29 deletions
--- a/demo/gradio/computer_use_demo/loop.py
+++ b/demo/gradio/computer_use_demo/loop.py
@@ -52,7 +52,7 @@ def sampling_loop_sync(
    Synchronous agentic sampling loop for the assistant/tool interaction of computer use.
    """
    print('in sampling_loop_sync, model:', model)
-    omniparser = OmniParser(url=f"http://{omniparser_url}/send_text/" if omniparser_url and omniparser_url != "localhost:8000" else None)
+    omniparser = OmniParser(url=f"http://{omniparser_url}/send_text/" if omniparser_url else None)
    if model == "claude-3-5-sonnet-20241022":
        # Register Actor and Executor
        actor = AnthropicActor(
--- a/demo/gradio/computer_use_demo/omniparser_agent/vlm_agent.py
+++ b/demo/gradio/computer_use_demo/omniparser_agent/vlm_agent.py
@@ -132,7 +132,15 @@ class VLMAgent:
                             sender="bot")
        self.output_callback(f'Set of Marks Screenshot for {colorful_text_vlm}:\n<img src="data:image/png;base64,{parsed_screen["som_image_base64"]}">', sender="bot")
        screen_info = str(parsed_screen['screen_info'])
-        self.output_callback(f'Screen Info for {colorful_text_vlm}:\n{screen_info}', sender="bot")
+        # self.output_callback(f'Screen Info for {colorful_text_vlm}:\n{screen_info}', sender="bot")
+        self.output_callback(
+                    f'<details>'
+                    f'  <summary>Screen Info for {colorful_text_vlm}</summary>'
+                    f'  <pre>{screen_info}</pre>'
+                    f'</details>',
+                    sender="bot"
+                )
+

        screenshot_uuid = parsed_screen['screenshot_uuid']
        screen_width, screen_height = parsed_screen['width'], parsed_screen['height']
@@ -155,7 +163,7 @@ class VLMAgent:
            planner_messages[-1]["content"].append(f"{OUTPUT_DIR}/screenshot_{screenshot_uuid}.png")
            planner_messages[-1]["content"].append(f"{OUTPUT_DIR}/screenshot_som_{screenshot_uuid}.png")

-        print(f"Sending messages to VLMPlanner : {planner_messages}")
+        # print(f"Sending messages to VLMPlanner : {planner_messages}")
        start = time.time()
        if "gpt" in self.model:
            vlm_response, token_usage = run_oai_interleaved(