fixes

2025-02-02 16:09:54 -08:00
parent ba7ed0ac06
commit 2b621c5c0f
2 changed files with 50 additions and 56 deletions
--- a/computer_use_demo/gradio/agent/vlm_agent.py
+++ b/computer_use_demo/gradio/agent/vlm_agent.py
@@ -12,6 +12,7 @@ from anthropic.types.beta import BetaMessage, BetaTextBlock, BetaToolUseBlock, B
 from agent.llm_utils.oai import run_oai_interleaved
 from agent.llm_utils.groqclient import run_groq_interleaved
 from agent.llm_utils.utils import is_image_path
 import time
 import re
@@ -79,10 +80,8 @@ class VLMAgent:
        # drop looping actions msg, byte image etc
        planner_messages = messages
-        planner_messages = _keep_latest_images(planner_messages)
+        _remove_som_images(planner_messages)
-        # if self.only_n_most_recent_images:
+        _maybe_filter_to_n_most_recent_images(planner_messages, self.only_n_most_recent_images)
        #     _maybe_filter_to_n_most_recent_images(planner_messages, self.only_n_most_recent_images)
        # print(f"filtered_messages: {planner_messages}\n\n", "full messages:", messages)
        if isinstance(planner_messages[-1], dict):
            if not isinstance(planner_messages[-1]["content"], list):
@@ -103,9 +102,8 @@ class VLMAgent:
            )
            print(f"oai token usage: {token_usage}")
            self.total_token_usage += token_usage
-            self.total_cost += (token_usage * 0.15 / 1000000)  # https://openai.com/api/pricing/
+            self.total_cost += (token_usage * 2.5 / 1000000)  # https://openai.com/api/pricing/
        elif "r1" in self.model:
            print(f"Sending messages to Groq: {planner_messages}")
            vlm_response, token_usage = run_groq_interleaved(
                messages=planner_messages,
                system=system,
@@ -184,11 +182,10 @@ class VLMAgent:
        if vlm_response_json["Next Action"] == "None":
            print("Task paused/completed.")
        elif vlm_response_json["Next Action"] == "type":
            click_block = BetaToolUseBlock(id=f'toolu_{uuid.uuid4()}', input={'action': 'left_click'}, name='computer', type='tool_use')
            sim_content_block = BetaToolUseBlock(id=f'toolu_{uuid.uuid4()}',
                                        input={'action': vlm_response_json["Next Action"], 'text': vlm_response_json["value"]},
                                        name='computer', type='tool_use')
-            response_content.extend([click_block, sim_content_block])
+            response_content.append(sim_content_block)
        else:
            sim_content_block = BetaToolUseBlock(id=f'toolu_{uuid.uuid4()}',
                                            input={'action': vlm_response_json["Next Action"]},
@@ -212,16 +209,16 @@ You should carefully consider your plan base on the task, screenshot, and histor
 Here is the list of all detected bounding boxes by IDs on the screen and their description:{screen_info}
 Your available "Next Action" only include:
- type: move mouse to box id, left clicks and types a string of text.
+- type: types a string of text.
- left_click: move mouse to box id and left clicks
+- left_click: move mouse to box id and left clicks.
- right_click: move mouse to box id and right clicks
+- right_click: move mouse to box id and right clicks.
- double_click: move mouse to box id and double clicks
+- double_click: move mouse to box id and double clicks.
- hover: move mouse to box id
+- hover: move mouse to box id.
 - scroll_up: scrolls the screen up.
 - scroll_down: scrolls the screen down.
 - wait: waits for 1 second for the device to load or respond.
-Based on the visual information from the screenshot image and the detected bounding boxes, please determine the next action, the Box ID you should operate on, and the value (if the action is 'type') in order to complete the task.
+Based on the visual information from the screenshot image and the detected bounding boxes, please determine the next action, the Box ID you should operate on (if action is not 'type', 'hover', 'scroll_up', 'scroll_down', 'wait'), and the value (if the action is 'type') in order to complete the task.
 Output format:
 ```json
@@ -275,14 +272,14 @@ IMPORTANT NOTES:
        return main_section
-def _keep_latest_images(messages):
+def _remove_som_images(messages):
-    for i in range(len(messages)-1):
+    for msg in messages:
-        if isinstance(messages[i]["content"], list):
+        msg_content = msg["content"]
-            for cnt in messages[i]["content"]:
+        if isinstance(msg_content, list):
-                if isinstance(cnt, str):
+            msg["content"] = [
-                    if cnt.endswith((".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")):
+                cnt for cnt in msg_content 
-                        messages[i]["content"].remove(cnt)
+                if not (isinstance(cnt, str) and 'som' in cnt and is_image_path(cnt))
-    return messages
+            ]
 def _maybe_filter_to_n_most_recent_images(
@@ -293,42 +290,43 @@ def _maybe_filter_to_n_most_recent_images(
    """
    With the assumption that images are screenshots that are of diminishing value as
    the conversation progresses, remove all but the final `images_to_keep` tool_result
-    images in place, with a chunk of min_removal_threshold to reduce the amount we
+    images in place
    break the implicit prompt cache.
    """
    if images_to_keep is None:
        return messages
-    tool_result_blocks = cast(
+    total_images = 0
-        list[ToolResultBlockParam],
+    for msg in messages:
-        [
+        for cnt in msg.get("content", []):
-            item
+            if isinstance(cnt, str) and is_image_path(cnt):
-            for message in messages
+                total_images += 1
-            for item in (
+            elif isinstance(cnt, dict) and cnt.get("type") == "tool_result":
-                message["content"] if isinstance(message["content"], list) else []
+                for content in cnt.get("content", []):
-            )
+                    if isinstance(content, dict) and content.get("type") == "image":
-            if isinstance(item, dict) and item.get("type") == "tool_result"
+                        total_images += 1
        ],
    )
    total_images = sum(
        1
        for tool_result in tool_result_blocks
        for content in tool_result.get("content", [])
        if isinstance(content, dict) and content.get("type") == "image"
    )
    images_to_remove = total_images - images_to_keep
    # for better cache behavior, we want to remove in chunks
    images_to_remove -= images_to_remove % min_removal_threshold
-    for tool_result in tool_result_blocks:
+    for msg in messages:
-        if isinstance(tool_result.get("content"), list):
+        msg_content = msg["content"]
        if isinstance(msg_content, list):
            new_content = []
-            for content in tool_result.get("content", []):
+            for cnt in msg_content:
-                if isinstance(content, dict) and content.get("type") == "image":
+                # Remove images from SOM or screenshot as needed
                if isinstance(cnt, str) and is_image_path(cnt):
                    if images_to_remove > 0:
                        images_to_remove -= 1
                        continue
-                new_content.append(content)
+                # VLM shouldn't use anthropic screenshot tool so shouldn't have these but in case it does, remove as needed
-            tool_result["content"] = new_content
+                elif isinstance(cnt, dict) and cnt.get("type") == "tool_result":
                    new_tool_result_content = []
                    for tool_result_entry in cnt.get("content", []):
                        if isinstance(tool_result_entry, dict) and tool_result_entry.get("type") == "image":
                            if images_to_remove > 0:
                                images_to_remove -= 1
                                continue
                        new_tool_result_content.append(tool_result_entry)
                    cnt["content"] = new_tool_result_content
                # Append fixed content to current message's content list
                new_content.append(cnt)
            msg["content"] = new_content
--- a/computer_use_demo/gradio/tools/computer.py
+++ b/computer_use_demo/gradio/tools/computer.py
@@ -170,8 +170,7 @@ class ComputerTool(BaseAnthropicTool):
                return ToolResult(output=f"Pressed keys: {text}")
            elif action == "type":
-                self.send_to_vm(f"pyautogui.typewrite('{text}', interval={TYPING_DELAY_MS / 1000})")  # Convert ms to seconds
+                self.send_to_vm(f"pyautogui.typewrite('{text}', interval={TYPING_DELAY_MS / 1000})")
                self.send_to_vm("pyautogui.press('enter')")
                screenshot_base64 = (await self.screenshot()).base64_image
                return ToolResult(output=text, base64_image=screenshot_base64)
@@ -261,11 +260,8 @@ class ComputerTool(BaseAnthropicTool):
        width, height = self.target_dimension["width"], self.target_dimension["height"]
        screenshot, path = get_screenshot(resize=True, target_width=width, target_height=height)
        time.sleep(0.7) # avoid async error as actions take time to complete
        # return ToolResult()
        return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode())
        raise ToolError(f"Failed to take screenshot: {path} does not exist.")
    def padding_image(self, screenshot):
        """Pad the screenshot to 16:10 aspect ratio, when the aspect ratio is not 16:10."""
        _, height = screenshot.size